PyPI - diffusers - Versions diffs - 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl - Mend

diffusers 0.23.1py3-none-any.whl → 0.25.0py3-none-any.whl

Files changed (238) hide show

diffusers/pipelines/amused/pipeline_amused_inpaint.py ADDED Viewed

@@ -0,0 +1,378 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import torch
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...models import UVit2DModel, VQModel
+from ...schedulers import AmusedScheduler
+from ...utils import replace_example_docstring
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import AmusedInpaintPipeline
+        >>> from diffusers.utils import load_image
+        >>> pipe = AmusedInpaintPipeline.from_pretrained(
+        ...     "amused/amused-512", variant="fp16", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+        >>> prompt = "fall mountains"
+        >>> input_image = (
+        ...     load_image(
+        ...         "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg"
+        ...     )
+        ...     .resize((512, 512))
+        ...     .convert("RGB")
+        ... )
+        >>> mask = (
+        ...     load_image(
+        ...         "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
+        ...     )
+        ...     .resize((512, 512))
+        ...     .convert("L")
+        ... )
+        >>> pipe(prompt, input_image, mask).images[0].save("out.png")
+        ```
+"""
+class AmusedInpaintPipeline(DiffusionPipeline):
+    image_processor: VaeImageProcessor
+    vqvae: VQModel
+    tokenizer: CLIPTokenizer
+    text_encoder: CLIPTextModelWithProjection
+    transformer: UVit2DModel
+    scheduler: AmusedScheduler
+    model_cpu_offload_seq = "text_encoder->transformer->vqvae"
+    # TODO - when calling self.vqvae.quantize, it uses self.vqvae.quantize.embedding.weight before
+    # the forward method of self.vqvae.quantize, so the hook doesn't get called to move the parameter
+    # off the meta device. There should be a way to fix this instead of just not offloading it
+    _exclude_from_cpu_offload = ["vqvae"]
+    def __init__(
+        self,
+        vqvae: VQModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModelWithProjection,
+        transformer: UVit2DModel,
+        scheduler: AmusedScheduler,
+    ):
+        super().__init__()
+        self.register_modules(
+            vqvae=vqvae,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor,
+            do_normalize=False,
+            do_binarize=True,
+            do_convert_grayscale=True,
+            do_resize=True,
+        )
+        self.scheduler.register_to_config(masking_schedule="linear")
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Optional[Union[List[str], str]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        strength: float = 1.0,
+        num_inference_steps: int = 12,
+        guidance_scale: float = 10.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[torch.Generator] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_encoder_hidden_states: Optional[torch.Tensor] = None,
+        output_type="pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        micro_conditioning_aesthetic_score: int = 6,
+        micro_conditioning_crop_coord: Tuple[int, int] = (0, 0),
+        temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),
+    ):
+        """
+        The call function to the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
+                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
+                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
+                are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+                single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
+                color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B,
+                H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W,
+                1)`, or `(H, W)`.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            num_inference_steps (`int`, *optional*, defaults to 16):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 10.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument. A single vector from the
+                pooled and projected final hidden states.
+            encoder_hidden_states (`torch.FloatTensor`, *optional*):
+                Pre-generated penultimate hidden states from the text encoder providing additional text conditioning.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            negative_encoder_hidden_states (`torch.FloatTensor`, *optional*):
+                Analogous to `encoder_hidden_states` for the positive prompt.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6):
+                The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/
+                and the micro-conditioning section of https://arxiv.org/abs/2307.01952.
+            micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952.
+            temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)):
+                Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`.
+        Examples:
+        Returns:
+            [`~pipelines.pipeline_utils.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.pipeline_utils.ImagePipelineOutput`] is returned, otherwise a
+                `tuple` is returned where the first element is a list with the generated images.
+        """
+        if (prompt_embeds is not None and encoder_hidden_states is None) or (
+            prompt_embeds is None and encoder_hidden_states is not None
+        ):
+            raise ValueError("pass either both `prompt_embeds` and `encoder_hidden_states` or neither")
+        if (negative_prompt_embeds is not None and negative_encoder_hidden_states is None) or (
+            negative_prompt_embeds is None and negative_encoder_hidden_states is not None
+        ):
+            raise ValueError(
+                "pass either both `negatve_prompt_embeds` and `negative_encoder_hidden_states` or neither"
+            )
+        if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None):
+            raise ValueError("pass only one of `prompt` or `prompt_embeds`")
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        batch_size = batch_size * num_images_per_prompt
+        if prompt_embeds is None:
+            input_ids = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding="max_length",
+                truncation=True,
+                max_length=self.tokenizer.model_max_length,
+            ).input_ids.to(self._execution_device)
+            outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
+            prompt_embeds = outputs.text_embeds
+            encoder_hidden_states = outputs.hidden_states[-2]
+        prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1)
+        encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)
+        if guidance_scale > 1.0:
+            if negative_prompt_embeds is None:
+                if negative_prompt is None:
+                    negative_prompt = [""] * len(prompt)
+                if isinstance(negative_prompt, str):
+                    negative_prompt = [negative_prompt]
+                input_ids = self.tokenizer(
+                    negative_prompt,
+                    return_tensors="pt",
+                    padding="max_length",
+                    truncation=True,
+                    max_length=self.tokenizer.model_max_length,
+                ).input_ids.to(self._execution_device)
+                outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
+                negative_prompt_embeds = outputs.text_embeds
+                negative_encoder_hidden_states = outputs.hidden_states[-2]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1)
+            negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)
+            prompt_embeds = torch.concat([negative_prompt_embeds, prompt_embeds])
+            encoder_hidden_states = torch.concat([negative_encoder_hidden_states, encoder_hidden_states])
+        image = self.image_processor.preprocess(image)
+        height, width = image.shape[-2:]
+        # Note that the micro conditionings _do_ flip the order of width, height for the original size
+        # and the crop coordinates. This is how it was done in the original code base
+        micro_conds = torch.tensor(
+            [
+                width,
+                height,
+                micro_conditioning_crop_coord[0],
+                micro_conditioning_crop_coord[1],
+                micro_conditioning_aesthetic_score,
+            ],
+            device=self._execution_device,
+            dtype=encoder_hidden_states.dtype,
+        )
+        micro_conds = micro_conds.unsqueeze(0)
+        micro_conds = micro_conds.expand(2 * batch_size if guidance_scale > 1.0 else batch_size, -1)
+        self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device)
+        num_inference_steps = int(len(self.scheduler.timesteps) * strength)
+        start_timestep_idx = len(self.scheduler.timesteps) - num_inference_steps
+        needs_upcasting = self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast
+        if needs_upcasting:
+            self.vqvae.float()
+        latents = self.vqvae.encode(image.to(dtype=self.vqvae.dtype, device=self._execution_device)).latents
+        latents_bsz, channels, latents_height, latents_width = latents.shape
+        latents = self.vqvae.quantize(latents)[2][2].reshape(latents_bsz, latents_height, latents_width)
+        mask = self.mask_processor.preprocess(
+            mask_image, height // self.vae_scale_factor, width // self.vae_scale_factor
+        )
+        mask = mask.reshape(mask.shape[0], latents_height, latents_width).bool().to(latents.device)
+        latents[mask] = self.scheduler.config.mask_token_id
+        starting_mask_ratio = mask.sum() / latents.numel()
+        latents = latents.repeat(num_images_per_prompt, 1, 1)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i in range(start_timestep_idx, len(self.scheduler.timesteps)):
+                timestep = self.scheduler.timesteps[i]
+                if guidance_scale > 1.0:
+                    model_input = torch.cat([latents] * 2)
+                else:
+                    model_input = latents
+                model_output = self.transformer(
+                    model_input,
+                    micro_conds=micro_conds,
+                    pooled_text_emb=prompt_embeds,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+                if guidance_scale > 1.0:
+                    uncond_logits, cond_logits = model_output.chunk(2)
+                    model_output = uncond_logits + guidance_scale * (cond_logits - uncond_logits)
+                latents = self.scheduler.step(
+                    model_output=model_output,
+                    timestep=timestep,
+                    sample=latents,
+                    generator=generator,
+                    starting_mask_ratio=starting_mask_ratio,
+                ).prev_sample
+                if i == len(self.scheduler.timesteps) - 1 or ((i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, timestep, latents)
+        if output_type == "latent":
+            output = latents
+        else:
+            output = self.vqvae.decode(
+                latents,
+                force_not_quantize=True,
+                shape=(
+                    batch_size,
+                    height // self.vae_scale_factor,
+                    width // self.vae_scale_factor,
+                    self.vqvae.config.latent_channels,
+                ),
+            ).sample.clip(0, 1)
+            output = self.image_processor.postprocess(output, output_type)
+            if needs_upcasting:
+                self.vqvae.half()
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (output,)
+        return ImagePipelineOutput(output)

diffusers/pipelines/animatediff/pipeline_animatediff.py CHANGED Viewed

@@ -18,11 +18,11 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import numpy as np
 import torch
-from transformers import CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
-from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, UNet2DConditionModel, UNetMotionModel
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...models.unet_motion_model import MotionAdapter
 from ...schedulers import (
@@ -33,7 +33,14 @@ from ...schedulers import (
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    BaseOutput,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
@@ -47,7 +54,7 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
         >>> from diffusers.utils import export_to_gif
-        >>> adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter")
+        >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
         >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
         >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
         >>> output = pipe(prompt="A corgi walking in the park")
@@ -77,13 +84,19 @@ class AnimateDiffPipelineOutput(BaseOutput):
     frames: Union[torch.Tensor, np.ndarray]
-class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation.
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
@@ -99,7 +112,9 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo
             A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
     """
-    model_cpu_offload_seq = "text_encoder->unet->vae"
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["feature_extractor", "image_encoder"]
     def __init__(
         self,
@@ -116,6 +131,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo
             EulerAncestralDiscreteScheduler,
             DPMSolverMultistepScheduler,
         ],
+        feature_extractor: CLIPImageProcessor = None,
+        image_encoder: CLIPVisionModelWithProjection = None,
     ):
         super().__init__()
         unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
@@ -127,6 +144,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo
             unet=unet,
             motion_adapter=motion_adapter,
             scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
@@ -313,6 +332,31 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo
         return prompt_embeds, negative_prompt_embeds
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+            return image_embeds, uncond_image_embeds
     # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
     def decode_latents(self, latents):
         latents = 1 / self.vae.config.scaling_factor * latents
@@ -496,6 +540,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo
         return latents
     @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,
@@ -511,6 +556,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo
         latents: Optional[torch.FloatTensor] = None,
         prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -557,6 +603,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo
             negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
                 `np.array`.
@@ -628,6 +675,14 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo
         if do_classifier_free_guidance:
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+        if ip_adapter_image is not None:
+            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+            image_embeds, negative_image_embeds = self.encode_image(
+                ip_adapter_image, device, num_videos_per_prompt, output_hidden_state
+            )
+            if do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps
@@ -648,6 +703,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # 7 Add image embeds for IP-Adapter
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
         # Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
@@ -663,6 +720,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLo
                     t,
                     encoder_hidden_states=prompt_embeds,
                     cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
                 ).sample
                 # perform guidance

diffusers/pipelines/audioldm/pipeline_audioldm.py CHANGED Viewed

@@ -72,6 +72,7 @@ class AudioLDMPipeline(DiffusionPipeline):
         vocoder ([`~transformers.SpeechT5HifiGan`]):
             Vocoder of class `SpeechT5HifiGan`.
     """
     model_cpu_offload_seq = "text_encoder->unet->vae"
     def __init__(

diffusers 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl

diffusers 0.23.1py3-none-any.whl → 0.25.0py3-none-any.whl