diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +20 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +23 -25
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +46 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
  229. diffusers/schedulers/scheduling_edm_euler.py +53 -30
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
  231. diffusers/schedulers/scheduling_euler_discrete.py +163 -67
  232. diffusers/schedulers/scheduling_heun_discrete.py +60 -38
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +27 -25
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. diffusers-0.27.1.dist-info/RECORD +0 -399
  267. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  268. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
  269. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
@@ -191,7 +191,12 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
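The `int(...)` casts make the latent shape robust to float `height`/`width` values. A quick sketch of the failure mode being fixed (values illustrative, not from the diff):

```py
vae_scale_factor = 8
height = 512.0                   # can arrive as a float from upstream arithmetic
height // vae_scale_factor       # 64.0 -- a float dimension, which torch.randn() rejects
int(height) // vae_scale_factor  # 64   -- a valid tensor dimension
```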
@@ -219,10 +224,10 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         editing_prompt: Optional[Union[str, List[str]]] = None,
         editing_prompt_embeddings: Optional[torch.Tensor] = None,
@@ -263,7 +268,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
@@ -274,7 +279,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                plain tuple.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
-               following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+               following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
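This release swaps `torch.FloatTensor` for `torch.Tensor` in annotations and docstrings across the codebase (visible in the per-file counts above). A minimal sketch of a step callback matching the updated signature; the pipeline setup is assumed:

```py
import torch

def log_step(step: int, timestep: int, latents: torch.Tensor) -> None:
    # invoked every `callback_steps` steps during denoising
    print(f"step={step} t={timestep} latents={tuple(latents.shape)}")

# images = pipe(prompt, callback=log_step, callback_steps=5).images
```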
@@ -69,7 +69,7 @@ class ShapEPipelineOutput(BaseOutput):
     Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].
 
     Args:
-        images (`torch.FloatTensor`)
+        images (`torch.Tensor`)
            A list of images for 3D rendering.
     """
 
@@ -187,7 +187,7 @@ class ShapEPipeline(DiffusionPipeline):
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         guidance_scale: float = 4.0,
         frame_size: int = 64,
         output_type: Optional[str] = "pil",  # pil, np, latent, mesh
@@ -207,7 +207,7 @@ class ShapEPipeline(DiffusionPipeline):
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
@@ -70,7 +70,7 @@ class ShapEPipelineOutput(BaseOutput):
     Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].
 
     Args:
-        images (`torch.FloatTensor`)
+        images (`torch.Tensor`)
            A list of images for 3D rendering.
     """
 
@@ -86,7 +86,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
 
     Args:
         prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         image_encoder ([`~transformers.CLIPVisionModel`]):
            Frozen image-encoder.
         image_processor ([`~transformers.CLIPImageProcessor`]):
@@ -169,7 +169,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         guidance_scale: float = 4.0,
         frame_size: int = 64,
         output_type: Optional[str] = "pil",  # pil, np, latent, mesh
@@ -179,7 +179,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
        The call function to the pipeline for generation.
 
        Args:
-           image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+           image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image` or tensor representing an image batch to be used as the starting point. Can also accept image
                latents as image, but if passing latents directly it is not encoded again.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -190,7 +190,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
@@ -239,15 +239,15 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
 
         num_embeddings = self.prior.config.num_embeddings
         embedding_dim = self.prior.config.embedding_dim
-
-        latents = self.prepare_latents(
-            (batch_size, num_embeddings * embedding_dim),
-            image_embeds.dtype,
-            device,
-            generator,
-            latents,
-            self.scheduler,
-        )
+        if latents is None:
+            latents = self.prepare_latents(
+                (batch_size, num_embeddings * embedding_dim),
+                image_embeds.dtype,
+                device,
+                generator,
+                latents,
+                self.scheduler,
+            )
 
         # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim
         latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim)
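With the `if latents is None:` guard, caller-supplied latents are no longer regenerated inside the call, so the same starting noise can be replayed across invocations. A sketch under assumed names (`openai/shap-e-img2img` is the repo id this pipeline is usually published under; `image` is any input image prepared beforehand):

```py
import torch
from diffusers import ShapEImg2ImgPipeline

pipe = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img")
shape = (1, pipe.prior.config.num_embeddings * pipe.prior.config.embedding_dim)
latents = torch.randn(shape)

out_a = pipe(image, latents=latents, frame_size=64)
out_b = pipe(image, latents=latents, frame_size=64)  # starts from the same noise as out_a
```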
@@ -844,7 +844,7 @@ class ShapERenderer(ModelMixin, ConfigMixin):
        transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1],
        math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty).
 
-       args:
+       Args:
            rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples:
            number of ts to sample. prev_model_outputs: model outputs from the previous rendering step, including
 
@@ -100,8 +100,10 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
         )
         self.register_to_config(latent_dim_scale=latent_dim_scale)
 
-    def prepare_latents(self, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, scheduler):
-        batch_size, channels, height, width = image_embeddings.shape
+    def prepare_latents(
+        self, batch_size, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, scheduler
+    ):
+        _, channels, height, width = image_embeddings.shape
         latents_shape = (
             batch_size * num_images_per_prompt,
             4,
@@ -127,10 +129,10 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
         do_classifier_free_guidance,
         prompt=None,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
     ):
         if prompt_embeds is None:
             # get prompt text embeddings
@@ -283,18 +285,18 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
-        image_embeddings: Union[torch.FloatTensor, List[torch.FloatTensor]],
+        image_embeddings: Union[torch.Tensor, List[torch.Tensor]],
         prompt: Union[str, List[str]] = None,
         num_inference_steps: int = 10,
         guidance_scale: float = 0.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -304,7 +306,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
        Function invoked when calling the pipeline for generation.
 
        Args:
-           image_embedding (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+           image_embedding (`torch.Tensor` or `List[torch.Tensor]`):
                Image Embeddings either extracted from an image or generated by a Prior Model.
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
@@ -320,26 +322,26 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `decoder_guidance_scale` is less than `1`).
-           prompt_embeds (`torch.FloatTensor`, *optional*):
+           prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
-           prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-           negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
-           negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-               weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input
-               argument.
+               weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
+               input argument.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
@@ -383,7 +385,19 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
         )
         if isinstance(image_embeddings, list):
             image_embeddings = torch.cat(image_embeddings, dim=0)
-        batch_size = image_embeddings.shape[0]
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Compute the effective number of images per prompt
+        # We must account for the fact that the image embeddings from the prior can be generated with num_images_per_prompt > 1
+        # This results in a case where a single prompt is associated with multiple image embeddings
+        # Divide the number of image embeddings by the batch size to determine if this is the case.
+        num_images_per_prompt = num_images_per_prompt * (image_embeddings.shape[0] // batch_size)
 
         # 2. Encode caption
         if prompt_embeds is None and negative_prompt_embeds is None:
@@ -417,7 +431,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
 
         # 5. Prepare latents
         latents = self.prepare_latents(
-            image_embeddings, num_images_per_prompt, dtype, device, generator, latents, self.scheduler
+            batch_size, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, self.scheduler
         )
 
         # 6. Run denoising loop
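The effective-batch computation in the `@@ -383` hunk can be checked by hand. A worked example, assuming the prior was run with `num_images_per_prompt=2` on a single string prompt:

```py
# the prior produced 2 embeddings for 1 prompt -> image_embeddings.shape[0] == 2
batch_size = 1                                 # single string prompt
num_images_per_prompt = 1 * (2 // batch_size)  # == 2: one decode per prior embedding
```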
@@ -31,7 +31,10 @@ TEXT2IMAGE_EXAMPLE_DOC_STRING = """
        ```py
        >>> import torch
        >>> from diffusers import StableCascadeCombinedPipeline
-        >>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16)
+
+        >>> pipe = StableCascadeCombinedPipeline.from_pretrained(
+        ...     "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
+        ... )
        >>> pipe.enable_model_cpu_offload()
        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
        >>> images = pipe(prompt=prompt)
@@ -68,6 +71,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
     """
 
     _load_connected_pipes = True
+    _optional_components = ["prior_feature_extractor", "prior_image_encoder"]
 
     def __init__(
         self,
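Listing the prior's feature extractor and image encoder as `_optional_components` means the combined pipeline can be assembled without them. A hedged sketch (whether a given checkpoint ships these components varies):

```py
from diffusers import StableCascadeCombinedPipeline

pipe = StableCascadeCombinedPipeline.from_pretrained(
    "stabilityai/stable-cascade",
    prior_feature_extractor=None,  # optional as of 0.28.0
    prior_image_encoder=None,
)
```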
@@ -117,25 +121,25 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
        """
-        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
        """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
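The new `device` argument means offloading is no longer pinned to the default CUDA device addressed by `gpu_id`. A minimal sketch, assuming a second GPU is available:

```py
import torch
from diffusers import StableCascadeCombinedPipeline

pipe = StableCascadeCombinedPipeline.from_pretrained(
    "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
)
pipe.enable_model_cpu_offload(device="cuda:1")  # previously only gpu_id was accepted
```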
@@ -158,13 +162,13 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
         num_inference_steps: int = 12,
         decoder_guidance_scale: float = 0.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -183,17 +187,17 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
-           prompt_embeds (`torch.FloatTensor`, *optional*):
+           prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, text embeddings will be generated from `prompt` input argument.
-           prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, text embeddings will be generated from `prompt` input argument.
-           negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.*
                prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
-           negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.*
                prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
@@ -226,7 +230,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
@@ -242,7 +246,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
            prior_callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the
                list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in
-               the `._callback_tensor_inputs` attribute of your pipeine class.
+               the `._callback_tensor_inputs` attribute of your pipeline class.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -251,7 +255,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-               `._callback_tensor_inputs` attribute of your pipeine class.
+               `._callback_tensor_inputs` attribute of your pipeline class.
 
        Examples:
 
@@ -54,19 +54,19 @@ class StableCascadePriorPipelineOutput(BaseOutput):
     Output class for WuerstchenPriorPipeline.
 
     Args:
-        image_embeddings (`torch.FloatTensor` or `np.ndarray`)
+        image_embeddings (`torch.Tensor` or `np.ndarray`)
            Prior image embeddings for text prompt
-        prompt_embeds (`torch.FloatTensor`):
+        prompt_embeds (`torch.Tensor`):
            Text embeddings for the prompt.
-        negative_prompt_embeds (`torch.FloatTensor`):
+        negative_prompt_embeds (`torch.Tensor`):
            Text embeddings for the negative prompt.
     """
 
-    image_embeddings: Union[torch.FloatTensor, np.ndarray]
-    prompt_embeds: Union[torch.FloatTensor, np.ndarray]
-    prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray]
-    negative_prompt_embeds: Union[torch.FloatTensor, np.ndarray]
-    negative_prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray]
+    image_embeddings: Union[torch.Tensor, np.ndarray]
+    prompt_embeds: Union[torch.Tensor, np.ndarray]
+    prompt_embeds_pooled: Union[torch.Tensor, np.ndarray]
+    negative_prompt_embeds: Union[torch.Tensor, np.ndarray]
+    negative_prompt_embeds_pooled: Union[torch.Tensor, np.ndarray]
 
 
 class StableCascadePriorPipeline(DiffusionPipeline):
@@ -80,7 +80,8 @@ class StableCascadePriorPipeline(DiffusionPipeline):
         prior ([`StableCascadeUNet`]):
            The Stable Cascade prior to approximate the image embedding from the text and/or image embedding.
         text_encoder ([`CLIPTextModelWithProjection`]):
-            Frozen text-encoder ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
+            Frozen text-encoder
+            ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `image_encoder`.
         image_encoder ([`CLIPVisionModelWithProjection`]):
@@ -149,10 +150,10 @@ class StableCascadePriorPipeline(DiffusionPipeline):
         do_classifier_free_guidance,
         prompt=None,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
     ):
         if prompt_embeds is None:
             # get prompt text embeddings
@@ -373,14 +374,14 @@ class StableCascadePriorPipeline(DiffusionPipeline):
         timesteps: List[float] = None,
         guidance_scale: float = 4.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        image_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
         num_images_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pt",
         return_dict: bool = True,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -408,29 +409,29 @@ class StableCascadePriorPipeline(DiffusionPipeline):
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `decoder_guidance_scale` is less than `1`).
-           prompt_embeds (`torch.FloatTensor`, *optional*):
+           prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
-           prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-           negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
-           negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-               weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input
-               argument.
-           image_embeds (`torch.FloatTensor`, *optional*):
-               Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting.
-               If not provided, image embeddings will be generated from `image` input argument if existing.
+               weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
+               input argument.
+           image_embeds (`torch.Tensor`, *optional*):
+               Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. If
+               not provided, image embeddings will be generated from `image` input argument if existing.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
@@ -452,9 +453,9 @@ class StableCascadePriorPipeline(DiffusionPipeline):
        Examples:
 
        Returns:
-           [`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if
-           `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
-           generated image embeddings.
+           [`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if `return_dict` is
+           True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image
+           embeddings.
        """
 
        # 0. Define commonly used variables
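Taken together with the decoder changes above, the prior's output fields feed the decoder directly. A sketch using the checkpoint ids these pipelines are usually published under (dtype handling illustrative):

```py
import torch
from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline

prior = StableCascadePriorPipeline.from_pretrained(
    "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16
)
decoder = StableCascadeDecoderPipeline.from_pretrained(
    "stabilityai/stable-cascade", torch_dtype=torch.float16
)

prompt = "an astronaut riding a horse"
prior_out = prior(prompt=prompt, num_images_per_prompt=2)
# the decoder now derives batch_size from `prompt`, so the two embeddings per
# prompt are decoded without manual reshaping
images = decoder(
    image_embeddings=prior_out.image_embeddings.to(torch.float16),
    prompt=prompt,
).images
```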
@@ -113,7 +113,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     from .pipeline_stable_diffusion import (
         StableDiffusionPipeline,
         StableDiffusionPipelineOutput,
-        StableDiffusionSafetyChecker,
     )
     from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
     from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the Stable Diffusion checkpoints."""
+"""Conversion script for the Stable Diffusion checkpoints."""
 
 import re
 from contextlib import nullcontext
@@ -557,7 +557,7 @@ def convert_ldm_unet_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )
 
-           output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+           output_block_list = {k: sorted(v) for k, v in sorted(output_block_list.items())}
            if ["conv.bias", "conv.weight"] in output_block_list.values():
                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
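Sorting the dict items as well as each value list makes the `index(...)` lookup that follows deterministic regardless of checkpoint key order. A small illustration in plain Python:

```py
blocks = {"2": ["conv.weight", "conv.bias"], "0": ["norm.weight"]}
{k: sorted(v) for k, v in sorted(blocks.items())}
# -> {'0': ['norm.weight'], '2': ['conv.bias', 'conv.weight']}
```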
@@ -1153,6 +1153,8 @@ def download_from_original_stable_diffusion_ckpt(
     controlnet: Optional[bool] = None,
     adapter: Optional[bool] = None,
     load_safety_checker: bool = True,
+    safety_checker: Optional[StableDiffusionSafetyChecker] = None,
+    feature_extractor: Optional[AutoFeatureExtractor] = None,
     pipeline_class: DiffusionPipeline = None,
     local_files_only=False,
     vae_path=None,
@@ -1205,6 +1207,12 @@ def download_from_original_stable_diffusion_ckpt(
            If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
        load_safety_checker (`bool`, *optional*, defaults to `True`):
            Whether to load the safety checker or not. Defaults to `True`.
+       safety_checker (`StableDiffusionSafetyChecker`, *optional*, defaults to `None`):
+           Safety checker to use. If this parameter is `None`, the function will load a new instance of
+           [StableDiffusionSafetyChecker] by itself, if needed.
+       feature_extractor (`AutoFeatureExtractor`, *optional*, defaults to `None`):
+           Feature extractor to use. If this parameter is `None`, the function will load a new instance of
+           [AutoFeatureExtractor] by itself, if needed.
        pipeline_class (`str`, *optional*, defaults to `None`):
            The pipeline class to use. Pass `None` to determine automatically.
        local_files_only (`bool`, *optional*, defaults to `False`):
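The new parameters let callers pass in already-loaded components instead of having the function instantiate its own. A hedged sketch (the `.ckpt` path is illustrative):

```py
from transformers import AutoFeatureExtractor
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
    download_from_original_stable_diffusion_ckpt,
)
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker

safety_checker = StableDiffusionSafetyChecker.from_pretrained(
    "CompVis/stable-diffusion-safety-checker"
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "CompVis/stable-diffusion-safety-checker"
)
pipe = download_from_original_stable_diffusion_ckpt(
    "v1-5-pruned-emaonly.ckpt",  # illustrative local checkpoint path
    safety_checker=safety_checker,
    feature_extractor=feature_extractor,
)
```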
@@ -1530,8 +1538,8 @@ def download_from_original_stable_diffusion_ckpt(
            unet=unet,
            scheduler=scheduler,
            controlnet=controlnet,
-           safety_checker=None,
-           feature_extractor=None,
+           safety_checker=safety_checker,
+           feature_extractor=feature_extractor,
        )
        if hasattr(pipe, "requires_safety_checker"):
            pipe.requires_safety_checker = False
@@ -1551,8 +1559,8 @@ def download_from_original_stable_diffusion_ckpt(
            unet=unet,
            scheduler=scheduler,
            low_res_scheduler=low_res_scheduler,
-           safety_checker=None,
-           feature_extractor=None,
+           safety_checker=safety_checker,
+           feature_extractor=feature_extractor,
        )
 
    else:
@@ -1562,8 +1570,8 @@ def download_from_original_stable_diffusion_ckpt(
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
-           safety_checker=None,
-           feature_extractor=None,
+           safety_checker=safety_checker,
+           feature_extractor=feature_extractor,
        )
        if hasattr(pipe, "requires_safety_checker"):
            pipe.requires_safety_checker = False
@@ -1684,9 +1692,6 @@ def download_from_original_stable_diffusion_ckpt(
        feature_extractor = AutoFeatureExtractor.from_pretrained(
            "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
        )
-   else:
-       safety_checker = None
-       feature_extractor = None
 
    if controlnet:
        pipe = pipeline_class(
@@ -1838,6 +1843,8 @@ def download_controlnet_from_original_ckpt(
    while "state_dict" in checkpoint:
        checkpoint = checkpoint["state_dict"]
 
+   with open(original_config_file, "r") as f:
+       original_config_file = f.read()
    original_config = yaml.safe_load(original_config_file)
 
    if num_in_channels is not None:
@@ -288,7 +288,7 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
-           image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`):
+           image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.Tensor`):
                `Image`, or tensor representing an image batch which will be upscaled. *
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -329,7 +329,7 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
-               called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+               called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.