diffusers 0.28.2__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. diffusers/__init__.py +9 -1
  2. diffusers/commands/env.py +1 -5
  3. diffusers/dependency_versions_table.py +1 -1
  4. diffusers/image_processor.py +2 -1
  5. diffusers/loaders/__init__.py +2 -2
  6. diffusers/loaders/lora.py +406 -140
  7. diffusers/loaders/lora_conversion_utils.py +7 -1
  8. diffusers/loaders/single_file.py +1 -1
  9. diffusers/loaders/single_file_model.py +5 -0
  10. diffusers/loaders/single_file_utils.py +242 -2
  11. diffusers/loaders/unet.py +307 -272
  12. diffusers/models/__init__.py +5 -3
  13. diffusers/models/attention.py +125 -1
  14. diffusers/models/attention_processor.py +169 -1
  15. diffusers/models/autoencoders/__init__.py +1 -0
  16. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  17. diffusers/models/autoencoders/autoencoder_kl.py +17 -6
  18. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -2
  19. diffusers/models/autoencoders/consistency_decoder_vae.py +9 -9
  20. diffusers/models/autoencoders/vq_model.py +182 -0
  21. diffusers/models/controlnet_xs.py +6 -6
  22. diffusers/models/embeddings.py +112 -84
  23. diffusers/models/model_loading_utils.py +55 -0
  24. diffusers/models/modeling_utils.py +128 -17
  25. diffusers/models/normalization.py +11 -6
  26. diffusers/models/transformers/__init__.py +1 -0
  27. diffusers/models/transformers/dual_transformer_2d.py +5 -4
  28. diffusers/models/transformers/hunyuan_transformer_2d.py +149 -2
  29. diffusers/models/transformers/prior_transformer.py +5 -5
  30. diffusers/models/transformers/transformer_2d.py +2 -2
  31. diffusers/models/transformers/transformer_sd3.py +344 -0
  32. diffusers/models/transformers/transformer_temporal.py +12 -10
  33. diffusers/models/unets/unet_1d.py +3 -3
  34. diffusers/models/unets/unet_2d.py +3 -3
  35. diffusers/models/unets/unet_2d_condition.py +4 -15
  36. diffusers/models/unets/unet_3d_condition.py +5 -17
  37. diffusers/models/unets/unet_i2vgen_xl.py +4 -4
  38. diffusers/models/unets/unet_motion_model.py +4 -4
  39. diffusers/models/unets/unet_spatio_temporal_condition.py +3 -3
  40. diffusers/models/vq_model.py +8 -165
  41. diffusers/pipelines/__init__.py +2 -0
  42. diffusers/pipelines/animatediff/pipeline_animatediff.py +4 -3
  43. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +4 -3
  44. diffusers/pipelines/controlnet/pipeline_controlnet.py +4 -3
  45. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +4 -3
  46. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +4 -3
  47. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +4 -3
  48. diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
  49. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +4 -3
  50. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +4 -3
  51. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +4 -3
  52. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +4 -3
  53. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +4 -3
  54. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +24 -5
  55. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +4 -3
  56. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +4 -3
  57. diffusers/pipelines/marigold/marigold_image_processing.py +35 -20
  58. diffusers/pipelines/pia/pipeline_pia.py +4 -3
  59. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
  60. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
  61. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +17 -17
  62. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -3
  63. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  64. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -3
  65. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -3
  66. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +4 -3
  67. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +4 -3
  68. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -6
  69. diffusers/pipelines/stable_diffusion_3/__init__.py +52 -0
  70. diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
  71. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +886 -0
  72. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +923 -0
  73. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +4 -3
  74. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +10 -11
  75. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +4 -3
  76. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +4 -3
  77. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +4 -3
  78. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +4 -3
  79. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +4 -3
  80. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +4 -3
  81. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +4 -3
  82. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +4 -3
  83. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +4 -3
  84. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -3
  85. diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
  86. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +4 -3
  87. diffusers/schedulers/__init__.py +2 -0
  88. diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
  89. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -3
  90. diffusers/schedulers/scheduling_edm_euler.py +2 -4
  91. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +287 -0
  92. diffusers/schedulers/scheduling_lms_discrete.py +2 -2
  93. diffusers/training_utils.py +4 -4
  94. diffusers/utils/__init__.py +3 -0
  95. diffusers/utils/constants.py +2 -0
  96. diffusers/utils/dummy_pt_objects.py +30 -0
  97. diffusers/utils/dummy_torch_and_transformers_objects.py +30 -0
  98. diffusers/utils/dynamic_modules_utils.py +15 -13
  99. diffusers/utils/hub_utils.py +106 -0
  100. diffusers/utils/import_utils.py +0 -1
  101. diffusers/utils/logging.py +3 -1
  102. diffusers/utils/state_dict_utils.py +2 -0
  103. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/METADATA +45 -45
  104. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/RECORD +108 -111
  105. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/WHEEL +1 -1
  106. diffusers/models/dual_transformer_2d.py +0 -20
  107. diffusers/models/prior_transformer.py +0 -12
  108. diffusers/models/t5_film_transformer.py +0 -70
  109. diffusers/models/transformer_2d.py +0 -25
  110. diffusers/models/transformer_temporal.py +0 -34
  111. diffusers/models/unet_1d.py +0 -26
  112. diffusers/models/unet_1d_blocks.py +0 -203
  113. diffusers/models/unet_2d.py +0 -27
  114. diffusers/models/unet_2d_blocks.py +0 -375
  115. diffusers/models/unet_2d_condition.py +0 -25
  116. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/LICENSE +0 -0
  117. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/entry_points.txt +0 -0
  118. {diffusers-0.28.2.dist-info → diffusers-0.29.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -376,6 +376,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
 
         # 2. Define call parameters
         batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
 
         if editing_prompt:
             enable_edit_guidance = True
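The hunks that follow replace `self.device` with this cached `device` throughout the pipeline. The two differ once offloading is enabled: `DiffusionPipeline.device` reports where the module weights currently sit, while `_execution_device` resolves the device the offload hooks actually run on. A rough sketch of the distinction, assuming a CUDA device is present and the pipeline supports model offload:

```python
# Hedged sketch; the checkpoint name follows the semantic-guidance docs.
import torch
from diffusers import SemanticStableDiffusionPipeline

pipe = SemanticStableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()  # weights stay on CPU until each module runs

print(pipe.device)             # cpu, since the modules currently sit on CPU
print(pipe._execution_device)  # cuda:0, where the offload hooks run compute
```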
@@ -405,7 +406,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
             text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         bs_embed, seq_len, _ = text_embeddings.shape
@@ -433,9 +434,9 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                         f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                     )
                     edit_concepts_input_ids = edit_concepts_input_ids[:, : self.tokenizer.model_max_length]
-                edit_concepts = self.text_encoder(edit_concepts_input_ids.to(self.device))[0]
+                edit_concepts = self.text_encoder(edit_concepts_input_ids.to(device))[0]
             else:
-                edit_concepts = editing_prompt_embeddings.to(self.device).repeat(batch_size, 1, 1)
+                edit_concepts = editing_prompt_embeddings.to(device).repeat(batch_size, 1, 1)
 
             # duplicate text embeddings for each generation per prompt, using mps friendly method
             bs_embed_edit, seq_len_edit, _ = edit_concepts.shape
@@ -476,7 +477,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                 truncation=True,
                 return_tensors="pt",
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
@@ -493,7 +494,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
         # get the initial random noise unless the user supplied it
 
         # 4. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps
 
         # 5. Prepare latent variables
@@ -504,7 +505,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
             height,
             width,
             text_embeddings.dtype,
-            self.device,
+            device,
             generator,
             latents,
         )
@@ -562,12 +563,12 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                 if enable_edit_guidance:
                     concept_weights = torch.zeros(
                         (len(noise_pred_edit_concepts), noise_guidance.shape[0]),
-                        device=self.device,
+                        device=device,
                         dtype=noise_guidance.dtype,
                     )
                     noise_guidance_edit = torch.zeros(
                         (len(noise_pred_edit_concepts), *noise_guidance.shape),
-                        device=self.device,
+                        device=device,
                         dtype=noise_guidance.dtype,
                     )
                     # noise_guidance_edit = torch.zeros_like(noise_guidance)
@@ -644,21 +645,19 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
 
                         # noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp
 
-                    warmup_inds = torch.tensor(warmup_inds).to(self.device)
+                    warmup_inds = torch.tensor(warmup_inds).to(device)
                     if len(noise_pred_edit_concepts) > warmup_inds.shape[0] > 0:
                         concept_weights = concept_weights.to("cpu")  # Offload to cpu
                         noise_guidance_edit = noise_guidance_edit.to("cpu")
 
-                        concept_weights_tmp = torch.index_select(concept_weights.to(self.device), 0, warmup_inds)
+                        concept_weights_tmp = torch.index_select(concept_weights.to(device), 0, warmup_inds)
                         concept_weights_tmp = torch.where(
                             concept_weights_tmp < 0, torch.zeros_like(concept_weights_tmp), concept_weights_tmp
                         )
                         concept_weights_tmp = concept_weights_tmp / concept_weights_tmp.sum(dim=0)
                         # concept_weights_tmp = torch.nan_to_num(concept_weights_tmp)
 
-                        noise_guidance_edit_tmp = torch.index_select(
-                            noise_guidance_edit.to(self.device), 0, warmup_inds
-                        )
+                        noise_guidance_edit_tmp = torch.index_select(noise_guidance_edit.to(device), 0, warmup_inds)
                         noise_guidance_edit_tmp = torch.einsum(
                             "cb,cbijk->bijk", concept_weights_tmp, noise_guidance_edit_tmp
                         )
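For reference, the `"cb,cbijk->bijk"` contraction above is a weighted sum over the concept axis: per-concept weights of shape `(concepts, batch)` collapse per-concept guidance tensors of shape `(concepts, batch, C, H, W)` into a single guidance map. A self-contained check:

```python
# Self-contained check of the einsum above; shapes are illustrative.
import torch

c, b, ch, h, w = 3, 2, 4, 8, 8  # concepts, batch, channels, height, width
concept_weights_tmp = torch.rand(c, b)
noise_guidance_edit_tmp = torch.randn(c, b, ch, h, w)

combined = torch.einsum("cb,cbijk->bijk", concept_weights_tmp, noise_guidance_edit_tmp)
assert combined.shape == (b, ch, h, w)

# Equivalent formulation: broadcast the weights and sum over dim 0 (concepts).
manual = (concept_weights_tmp[:, :, None, None, None] * noise_guidance_edit_tmp).sum(dim=0)
assert torch.allclose(combined, manual, atol=1e-6)
```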
@@ -669,8 +668,8 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
 
                         del noise_guidance_edit_tmp
                         del concept_weights_tmp
-                        concept_weights = concept_weights.to(self.device)
-                        noise_guidance_edit = noise_guidance_edit.to(self.device)
+                        concept_weights = concept_weights.to(device)
+                        noise_guidance_edit = noise_guidance_edit.to(device)
 
                     concept_weights = torch.where(
                         concept_weights < 0, torch.zeros_like(concept_weights), concept_weights
@@ -679,6 +678,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                     concept_weights = torch.nan_to_num(concept_weights)
 
                     noise_guidance_edit = torch.einsum("cb,cbijk->bijk", concept_weights, noise_guidance_edit)
+                    noise_guidance_edit = noise_guidance_edit.to(edit_momentum.device)
 
                     noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum
 
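The added `.to(edit_momentum.device)` guards the momentum update: `noise_guidance_edit` may have been offloaded to the CPU a few lines earlier, and mixing it with an accelerator-resident `edit_momentum` would raise a device-mismatch error. A minimal sketch of the failure mode, with illustrative shapes:

```python
# Minimal sketch of the failure this line prevents; shapes are illustrative.
import torch

if torch.cuda.is_available():
    edit_momentum = torch.zeros(2, 4, 64, 64, device="cuda")
    noise_guidance_edit = torch.randn(2, 4, 64, 64)  # left on CPU by the offload path
    edit_momentum_scale = 0.1

    # noise_guidance_edit + edit_momentum_scale * edit_momentum would raise
    # "Expected all tensors to be on the same device"; align devices first:
    noise_guidance_edit = noise_guidance_edit.to(edit_momentum.device)
    noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum
```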
@@ -689,7 +689,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                         self.sem_guidance[i] = noise_guidance_edit.detach().cpu()
 
                 if sem_guidance is not None:
-                    edit_guidance = sem_guidance[i].to(self.device)
+                    edit_guidance = sem_guidance[i].to(device)
                     noise_guidance = noise_guidance + edit_guidance
 
                 noise_pred = noise_pred_uncond + noise_guidance
@@ -705,7 +705,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
         # 8. Post-processing
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
-            image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype)
+            image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
         else:
             image = latents
             has_nsfw_concept = None
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -474,9 +474,10 @@ class StableDiffusionPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
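The same `self.text_encoder is not None` guard is applied below to the depth2img, img2img, inpaint, upscale, and unCLIP pipelines. It covers pipelines constructed without a text encoder and driven entirely by precomputed embeddings; roughly, and with hypothetical embedding files:

```python
# Hedged sketch; the embedding files are hypothetical placeholders.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", text_encoder=None, tokenizer=None
)

prompt_embeds = torch.load("prompt_embeds.pt")                    # hypothetical
negative_prompt_embeds = torch.load("negative_prompt_embeds.pt")  # hypothetical

# Before 0.29.0, encode_prompt()'s LoRA-unscaling epilogue dereferenced
# self.text_encoder whenever the PEFT backend was active, which fails when
# the component is None.
image = pipe(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
).images[0]
```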
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
@@ -357,9 +357,10 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
@@ -545,7 +546,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
 
         if depth_map is None:
             pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values
-            pixel_values = pixel_values.to(device=device)
+            pixel_values = pixel_values.to(device=device, dtype=dtype)
             # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16.
             # So we use `torch.autocast` here for half precision inference.
             if torch.backends.mps.is_available():
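Image processors return float32 tensors regardless of the pipeline's dtype, so a half-precision depth estimator previously received mismatched fp32 inputs; the added `dtype=dtype` cast fixes that. A sketch of the mismatch, with an illustrative DPT checkpoint:

```python
# Sketch of the dtype mismatch fixed above; the processor checkpoint is illustrative.
import numpy as np
import torch
from PIL import Image
from transformers import DPTImageProcessor

processor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
image = Image.fromarray(np.zeros((384, 384, 3), dtype=np.uint8))

pixel_values = processor(images=image, return_tensors="pt").pixel_values
print(pixel_values.dtype)  # torch.float32, whatever the pipeline dtype is

# The fix adds the dtype to the device move, so a fp16 depth estimator
# receives fp16 inputs:
pixel_values = pixel_values.to(device="cpu", dtype=torch.float16)
print(pixel_values.dtype)  # torch.float16
```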
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -517,9 +517,10 @@ class StableDiffusionImg2ImgPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -589,9 +589,10 @@ class StableDiffusionInpaintPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -377,9 +377,10 @@ class StableDiffusionUpscalePipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
@@ -458,9 +458,10 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
@@ -51,8 +51,8 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import StableUnCLIPImg2ImgPipeline
 
         >>> pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
-        ...     "fusing/stable-unclip-2-1-l-img2img", torch_dtype=torch.float16
-        ... )  # TODO update model path
+        ...     "stabilityai/stable-diffusion-2-1-unclip-small", torch_dtype=torch.float16
+        ... )
         >>> pipe = pipe.to("cuda")
 
         >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
@@ -63,7 +63,7 @@ EXAMPLE_DOC_STRING = """
 
         >>> prompt = "A fantasy landscape, trending on artstation"
 
-        >>> images = pipe(prompt, init_image).images
+        >>> images = pipe(init_image, prompt).images
         >>> images[0].save("fantasy_landscape.png")
         ```
 """
@@ -422,9 +422,10 @@ class StableUnCLIPImg2ImgPipeline(
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
+        if self.text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
 
         return prompt_embeds, negative_prompt_embeds
 
diffusers/pipelines/stable_diffusion_3/__init__.py (new file)
@@ -0,0 +1,52 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["StableDiffusion3PipelineOutput"]}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_stable_diffusion_3"] = ["StableDiffusion3Pipeline"]
+    _import_structure["pipeline_stable_diffusion_3_img2img"] = ["StableDiffusion3Img2ImgPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_stable_diffusion_3 import StableDiffusion3Pipeline
+        from .pipeline_stable_diffusion_3_img2img import StableDiffusion3Img2ImgPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
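Once this module is registered, the new pipelines import lazily from the top-level package. A usage sketch in line with the 0.29.0 release notes; the SD3 checkpoint is gated and requires accepted access:

```python
# Usage sketch for the newly registered pipeline.
import torch
from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")

image = pipe(
    "A cat holding a sign that says hello world",
    num_inference_steps=28,
    guidance_scale=7.0,
).images[0]
image.save("sd3.png")
```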
diffusers/pipelines/stable_diffusion_3/pipeline_output.py (new file)
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class StableDiffusion3PipelineOutput(BaseOutput):
+    """
+    Output class for Stable Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
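Like other `BaseOutput` subclasses, the new output class supports both attribute access and conversion to a plain tuple, which is what pipelines return when called with `return_dict=False`. A minimal sketch with an illustrative array shape:

```python
# Minimal sketch of BaseOutput behavior; the array shape is illustrative.
import numpy as np
from diffusers.pipelines.stable_diffusion_3 import StableDiffusion3PipelineOutput

out = StableDiffusion3PipelineOutput(images=np.zeros((1, 64, 64, 3)))
print(out.images.shape)  # (1, 64, 64, 3)

(images,) = out.to_tuple()  # the same field as a plain tuple, cf. return_dict=False
```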