diffusers 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +7 -2
 - diffusers/configuration_utils.py +4 -0
 - diffusers/loaders.py +262 -12
 - diffusers/models/attention.py +31 -12
 - diffusers/models/attention_processor.py +189 -0
 - diffusers/models/controlnet.py +9 -2
 - diffusers/models/embeddings.py +66 -0
 - diffusers/models/modeling_pytorch_flax_utils.py +6 -0
 - diffusers/models/modeling_utils.py +5 -2
 - diffusers/models/transformer_2d.py +1 -1
 - diffusers/models/unet_2d_condition.py +45 -6
 - diffusers/models/vae.py +3 -0
 - diffusers/pipelines/__init__.py +8 -0
 - diffusers/pipelines/alt_diffusion/modeling_roberta_series.py +25 -10
 - diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +8 -0
 - diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +8 -0
 - diffusers/pipelines/audioldm/pipeline_audioldm.py +1 -1
 - diffusers/pipelines/deepfloyd_if/__init__.py +54 -0
 - diffusers/pipelines/deepfloyd_if/pipeline_if.py +854 -0
 - diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +979 -0
 - diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +1097 -0
 - diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +1098 -0
 - diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +1208 -0
 - diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +947 -0
 - diffusers/pipelines/deepfloyd_if/safety_checker.py +59 -0
 - diffusers/pipelines/deepfloyd_if/timesteps.py +579 -0
 - diffusers/pipelines/deepfloyd_if/watermark.py +46 -0
 - diffusers/pipelines/pipeline_utils.py +54 -25
 - diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +37 -20
 - diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py +1 -1
 - diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +12 -1
 - diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +10 -2
 - diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +10 -8
 - diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +59 -4
 - diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +9 -2
 - diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +10 -2
 - diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +9 -2
 - diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +22 -12
 - diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +9 -2
 - diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +34 -30
 - diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +93 -10
 - diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +45 -6
 - diffusers/schedulers/scheduling_ddpm.py +63 -16
 - diffusers/schedulers/scheduling_heun_discrete.py +51 -1
 - diffusers/utils/__init__.py +4 -1
 - diffusers/utils/dummy_torch_and_transformers_objects.py +80 -5
 - diffusers/utils/dynamic_modules_utils.py +1 -1
 - diffusers/utils/hub_utils.py +4 -1
 - diffusers/utils/import_utils.py +41 -0
 - diffusers/utils/pil_utils.py +24 -0
 - diffusers/utils/testing_utils.py +10 -0
 - {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/METADATA +1 -1
 - {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/RECORD +57 -47
 - {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/LICENSE +0 -0
 - {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/WHEEL +0 -0
 - {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/entry_points.txt +0 -0
 - {diffusers-0.15.1.dist-info → diffusers-0.16.1.dist-info}/top_level.txt +0 -0
 
| 
         @@ -0,0 +1,1098 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            import html
         
     | 
| 
      
 2 
     | 
    
         
            +
            import inspect
         
     | 
| 
      
 3 
     | 
    
         
            +
            import re
         
     | 
| 
      
 4 
     | 
    
         
            +
            import urllib.parse as ul
         
     | 
| 
      
 5 
     | 
    
         
            +
            from typing import Any, Callable, Dict, List, Optional, Union
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            import numpy as np
         
     | 
| 
      
 8 
     | 
    
         
            +
            import PIL
         
     | 
| 
      
 9 
     | 
    
         
            +
            import torch
         
     | 
| 
      
 10 
     | 
    
         
            +
            from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            from ...models import UNet2DConditionModel
         
     | 
| 
      
 13 
     | 
    
         
            +
            from ...schedulers import DDPMScheduler
         
     | 
| 
      
 14 
     | 
    
         
            +
            from ...utils import (
         
     | 
| 
      
 15 
     | 
    
         
            +
                BACKENDS_MAPPING,
         
     | 
| 
      
 16 
     | 
    
         
            +
                PIL_INTERPOLATION,
         
     | 
| 
      
 17 
     | 
    
         
            +
                is_accelerate_available,
         
     | 
| 
      
 18 
     | 
    
         
            +
                is_accelerate_version,
         
     | 
| 
      
 19 
     | 
    
         
            +
                is_bs4_available,
         
     | 
| 
      
 20 
     | 
    
         
            +
                is_ftfy_available,
         
     | 
| 
      
 21 
     | 
    
         
            +
                logging,
         
     | 
| 
      
 22 
     | 
    
         
            +
                randn_tensor,
         
     | 
| 
      
 23 
     | 
    
         
            +
                replace_example_docstring,
         
     | 
| 
      
 24 
     | 
    
         
            +
            )
         
     | 
| 
      
 25 
     | 
    
         
            +
            from ..pipeline_utils import DiffusionPipeline
         
     | 
| 
      
 26 
     | 
    
         
            +
            from . import IFPipelineOutput
         
     | 
| 
      
 27 
     | 
    
         
            +
            from .safety_checker import IFSafetyChecker
         
     | 
| 
      
 28 
     | 
    
         
            +
            from .watermark import IFWatermarker
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
      
 31 
     | 
    
         
            +
            logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
      
 33 
     | 
    
         
            +
            if is_bs4_available():
         
     | 
| 
      
 34 
     | 
    
         
            +
                from bs4 import BeautifulSoup
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
            if is_ftfy_available():
         
     | 
| 
      
 37 
     | 
    
         
            +
                import ftfy
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
            # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize
         
     | 
| 
      
 41 
     | 
    
         
            +
            def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image:
                """Resize an image so that its shorter side equals ``img_size``.

                The longer side is scaled by the original width/height ratio and
                rounded to a multiple of 8. Square inputs become ``img_size`` x
                ``img_size``.

                Args:
                    images: The image to resize; its ``size`` is read as ``(width, height)``.
                    img_size: Target length, in pixels, of the shorter side.

                Returns:
                    The result of ``images.resize`` with bicubic resampling.
                """
                src_width, src_height = images.size
                aspect_ratio = src_width / src_height

                if aspect_ratio >= 1:
                    # Landscape or square: height is fixed, width follows the ratio.
                    target_size = (int(round(img_size / 8 * aspect_ratio) * 8), img_size)
                else:
                    # Portrait: width is fixed, height follows the inverse ratio.
                    target_size = (img_size, int(round(img_size / 8 / aspect_ratio) * 8))

                return images.resize(target_size, resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None)
         
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
            # Usage example for the two-stage inpainting flow: stage I runs
            # `IFInpaintingPipeline`, stage II feeds its output into
            # `IFInpaintingSuperResolutionPipeline`.
            EXAMPLE_DOC_STRING = """
                Examples:
                    ```py
                    >>> from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline
                    >>> from diffusers.utils import pt_to_pil
                    >>> import torch
                    >>> from PIL import Image
                    >>> import requests
                    >>> from io import BytesIO

                    >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png"
                    >>> response = requests.get(url)
                    >>> original_image = Image.open(BytesIO(response.content)).convert("RGB")

                    >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png"
                    >>> response = requests.get(url)
                    >>> mask_image = Image.open(BytesIO(response.content))

                    >>> pipe = IFInpaintingPipeline.from_pretrained(
                    ...     "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
                    ... )
                    >>> pipe.enable_model_cpu_offload()

                    >>> prompt = "blue sunglasses"
                    >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)

                    >>> image = pipe(
                    ...     image=original_image,
                    ...     mask_image=mask_image,
                    ...     prompt_embeds=prompt_embeds,
                    ...     negative_prompt_embeds=negative_embeds,
                    ...     output_type="pt",
                    ... ).images

                    >>> # save intermediate image
                    >>> pil_image = pt_to_pil(image)
                    >>> pil_image[0].save("./if_stage_I.png")

                    >>> super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained(
                    ...     "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
                    ... )
                    >>> super_res_1_pipe.enable_model_cpu_offload()

                    >>> image = super_res_1_pipe(
                    ...     image=image,
                    ...     mask_image=mask_image,
                    ...     original_image=original_image,
                    ...     prompt_embeds=prompt_embeds,
                    ...     negative_prompt_embeds=negative_embeds,
                    ... ).images
                    >>> image[0].save("./if_stage_II.png")
                    ```
            """
         
     | 
| 
      
 113 
     | 
    
         
            +
             
     | 
| 
      
 114 
     | 
    
         
            +
             
     | 
| 
      
 115 
     | 
    
         
            +
            class IFInpaintingPipeline(DiffusionPipeline):
         
     | 
| 
      
 116 
     | 
    
         
            +
                tokenizer: T5Tokenizer
         
     | 
| 
      
 117 
     | 
    
         
            +
                text_encoder: T5EncoderModel
         
     | 
| 
      
 118 
     | 
    
         
            +
             
     | 
| 
      
 119 
     | 
    
         
            +
                unet: UNet2DConditionModel
         
     | 
| 
      
 120 
     | 
    
         
            +
                scheduler: DDPMScheduler
         
     | 
| 
      
 121 
     | 
    
         
            +
             
     | 
| 
      
 122 
     | 
    
         
            +
                feature_extractor: Optional[CLIPImageProcessor]
         
     | 
| 
      
 123 
     | 
    
         
            +
                safety_checker: Optional[IFSafetyChecker]
         
     | 
| 
      
 124 
     | 
    
         
            +
             
     | 
| 
      
 125 
     | 
    
         
            +
                watermarker: Optional[IFWatermarker]
         
     | 
| 
      
 126 
     | 
    
         
            +
             
     | 
| 
      
 127 
     | 
    
         
            +
                # Matches one or more consecutive characters from a fixed set of
                # symbols, brackets, slashes, pipe and asterisk.
                # Every escaped fragment is a raw string so the backslash reaches the
                # regex engine as written; non-raw "\)" style literals are invalid
                # string escapes and trigger warnings on current Python versions.
                bad_punct_regex = re.compile(
                    r"["
                    + "#®•©™&@·º½¾¿¡§~"
                    + r"\)"
                    + r"\("
                    + r"\]"
                    + r"\["
                    + r"\}"
                    + r"\{"
                    + r"\|"
                    + r"\\"
                    + r"\/"
                    + r"\*"
                    + r"]{1,}"
                )  # noqa
         
     | 
| 
      
 130 
     | 
    
         
            +
             
     | 
| 
      
 131 
     | 
    
         
            +
                _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
         
     | 
| 
      
 132 
     | 
    
         
            +
             
     | 
| 
      
 133 
     | 
    
         
            +
                def __init__(
                    self,
                    tokenizer: T5Tokenizer,
                    text_encoder: T5EncoderModel,
                    unet: UNet2DConditionModel,
                    scheduler: DDPMScheduler,
                    safety_checker: Optional[IFSafetyChecker],
                    feature_extractor: Optional[CLIPImageProcessor],
                    watermarker: Optional[IFWatermarker],
                    requires_safety_checker: bool = True,
                ):
                    """Register the pipeline components and validate the safety setup.

                    Args:
                        tokenizer: Registered as the `tokenizer` module.
                        text_encoder: Registered as the `text_encoder` module.
                        unet: Registered as the `unet` module.
                        scheduler: Registered as the `scheduler` module.
                        safety_checker: Optional safety checker. Passing `None` while
                            `requires_safety_checker` is true only logs a warning.
                        feature_extractor: Must be provided whenever `safety_checker`
                            is provided.
                        watermarker: Registered as the `watermarker` module.
                        requires_safety_checker: Stored in the pipeline config.

                    Raises:
                        ValueError: If `safety_checker` is given without a
                            `feature_extractor`.
                    """
                    super().__init__()

                    if safety_checker is None and requires_safety_checker:
                        logger.warning(
                            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                            " that you abide to the conditions of the IF license and do not expose unfiltered"
                            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
                            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
                            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
                            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
                        )

                    if safety_checker is not None and feature_extractor is None:
                        # The first fragment must be an f-string, otherwise the
                        # `{self.__class__}` placeholder is emitted literally.
                        raise ValueError(
                            f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
                            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
                        )

                    self.register_modules(
                        tokenizer=tokenizer,
                        text_encoder=text_encoder,
                        unet=unet,
                        scheduler=scheduler,
                        safety_checker=safety_checker,
                        feature_extractor=feature_extractor,
                        watermarker=watermarker,
                    )
                    self.register_to_config(requires_safety_checker=requires_safety_checker)
         
     | 
| 
      
 172 
     | 
    
         
            +
             
     | 
| 
      
 173 
     | 
    
         
            +
                # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload
         
     | 
| 
      
 174 
     | 
    
         
            +
                def enable_sequential_cpu_offload(self, gpu_id=0):
                    r"""
                    Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the
                    pipeline's models have their state dicts saved to CPU and then are moved to a `torch.device('meta')`
                    and loaded to GPU only when their specific submodule has its `forward` method called.

                    Args:
                        gpu_id: Index of the CUDA device the submodules execute on (`cuda:{gpu_id}`).

                    Raises:
                        ImportError: If accelerate is not installed.
                    """
                    if not is_accelerate_available():
                        raise ImportError("Please install accelerate via `pip install accelerate`")

                    from accelerate import cpu_offload

                    execution_device = torch.device(f"cuda:{gpu_id}")

                    # Components that are not loaded (None) are skipped.
                    for component in (self.text_encoder, self.unet):
                        if component is None:
                            continue
                        cpu_offload(component, execution_device)

                    # The safety checker is offloaded together with its buffers.
                    if self.safety_checker is not None:
                        cpu_offload(self.safety_checker, execution_device=execution_device, offload_buffers=True)
         
     | 
| 
      
 197 
     | 
    
         
            +
             
     | 
| 
      
 198 
     | 
    
         
            +
                # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
         
     | 
| 
      
 199 
     | 
    
         
            +
                def enable_model_cpu_offload(self, gpu_id=0):
                    r"""
                    Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance.
                    Unlike `enable_sequential_cpu_offload`, one whole model at a time is moved to the GPU when its
                    `forward` method is called, and it stays there until the next model runs. Memory savings are lower
                    than with `enable_sequential_cpu_offload`, but performance is much better due to the iterative
                    execution of the `unet`.

                    Args:
                        gpu_id: Index of the CUDA device the models execute on (`cuda:{gpu_id}`).

                    Raises:
                        ImportError: If accelerate is missing or older than 0.17.0.
                    """
                    if not (is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0")):
                        raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

                    from accelerate import cpu_offload_with_hook

                    execution_device = torch.device(f"cuda:{gpu_id}")

                    if self.device.type != "cpu":
                        self.to("cpu", silence_dtype_warnings=True)
                        # Without this the memory savings are not visible (but they probably exist).
                        torch.cuda.empty_cache()

                    # Each hook is passed to the next call as `prev_module_hook`, chaining
                    # text encoder -> unet -> safety checker.
                    chained_hook = None

                    if self.text_encoder is not None:
                        _, chained_hook = cpu_offload_with_hook(self.text_encoder, execution_device, prev_module_hook=None)

                        # Accelerate moves the next model onto the device _before_ running the
                        # previous model's offload hook, so both would sit on the device at once.
                        # IF's T5 text encoder is really large, so its hook is kept to let it be
                        # moved back to the CPU manually before the unet goes to the GPU.
                        self.text_encoder_offload_hook = chained_hook

                    _, unet_hook = cpu_offload_with_hook(self.unet, execution_device, prev_module_hook=chained_hook)

                    # When the safety checker is not called, this hook has to be used to offload
                    # the unet manually.
                    self.unet_offload_hook = unet_hook

                    last_hook = unet_hook
                    if self.safety_checker is not None:
                        _, last_hook = cpu_offload_with_hook(
                            self.safety_checker, execution_device, prev_module_hook=unet_hook
                        )

                    # The last model in the chain is offloaded manually.
                    self.final_offload_hook = last_hook
         
     | 
| 
      
 239 
     | 
    
         
            +
             
     | 
| 
      
 240 
     | 
    
         
            +
                # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
         
     | 
| 
      
 241 
     | 
    
         
            +
                def remove_all_hooks(self):
                    r"""
                    Detach the Accelerate hooks from the text encoder, unet and safety checker and reset the stored
                    offload-hook handles to `None`.

                    Components that are `None` are skipped.

                    Raises:
                        ImportError: if `accelerate` is not available.
                    """
                    if not is_accelerate_available():
                        raise ImportError("Please install accelerate via `pip install accelerate`")

                    from accelerate.hooks import remove_hook_from_module

                    hooked_components = (self.text_encoder, self.unet, self.safety_checker)
                    for component in hooked_components:
                        if component is None:
                            continue
                        remove_hook_from_module(component, recurse=True)

                    # Forget the hook handles kept on the pipeline itself.
                    self.unet_offload_hook = None
                    self.text_encoder_offload_hook = None
                    self.final_offload_hook = None
         
     | 
| 
      
 254 
     | 
    
         
            +
             
     | 
| 
      
 255 
     | 
    
         
            +
                @property
                # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
                def _execution_device(self):
                    r"""
                    Device on which the pipeline's models are executed.

                    If the unet carries no Accelerate hook (`_hf_hook`), this is `self.device`. Otherwise the unet's
                    sub-modules are scanned in order and the first hook exposing a non-`None` `execution_device` decides
                    the result; `self.device` is the fallback when no such hook exists. This matters after
                    `pipeline.enable_sequential_cpu_offload()`, where the device can only be inferred from the hooks.
                    """
                    if hasattr(self.unet, "_hf_hook"):
                        for submodule in self.unet.modules():
                            # A missing hook and a hook without `execution_device` both collapse to `None` here.
                            hook = getattr(submodule, "_hf_hook", None)
                            hook_device = getattr(hook, "execution_device", None)
                            if hook_device is not None:
                                return torch.device(hook_device)
                    return self.device
         
     | 
| 
      
 273 
     | 
    
         
            +
             
     | 
| 
      
 274 
     | 
    
         
            +
                @torch.no_grad()
                # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
                def encode_prompt(
                    self,
                    prompt,
                    do_classifier_free_guidance=True,
                    num_images_per_prompt=1,
                    device=None,
                    negative_prompt=None,
                    prompt_embeds: Optional[torch.FloatTensor] = None,
                    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
                    clean_caption: bool = False,
                ):
                    r"""
                    Encodes the prompt into text encoder hidden states.

                    Args:
                        prompt (`str` or `List[str]`, *optional*):
                            prompt to be encoded
                        do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                            whether to use classifier free guidance or not
                        num_images_per_prompt (`int`, *optional*, defaults to 1):
                            number of images that should be generated per prompt
                        device: (`torch.device`, *optional*):
                            torch device to place the resulting embeddings on. Defaults to `self._execution_device`.
                        negative_prompt (`str` or `List[str]`, *optional*):
                            The prompt or prompts not to guide the image generation. If not defined, one has to pass
                            `negative_prompt_embeds` instead. A single string is reused for every entry of the batch.
                            Ignored when `do_classifier_free_guidance` is `False`.
                        prompt_embeds (`torch.FloatTensor`, *optional*):
                            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                            provided, text embeddings will be generated from `prompt` input argument.
                        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                            argument.
                        clean_caption (`bool`, defaults to `False`):
                            Forwarded to `self._text_preprocessing` for both the prompt and the negative prompt.

                    Returns:
                        `tuple`: `(prompt_embeds, negative_prompt_embeds)`, each repeated `num_images_per_prompt` times
                        along the batch dimension. `negative_prompt_embeds` is `None` when `do_classifier_free_guidance`
                        is `False`.

                    Raises:
                        TypeError: if `prompt` and `negative_prompt` are both given but have different types.
                        ValueError: if `negative_prompt` is a list whose length differs from the batch size.
                    """
                    if prompt is not None and negative_prompt is not None:
                        if type(prompt) is not type(negative_prompt):
                            raise TypeError(
                                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                                f" {type(prompt)}."
                            )

                    if device is None:
                        device = self._execution_device

                    if prompt is not None and isinstance(prompt, str):
                        batch_size = 1
                    elif prompt is not None and isinstance(prompt, list):
                        batch_size = len(prompt)
                    else:
                        batch_size = prompt_embeds.shape[0]

                    # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF
                    max_length = 77

                    if prompt_embeds is None:
                        prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
                        text_inputs = self.tokenizer(
                            prompt,
                            padding="max_length",
                            max_length=max_length,
                            truncation=True,
                            add_special_tokens=True,
                            return_tensors="pt",
                        )
                        text_input_ids = text_inputs.input_ids
                        # Tokenize again without truncation purely to report what was cut off.
                        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

                        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                            text_input_ids, untruncated_ids
                        ):
                            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
                            logger.warning(
                                "The following part of your input was truncated because the text encoder can only handle"
                                f" sequences up to {max_length} tokens: {removed_text}"
                            )

                        attention_mask = text_inputs.attention_mask.to(device)

                        prompt_embeds = self.text_encoder(
                            text_input_ids.to(device),
                            attention_mask=attention_mask,
                        )
                        prompt_embeds = prompt_embeds[0]

                    # Pick the dtype of whichever model is loaded; `None` leaves the embeddings' dtype untouched.
                    if self.text_encoder is not None:
                        dtype = self.text_encoder.dtype
                    elif self.unet is not None:
                        dtype = self.unet.dtype
                    else:
                        dtype = None

                    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

                    bs_embed, seq_len, _ = prompt_embeds.shape
                    # duplicate text embeddings for each generation per prompt, using mps friendly method
                    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
                    prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

                    # get unconditional embeddings for classifier free guidance
                    if do_classifier_free_guidance and negative_prompt_embeds is None:
                        uncond_tokens: List[str]
                        if negative_prompt is None:
                            uncond_tokens = [""] * batch_size
                        elif isinstance(negative_prompt, str):
                            # Repeat the single negative prompt per batch entry; with batched `prompt_embeds` a lone
                            # row could not be reshaped to `batch_size * num_images_per_prompt` rows below.
                            uncond_tokens = [negative_prompt] * batch_size
                        elif batch_size != len(negative_prompt):
                            raise ValueError(
                                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                                " the batch size of `prompt`."
                            )
                        else:
                            uncond_tokens = negative_prompt

                        uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
                        # Pad the unconditional input to the same sequence length as the prompt embeddings.
                        max_length = prompt_embeds.shape[1]
                        uncond_input = self.tokenizer(
                            uncond_tokens,
                            padding="max_length",
                            max_length=max_length,
                            truncation=True,
                            return_attention_mask=True,
                            add_special_tokens=True,
                            return_tensors="pt",
                        )
                        attention_mask = uncond_input.attention_mask.to(device)

                        negative_prompt_embeds = self.text_encoder(
                            uncond_input.input_ids.to(device),
                            attention_mask=attention_mask,
                        )
                        negative_prompt_embeds = negative_prompt_embeds[0]

                    if do_classifier_free_guidance:
                        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
                        seq_len = negative_prompt_embeds.shape[1]

                        negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)

                        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
                        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

                        # The conditional and unconditional embeddings are returned separately; they are not
                        # concatenated into a single batch here.
                    else:
                        negative_prompt_embeds = None

                    return prompt_embeds, negative_prompt_embeds
         
     | 
| 
      
 426 
     | 
    
         
            +
             
     | 
| 
      
 427 
     | 
    
         
            +
                # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker
         
     | 
| 
      
 428 
     | 
    
         
            +
                def run_safety_checker(self, image, device, dtype):
                    r"""
                    Run the safety checker on `image` when one is configured.

                    Args:
                        image: images handed to `self.numpy_to_pil` and to the safety checker.
                        device: device the feature extractor output is moved to.
                        dtype: dtype the extracted pixel values are cast to before they reach the checker.

                    Returns:
                        `tuple`: `(image, nsfw_detected, watermark_detected)`. Without a safety checker, `image` is
                        returned as passed in and both flags are `None`.
                    """
                    if self.safety_checker is None:
                        # No checker runs after the unet, so the unet is offloaded by hand when a hook is stored.
                        unet_hook = getattr(self, "unet_offload_hook", None)
                        if unet_hook is not None:
                            unet_hook.offload()
                        return image, None, None

                    checker_features = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
                    clip_input = checker_features.pixel_values.to(dtype=dtype)
                    image, nsfw_detected, watermark_detected = self.safety_checker(images=image, clip_input=clip_input)
                    return image, nsfw_detected, watermark_detected
         
     | 
| 
      
 443 
     | 
    
         
            +
             
     | 
| 
      
 444 
     | 
    
         
            +
                # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs
         
     | 
| 
      
 445 
     | 
    
         
            +
                def prepare_extra_step_kwargs(self, generator, eta):
                    r"""
                    Build the optional keyword arguments for `self.scheduler.step`.

                    Schedulers do not share one `step` signature, so `eta` and `generator` are only included when the
                    scheduler's `step` declares a parameter of that name. `eta` (η) is only used with the DDIMScheduler
                    and corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502); it should be between
                    [0, 1].

                    Returns:
                        `dict`: the subset of `{"eta": eta, "generator": generator}` that `step` accepts.
                    """
                    step_parameters = inspect.signature(self.scheduler.step).parameters
                    optional_arguments = (("eta", eta), ("generator", generator))
                    return {key: value for key, value in optional_arguments if key in step_parameters}
         
     | 
| 
      
 461 
     | 
    
         
            +
             
     | 
| 
      
 462 
     | 
    
         
            +
                def check_inputs(
         
     | 
| 
      
 463 
     | 
    
         
            +
                    self,
         
     | 
| 
      
 464 
     | 
    
         
            +
                    prompt,
         
     | 
| 
      
 465 
     | 
    
         
            +
                    image,
         
     | 
| 
      
 466 
     | 
    
         
            +
                    mask_image,
         
     | 
| 
      
 467 
     | 
    
         
            +
                    batch_size,
         
     | 
| 
      
 468 
     | 
    
         
            +
                    callback_steps,
         
     | 
| 
      
 469 
     | 
    
         
            +
                    negative_prompt=None,
         
     | 
| 
      
 470 
     | 
    
         
            +
                    prompt_embeds=None,
         
     | 
| 
      
 471 
     | 
    
         
            +
                    negative_prompt_embeds=None,
         
     | 
| 
      
 472 
     | 
    
         
            +
                ):
         
     | 
| 
      
 473 
     | 
    
         
            +
                    if (callback_steps is None) or (
         
     | 
| 
      
 474 
     | 
    
         
            +
                        callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
         
     | 
| 
      
 475 
     | 
    
         
            +
                    ):
         
     | 
| 
      
 476 
     | 
    
         
            +
                        raise ValueError(
         
     | 
| 
      
 477 
     | 
    
         
            +
                            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
         
     | 
| 
      
 478 
     | 
    
         
            +
                            f" {type(callback_steps)}."
         
     | 
| 
      
 479 
     | 
    
         
            +
                        )
         
     | 
| 
      
 480 
     | 
    
         
            +
             
     | 
| 
      
 481 
     | 
    
         
            +
                    if prompt is not None and prompt_embeds is not None:
         
     | 
| 
      
 482 
     | 
    
         
            +
                        raise ValueError(
         
     | 
| 
      
 483 
     | 
    
         
            +
                            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
         
     | 
| 
      
 484 
     | 
    
         
            +
                            " only forward one of the two."
         
     | 
| 
      
 485 
     | 
    
         
            +
                        )
         
     | 
| 
      
 486 
     | 
    
         
            +
                    elif prompt is None and prompt_embeds is None:
         
     | 
| 
      
 487 
     | 
    
         
            +
                        raise ValueError(
         
     | 
| 
      
 488 
     | 
    
         
            +
                            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
         
     | 
| 
      
 489 
     | 
    
         
            +
                        )
         
     | 
| 
      
 490 
     | 
    
         
            +
                    elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
         
     | 
| 
      
 491 
     | 
    
         
            +
                        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
         
     | 
| 
      
 492 
     | 
    
         
            +
             
     | 
| 
      
 493 
     | 
    
         
            +
                    if negative_prompt is not None and negative_prompt_embeds is not None:
         
     | 
| 
      
 494 
     | 
    
         
            +
                        raise ValueError(
         
     | 
| 
      
 495 
     | 
    
         
            +
                            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
         
     | 
| 
      
 496 
     | 
    
         
            +
                            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
         
     | 
| 
      
 497 
     | 
    
         
            +
                        )
         
     | 
| 
      
 498 
     | 
    
         
            +
             
     | 
| 
      
 499 
     | 
    
         
            +
                    if prompt_embeds is not None and negative_prompt_embeds is not None:
         
     | 
| 
      
 500 
     | 
    
         
            +
                        if prompt_embeds.shape != negative_prompt_embeds.shape:
         
     | 
| 
      
 501 
     | 
    
         
            +
                            raise ValueError(
         
     | 
| 
      
 502 
     | 
    
         
            +
                                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
         
     | 
| 
      
 503 
     | 
    
         
            +
                                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
         
     | 
| 
      
 504 
     | 
    
         
            +
                                f" {negative_prompt_embeds.shape}."
         
     | 
| 
      
 505 
     | 
    
         
            +
                            )
         
     | 
| 
      
 506 
     | 
    
         
            +
             
     | 
| 
      
 507 
     | 
    
         
            +
                    # image
         
     | 
| 
      
 508 
     | 
    
         
            +
             
     | 
| 
      
 509 
     | 
    
         
            +
                    if isinstance(image, list):
         
     | 
| 
      
 510 
     | 
    
         
            +
                        check_image_type = image[0]
         
     | 
| 
      
 511 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 512 
     | 
    
         
            +
                        check_image_type = image
         
     | 
| 
      
 513 
     | 
    
         
            +
             
     | 
| 
      
 514 
     | 
    
         
            +
                    if (
         
     | 
| 
      
 515 
     | 
    
         
            +
                        not isinstance(check_image_type, torch.Tensor)
         
     | 
| 
      
 516 
     | 
    
         
            +
                        and not isinstance(check_image_type, PIL.Image.Image)
         
     | 
| 
      
 517 
     | 
    
         
            +
                        and not isinstance(check_image_type, np.ndarray)
         
     | 
| 
      
 518 
     | 
    
         
            +
                    ):
         
     | 
| 
      
 519 
     | 
    
         
            +
                        raise ValueError(
         
     | 
| 
      
 520 
     | 
    
         
            +
                            "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
         
     | 
| 
      
 521 
     | 
    
         
            +
                            f" {type(check_image_type)}"
         
     | 
| 
      
 522 
     | 
    
         
            +
                        )
         
     | 
| 
      
 523 
     | 
    
         
            +
             
     | 
| 
      
 524 
     | 
    
         
            +
                    if isinstance(image, list):
         
     | 
| 
      
 525 
     | 
    
         
            +
                        image_batch_size = len(image)
         
     | 
| 
      
 526 
     | 
    
         
            +
                    elif isinstance(image, torch.Tensor):
         
     | 
| 
      
 527 
     | 
    
         
            +
                        image_batch_size = image.shape[0]
         
     | 
| 
      
 528 
     | 
    
         
            +
                    elif isinstance(image, PIL.Image.Image):
         
     | 
| 
      
 529 
     | 
    
         
            +
                        image_batch_size = 1
         
     | 
| 
      
 530 
     | 
    
         
            +
                    elif isinstance(image, np.ndarray):
         
     | 
| 
      
 531 
     | 
    
         
            +
                        image_batch_size = image.shape[0]
         
     | 
| 
      
 532 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 533 
     | 
    
         
            +
                        assert False
         
     | 
| 
      
 534 
     | 
    
         
            +
             
     | 
| 
      
 535 
     | 
    
         
            +
                    if batch_size != image_batch_size:
         
     | 
| 
      
 536 
     | 
    
         
            +
                        raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}")
         
     | 
| 
      
 537 
     | 
    
         
            +
             
     | 
| 
      
 538 
     | 
    
         
            +
                    # mask_image
         
     | 
| 
      
 539 
     | 
    
         
            +
             
     | 
| 
      
 540 
     | 
    
         
            +
                    if isinstance(mask_image, list):
         
     | 
| 
      
 541 
     | 
    
         
            +
                        check_image_type = mask_image[0]
         
     | 
| 
      
 542 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 543 
     | 
    
         
            +
                        check_image_type = mask_image
         
     | 
| 
      
 544 
     | 
    
         
            +
             
     | 
| 
      
 545 
     | 
    
         
            +
                    if (
         
     | 
| 
      
 546 
     | 
    
         
            +
                        not isinstance(check_image_type, torch.Tensor)
         
     | 
| 
      
 547 
     | 
    
         
            +
                        and not isinstance(check_image_type, PIL.Image.Image)
         
     | 
| 
      
 548 
     | 
    
         
            +
                        and not isinstance(check_image_type, np.ndarray)
         
     | 
| 
      
 549 
     | 
    
         
            +
                    ):
         
     | 
| 
      
 550 
     | 
    
         
            +
                        raise ValueError(
         
     | 
| 
      
 551 
     | 
    
         
            +
                            "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is"
         
     | 
| 
      
 552 
     | 
    
         
            +
                            f" {type(check_image_type)}"
         
     | 
| 
      
 553 
     | 
    
         
            +
                        )
         
     | 
| 
      
 554 
     | 
    
         
            +
             
     | 
| 
      
 555 
     | 
    
         
            +
                    if isinstance(mask_image, list):
         
     | 
| 
      
 556 
     | 
    
         
            +
                        image_batch_size = len(mask_image)
         
     | 
| 
      
 557 
     | 
    
         
            +
                    elif isinstance(mask_image, torch.Tensor):
         
     | 
| 
      
 558 
     | 
    
         
            +
                        image_batch_size = mask_image.shape[0]
         
     | 
| 
      
 559 
     | 
    
         
            +
                    elif isinstance(mask_image, PIL.Image.Image):
         
     | 
| 
      
 560 
     | 
    
         
            +
                        image_batch_size = 1
         
     | 
| 
      
 561 
     | 
    
         
            +
                    elif isinstance(mask_image, np.ndarray):
         
     | 
| 
      
 562 
     | 
    
         
            +
                        image_batch_size = mask_image.shape[0]
         
     | 
| 
      
 563 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 564 
     | 
    
         
            +
                        assert False
         
     | 
| 
      
 565 
     | 
    
         
            +
             
     | 
| 
      
 566 
     | 
    
         
            +
                    if image_batch_size != 1 and batch_size != image_batch_size:
         
     | 
| 
      
 567 
     | 
    
         
            +
                        raise ValueError(
         
     | 
| 
      
 568 
     | 
    
         
            +
                            f"mask_image batch size: {image_batch_size} must be `1` or the same as prompt batch size {batch_size}"
         
     | 
| 
      
 569 
     | 
    
         
            +
                        )
         
     | 
| 
      
 570 
     | 
    
         
            +
             
     | 
| 
      
 571 
     | 
    
         
            +
                # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
         
     | 
| 
      
 572 
     | 
    
         
            +
                def _text_preprocessing(self, text, clean_caption=False):
         
     | 
| 
      
 573 
     | 
    
         
            +
                    if clean_caption and not is_bs4_available():
         
     | 
| 
      
 574 
     | 
    
         
            +
                        logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
         
     | 
| 
      
 575 
     | 
    
         
            +
                        logger.warn("Setting `clean_caption` to False...")
         
     | 
| 
      
 576 
     | 
    
         
            +
                        clean_caption = False
         
     | 
| 
      
 577 
     | 
    
         
            +
             
     | 
| 
      
 578 
     | 
    
         
            +
                    if clean_caption and not is_ftfy_available():
         
     | 
| 
      
 579 
     | 
    
         
            +
                        logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
         
     | 
| 
      
 580 
     | 
    
         
            +
                        logger.warn("Setting `clean_caption` to False...")
         
     | 
| 
      
 581 
     | 
    
         
            +
                        clean_caption = False
         
     | 
| 
      
 582 
     | 
    
         
            +
             
     | 
| 
      
 583 
     | 
    
         
            +
                    if not isinstance(text, (tuple, list)):
         
     | 
| 
      
 584 
     | 
    
         
            +
                        text = [text]
         
     | 
| 
      
 585 
     | 
    
         
            +
             
     | 
| 
      
 586 
     | 
    
         
            +
                    def process(text: str):
         
     | 
| 
      
 587 
     | 
    
         
            +
                        if clean_caption:
         
     | 
| 
      
 588 
     | 
    
         
            +
                            text = self._clean_caption(text)
         
     | 
| 
      
 589 
     | 
    
         
            +
                            text = self._clean_caption(text)
         
     | 
| 
      
 590 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 591 
     | 
    
         
            +
                            text = text.lower().strip()
         
     | 
| 
      
 592 
     | 
    
         
            +
                        return text
         
     | 
| 
      
 593 
     | 
    
         
            +
             
     | 
| 
      
 594 
     | 
    
         
            +
                    return [process(t) for t in text]
         
     | 
| 
      
 595 
     | 
    
         
            +
             
     | 
| 
      
 596 
     | 
    
         
            +
                # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
         
     | 
| 
      
 597 
     | 
    
         
            +
                def _clean_caption(self, caption):
                    """Scrub a caption of web-scrape noise and return the cleaned string.

                    The caption is converted to ``str``, URL-unquoted, stripped and
                    lower-cased, then passed through a fixed sequence of substitutions
                    (URLs, HTML markup, @-handles, CJK ranges, dash/quote normalization,
                    HTML entity remnants, IP addresses, ids, filenames, marketing phrases,
                    dimension strings, stray punctuation). The order of the steps is
                    preserved from the original implementation because later patterns
                    operate on the output of earlier ones.

                    Args:
                        caption: Any object; it is converted with ``str(caption)``.

                    Returns:
                        The cleaned caption with surrounding whitespace removed.
                    """
                    caption = str(caption)
                    caption = ul.unquote_plus(caption)
                    caption = caption.strip().lower()
                    caption = re.sub("<person>", "person", caption)
                    # urls:
                    caption = re.sub(
                        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
                        "",
                        caption,
                    )  # regex for urls
                    caption = re.sub(
                        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
                        "",
                        caption,
                    )  # regex for urls
                    # html:
                    caption = BeautifulSoup(caption, features="html.parser").text

                    # @<nickname>
                    caption = re.sub(r"@[\w\d]+\b", "", caption)

                    # 31C0—31EF CJK Strokes
                    # 31F0—31FF Katakana Phonetic Extensions
                    # 3200—32FF Enclosed CJK Letters and Months
                    # 3300—33FF CJK Compatibility
                    # 3400—4DBF CJK Unified Ideographs Extension A
                    # 4DC0—4DFF Yijing Hexagram Symbols
                    # 4E00—9FFF CJK Unified Ideographs
                    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
                    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
                    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
                    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
                    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
                    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
                    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
                    #######################################################

                    # all types of dash --> "-"
                    caption = re.sub(
                        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
                        "-",
                        caption,
                    )

                    # normalize quotation marks to a single standard
                    caption = re.sub(r"[`´«»“”¨]", '"', caption)
                    caption = re.sub(r"[‘’]", "'", caption)

                    # Literal HTML entity remnants. The entity names are spelled out in
                    # full: an HTML-decoded pattern (a bare quote or a bare ampersand)
                    # is either a syntax error or strips every "&" from the caption.
                    # &quot;
                    caption = re.sub(r"&quot;?", "", caption)
                    # &amp
                    caption = re.sub(r"&amp", "", caption)

                    # ip addresses:
                    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

                    # article ids:
                    caption = re.sub(r"\d:\d\d\s+$", "", caption)

                    # \n (a literal backslash followed by "n", not a newline character)
                    caption = re.sub(r"\\n", " ", caption)

                    # "#123"
                    caption = re.sub(r"#\d{1,3}\b", "", caption)
                    # "#12345.."
                    caption = re.sub(r"#\d{5,}\b", "", caption)
                    # "123456.."
                    caption = re.sub(r"\b\d{6,}\b", "", caption)
                    # filenames:
                    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

                    # runs of quotes collapse to one double quote; runs of dots become a space
                    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
                    caption = re.sub(r"[\.]{2,}", r" ", caption)  # "..."

                    caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
                    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

                    # this-is-my-cute-cat / this_is_my_cute_cat
                    # Only when more than three separators occur are they replaced by spaces.
                    regex2 = re.compile(r"(?:\-|\_)")
                    if len(re.findall(regex2, caption)) > 3:
                        caption = re.sub(regex2, " ", caption)

                    caption = ftfy.fix_text(caption)
                    caption = html.unescape(html.unescape(caption))

                    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
                    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
                    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

                    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
                    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
                    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
                    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
                    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

                    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

                    # dimension strings such as "1920x1080" (latin x, cyrillic х, or ×)
                    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

                    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
                    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
                    caption = re.sub(r"\s+", " ", caption)

                    # `str.strip` returns a new string; the result must be assigned so the
                    # anchored (^...$) patterns below see the caption without edge whitespace.
                    caption = caption.strip()

                    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
                    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
                    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
                    caption = re.sub(r"^\.\S+$", "", caption)

                    return caption.strip()
         
     | 
| 
      
 710 
     | 
    
         
            +
             
     | 
| 
      
 711 
     | 
    
         
            +
                # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image
         
     | 
| 
      
 712 
     | 
    
         
            +
                def preprocess_image(self, image: PIL.Image.Image) -> torch.Tensor:
                    """Batch one image or a list of images, dispatching on the first element's type.

                    * PIL images are converted to RGB, passed through ``resize`` with
                      ``self.unet.sample_size``, cast to float32 and mapped with
                      ``x / 127.5 - 1`` before being stacked and moved to channels-first.
                    * NumPy arrays are concatenated (4-D elements) or stacked (otherwise)
                      along axis 0 and then moved to channels-first.
                    * Torch tensors are concatenated (4-D elements) or stacked along axis 0
                      with no further change.

                    If the first element is none of these types, the list itself is
                    returned unchanged.
                    """

                    def _to_channels_first_tensor(batch):
                        # A 3-D stack gets a trailing axis so the 4-axis transpose below applies.
                        if batch.ndim == 3:
                            batch = batch[..., None]
                        return torch.from_numpy(batch.transpose(0, 3, 1, 2))

                    items = image if isinstance(image, list) else [image]
                    head = items[0]

                    if isinstance(head, PIL.Image.Image):
                        scaled = [
                            np.array(resize(pil_img.convert("RGB"), self.unet.sample_size)).astype(np.float32) / 127.5
                            - 1
                            for pil_img in items
                        ]
                        return _to_channels_first_tensor(np.stack(scaled, axis=0))

                    if isinstance(head, np.ndarray):
                        if head.ndim == 4:
                            merged = np.concatenate(items, axis=0)
                        else:
                            merged = np.stack(items, axis=0)
                        return _to_channels_first_tensor(merged)

                    if isinstance(head, torch.Tensor):
                        if head.ndim == 4:
                            return torch.cat(items, axis=0)
                        return torch.stack(items, axis=0)

                    return items
         
     | 
| 
      
 747 
     | 
    
         
            +
             
     | 
| 
      
 748 
     | 
    
         
            +
                def preprocess_mask_image(self, mask_image) -> torch.Tensor:
                    """
                    Turn a mask (or a list of masks) into a single binarized tensor.

                    A non-list input is first wrapped in a one-element list; the type of the first
                    element then selects the conversion path:

                    - `torch.Tensor`: 4-d entries are concatenated along dim 0, anything else is
                      stacked along a new dim 0. A 2-d result gets two leading singleton dims, a 3-d
                      result gets a singleton dim at position 0 (when its first dim is 1) or at
                      position 1 (otherwise).
                    - `PIL.Image.Image`: each image is converted to mode "L", resized with `resize`
                      to `self.unet.sample_size`, given two leading singleton dims, concatenated and
                      scaled to float32 in `[0, 1]`.
                    - `np.ndarray`: each array is given two leading singleton dims and concatenated.

                    In every path values `< 0.5` are set to 0 and values `>= 0.5` are set to 1.

                    Returns:
                        The binarized mask tensor. If the first element is of none of the types above,
                        the (list-wrapped) input is returned unchanged.
                    """
                    masks = mask_image if isinstance(mask_image, list) else [mask_image]
                    first = masks[0]

                    if isinstance(first, torch.Tensor):
                        # Already-batched (4-d) entries are joined; everything else gains a new batch dim.
                        if first.ndim == 4:
                            batch = torch.cat(masks, dim=0)
                        else:
                            batch = torch.stack(masks, dim=0)

                        if batch.ndim == 2:
                            # Batch and add channel dim for single mask
                            batch = batch.unsqueeze(0).unsqueeze(0)
                        elif batch.ndim == 3:
                            if batch.shape[0] == 1:
                                # Single mask: dim 0 is treated as the existing batch size of 1
                                batch = batch.unsqueeze(0)
                            else:
                                # Batch of masks: dim 0 is treated as the batching dimension
                                batch = batch.unsqueeze(1)

                        # `batch` is freshly allocated by cat/stack, so thresholding in place is safe.
                        batch[batch < 0.5] = 0
                        batch[batch >= 0.5] = 1
                        return batch

                    if isinstance(first, PIL.Image.Image):
                        arrays = [
                            np.array(resize(pil_mask.convert("L"), self.unet.sample_size))[None, None, :]
                            for pil_mask in masks
                        ]
                        batch = np.concatenate(arrays, axis=0).astype(np.float32) / 255.0
                        batch[batch < 0.5] = 0
                        batch[batch >= 0.5] = 1
                        return torch.from_numpy(batch)

                    if isinstance(first, np.ndarray):
                        batch = np.concatenate([arr[None, None, :] for arr in masks], axis=0)
                        batch[batch < 0.5] = 0
                        batch[batch >= 0.5] = 1
                        return torch.from_numpy(batch)

                    # Unrecognised element type: hand the list back untouched.
                    return masks
         
     | 
| 
      
 796 
     | 
    
         
            +
             
     | 
| 
      
 797 
     | 
    
         
            +
                # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps
         
     | 
| 
      
 798 
     | 
    
         
            +
                def get_timesteps(self, num_inference_steps, strength):
                    """
                    Select the trailing part of the scheduler's timesteps according to `strength`.

                    Args:
                        num_inference_steps: length of the full schedule being considered.
                        strength: multiplier applied to `num_inference_steps` (truncated to an int and
                            capped at `num_inference_steps`) to get the number of steps to keep.

                    Returns:
                        A tuple `(timesteps, steps)` where `timesteps` is `self.scheduler.timesteps`
                        sliced from the computed start index, and `steps` is `num_inference_steps`
                        minus that start index.
                    """
                    # How many steps of the schedule are kept, never more than the schedule itself.
                    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)

                    # Index of the first kept timestep; clamped so it never goes negative.
                    first_index = max(num_inference_steps - steps_to_run, 0)

                    return self.scheduler.timesteps[first_index:], num_inference_steps - first_index
         
     | 
| 
      
 806 
     | 
    
         
            +
             
     | 
| 
      
 807 
     | 
    
         
            +
                def prepare_intermediate_images(
                    self, image, timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator=None
                ):
                    """
                    Build the starting images for denoising by noising only the masked region.

                    Noise is drawn with `randn_tensor` for a shape of
                    `(batch_size * num_images_per_prompt, channels, height, width)`, where the last three
                    values come from `image.shape`. `image` is repeated `num_images_per_prompt` times along
                    dim 0, noised through `self.scheduler.add_noise` at `timestep`, and blended with the
                    un-noised image as `(1 - mask_image) * image + mask_image * noised_image`.

                    Raises:
                        ValueError: if `generator` is a list whose length differs from
                            `batch_size * num_images_per_prompt`.
                    """
                    # `image.shape` must unpack into exactly four values; the leading one is not used.
                    _, channels, height, width = image.shape

                    batch_size = batch_size * num_images_per_prompt

                    if isinstance(generator, list):
                        if len(generator) != batch_size:
                            raise ValueError(
                                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                            )

                    noise = randn_tensor(
                        (batch_size, channels, height, width), generator=generator, device=device, dtype=dtype
                    )

                    repeated = image.repeat_interleave(num_images_per_prompt, dim=0)
                    noised = self.scheduler.add_noise(repeated, noise, timestep)

                    # Keep the original pixels where the mask is 0 and the noised ones where it is 1.
                    return (1 - mask_image) * repeated + mask_image * noised
         
     | 
| 
      
 830 
     | 
    
         
            +
             
     | 
| 
      
 831 
     | 
    
         
            +
                @torch.no_grad()
         
     | 
| 
      
 832 
     | 
    
         
            +
                @replace_example_docstring(EXAMPLE_DOC_STRING)
         
     | 
| 
      
 833 
     | 
    
         
            +
                def __call__(
         
     | 
| 
      
 834 
     | 
    
         
            +
                    self,
         
     | 
| 
      
 835 
     | 
    
         
            +
                    prompt: Union[str, List[str]] = None,
         
     | 
| 
      
 836 
     | 
    
         
            +
                    image: Union[
         
     | 
| 
      
 837 
     | 
    
         
            +
                        PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]
         
     | 
| 
      
 838 
     | 
    
         
            +
                    ] = None,
         
     | 
| 
      
 839 
     | 
    
         
            +
                    mask_image: Union[
         
     | 
| 
      
 840 
     | 
    
         
            +
                        PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]
         
     | 
| 
      
 841 
     | 
    
         
            +
                    ] = None,
         
     | 
| 
      
 842 
     | 
    
         
            +
                    strength: float = 1.0,
         
     | 
| 
      
 843 
     | 
    
         
            +
                    num_inference_steps: int = 50,
         
     | 
| 
      
 844 
     | 
    
         
            +
                    timesteps: List[int] = None,
         
     | 
| 
      
 845 
     | 
    
         
            +
                    guidance_scale: float = 7.0,
         
     | 
| 
      
 846 
     | 
    
         
            +
                    negative_prompt: Optional[Union[str, List[str]]] = None,
         
     | 
| 
      
 847 
     | 
    
         
            +
                    num_images_per_prompt: Optional[int] = 1,
         
     | 
| 
      
 848 
     | 
    
         
            +
                    eta: float = 0.0,
         
     | 
| 
      
 849 
     | 
    
         
            +
                    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         
     | 
| 
      
 850 
     | 
    
         
            +
                    prompt_embeds: Optional[torch.FloatTensor] = None,
         
     | 
| 
      
 851 
     | 
    
         
            +
                    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         
     | 
| 
      
 852 
     | 
    
         
            +
                    output_type: Optional[str] = "pil",
         
     | 
| 
      
 853 
     | 
    
         
            +
                    return_dict: bool = True,
         
     | 
| 
      
 854 
     | 
    
         
            +
                    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         
     | 
| 
      
 855 
     | 
    
         
            +
                    callback_steps: int = 1,
         
     | 
| 
      
 856 
     | 
    
         
            +
                    clean_caption: bool = True,
         
     | 
| 
      
 857 
     | 
    
         
            +
                    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         
     | 
| 
      
 858 
     | 
    
         
            +
                ):
         
     | 
| 
      
 859 
     | 
    
         
            +
                    """
         
     | 
| 
      
 860 
     | 
    
         
            +
                    Function invoked when calling the pipeline for generation.
         
     | 
| 
      
 861 
     | 
    
         
            +
             
     | 
| 
      
 862 
     | 
    
         
            +
                    Args:
         
     | 
| 
      
 863 
     | 
    
         
            +
                        prompt (`str` or `List[str]`, *optional*):
         
     | 
| 
      
 864 
     | 
    
         
            +
                            The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
         
     | 
| 
      
 865 
     | 
    
         
            +
                            instead.
         
     | 
| 
      
 866 
     | 
    
         
            +
                        image (`torch.FloatTensor` or `PIL.Image.Image`):
         
     | 
| 
      
 867 
     | 
    
         
            +
                            `Image`, or tensor representing an image batch, that will be used as the starting point for the
         
     | 
| 
      
 868 
     | 
    
         
            +
                            process.
         
     | 
| 
      
 869 
     | 
    
         
            +
                        mask_image (`PIL.Image.Image`):
         
     | 
| 
      
 870 
     | 
    
         
            +
                            `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
         
     | 
| 
      
 871 
     | 
    
         
            +
                            repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
         
     | 
| 
      
 872 
     | 
    
         
            +
                            to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
         
     | 
| 
      
 873 
     | 
    
         
            +
                            instead of 3, so the expected shape would be `(B, H, W, 1)`.
         
     | 
| 
      
 874 
     | 
    
         
            +
                        strength (`float`, *optional*, defaults to 0.8):
         
     | 
| 
      
 875 
     | 
    
         
            +
                            Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
         
     | 
| 
      
 876 
     | 
    
         
            +
                            will be used as a starting point, adding more noise to it the larger the `strength`. The number of
         
     | 
| 
      
 877 
     | 
    
         
            +
                            denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
         
     | 
| 
      
 878 
     | 
    
         
            +
                            be maximum and the denoising process will run for the full number of iterations specified in
         
     | 
| 
      
 879 
     | 
    
         
            +
                            `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
         
     | 
| 
      
 880 
     | 
    
         
            +
                        num_inference_steps (`int`, *optional*, defaults to 50):
         
     | 
| 
      
 881 
     | 
    
         
            +
                            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
         
     | 
| 
      
 882 
     | 
    
         
            +
                            expense of slower inference.
         
     | 
| 
      
 883 
     | 
    
         
            +
                        timesteps (`List[int]`, *optional*):
         
     | 
| 
      
 884 
     | 
    
         
            +
                            Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
         
     | 
| 
      
 885 
     | 
    
         
            +
                            timesteps are used. Must be in descending order.
         
     | 
| 
      
 886 
     | 
    
         
            +
                        guidance_scale (`float`, *optional*, defaults to 7.5):
         
     | 
| 
      
 887 
     | 
    
         
            +
                            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
         
     | 
| 
      
 888 
     | 
    
         
            +
                            `guidance_scale` is defined as `w` of equation 2. of [Imagen
         
     | 
| 
      
 889 
     | 
    
         
            +
                            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
         
     | 
| 
      
 890 
     | 
    
         
            +
                            1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
         
     | 
| 
      
 891 
     | 
    
         
            +
                            usually at the expense of lower image quality.
         
     | 
| 
      
 892 
     | 
    
         
            +
                        negative_prompt (`str` or `List[str]`, *optional*):
         
     | 
| 
      
 893 
     | 
    
         
            +
                            The prompt or prompts not to guide the image generation. If not defined, one has to pass
         
     | 
| 
      
 894 
     | 
    
         
            +
                            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
         
     | 
| 
      
 895 
     | 
    
         
            +
                            less than `1`).
         
     | 
| 
      
 896 
     | 
    
         
            +
                        num_images_per_prompt (`int`, *optional*, defaults to 1):
         
     | 
| 
      
 897 
     | 
    
         
            +
                            The number of images to generate per prompt.
         
     | 
| 
      
 898 
     | 
    
         
            +
                        eta (`float`, *optional*, defaults to 0.0):
         
     | 
| 
      
 899 
     | 
    
         
            +
                            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
         
     | 
| 
      
 900 
     | 
    
         
            +
                            [`schedulers.DDIMScheduler`], will be ignored for others.
         
     | 
| 
      
 901 
     | 
    
         
            +
                        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
         
     | 
| 
      
 902 
     | 
    
         
            +
                            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
         
     | 
| 
      
 903 
     | 
    
         
            +
                            to make generation deterministic.
         
     | 
| 
      
 904 
     | 
    
         
            +
                        prompt_embeds (`torch.FloatTensor`, *optional*):
         
     | 
| 
      
 905 
     | 
    
         
            +
                            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
         
     | 
| 
      
 906 
     | 
    
         
            +
                            provided, text embeddings will be generated from `prompt` input argument.
         
     | 
| 
      
 907 
     | 
    
         
            +
                        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
         
     | 
| 
      
 908 
     | 
    
         
            +
                            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
         
     | 
| 
      
 909 
     | 
    
         
            +
                            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
         
     | 
| 
      
 910 
     | 
    
         
            +
                            argument.
         
     | 
| 
      
 911 
     | 
    
         
            +
                        output_type (`str`, *optional*, defaults to `"pil"`):
         
     | 
| 
      
 912 
     | 
    
         
            +
                            The output format of the generate image. Choose between
         
     | 
| 
      
 913 
     | 
    
         
            +
                            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
         
     | 
| 
      
 914 
     | 
    
         
            +
                        return_dict (`bool`, *optional*, defaults to `True`):
         
     | 
| 
      
 915 
     | 
    
         
            +
                            Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
         
     | 
| 
      
 916 
     | 
    
         
            +
                        callback (`Callable`, *optional*):
         
     | 
| 
      
 917 
     | 
    
         
            +
                            A function that will be called every `callback_steps` steps during inference. The function will be
         
     | 
| 
      
 918 
     | 
    
         
            +
                            called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
         
     | 
| 
      
 919 
     | 
    
         
            +
                        callback_steps (`int`, *optional*, defaults to 1):
         
     | 
| 
      
 920 
     | 
    
         
            +
                            The frequency at which the `callback` function will be called. If not specified, the callback will be
         
     | 
| 
      
 921 
     | 
    
         
            +
                            called at every step.
         
     | 
| 
      
 922 
     | 
    
         
            +
                        clean_caption (`bool`, *optional*, defaults to `True`):
         
     | 
| 
      
 923 
     | 
    
         
            +
                            Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
         
     | 
| 
      
 924 
     | 
    
         
            +
                            be installed. If the dependencies are not installed, the embeddings will be created from the raw
         
     | 
| 
      
 925 
     | 
    
         
            +
                            prompt.
         
     | 
| 
      
 926 
     | 
    
         
            +
                        cross_attention_kwargs (`dict`, *optional*):
         
     | 
| 
      
 927 
     | 
    
         
            +
                            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
         
     | 
| 
      
 928 
     | 
    
         
            +
                            `self.processor` in
         
     | 
| 
      
 929 
     | 
    
         
            +
                            [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
         
     | 
| 
      
 930 
     | 
    
         
            +
             
     | 
| 
      
 931 
     | 
    
         
            +
                    Examples:
         
     | 
| 
      
 932 
     | 
    
         
            +
             
     | 
| 
      
 933 
     | 
    
         
            +
                    Returns:
         
     | 
| 
      
 934 
     | 
    
         
            +
                        [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`:
         
     | 
| 
      
 935 
     | 
    
         
            +
                        [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
         
     | 
| 
      
 936 
     | 
    
         
            +
                        returning a tuple, the first element is a list with the generated images, and the second element is a list
         
     | 
| 
      
 937 
     | 
    
         
            +
                        of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
         
     | 
| 
      
 938 
     | 
    
         
            +
                        or watermarked content, according to the `safety_checker`.
         
     | 
| 
      
 939 
     | 
    
         
            +
                    """
         
     | 
| 
      
 940 
     | 
    
         
            +
                    # 1. Check inputs. Raise error if not correct
         
     | 
| 
      
 941 
     | 
    
         
            +
                    if prompt is not None and isinstance(prompt, str):
         
     | 
| 
      
 942 
     | 
    
         
            +
                        batch_size = 1
         
     | 
| 
      
 943 
     | 
    
         
            +
                    elif prompt is not None and isinstance(prompt, list):
         
     | 
| 
      
 944 
     | 
    
         
            +
                        batch_size = len(prompt)
         
     | 
| 
      
 945 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 946 
     | 
    
         
            +
                        batch_size = prompt_embeds.shape[0]
         
     | 
| 
      
 947 
     | 
    
         
            +
             
     | 
| 
      
 948 
     | 
    
         
            +
                    self.check_inputs(
         
     | 
| 
      
 949 
     | 
    
         
            +
                        prompt,
         
     | 
| 
      
 950 
     | 
    
         
            +
                        image,
         
     | 
| 
      
 951 
     | 
    
         
            +
                        mask_image,
         
     | 
| 
      
 952 
     | 
    
         
            +
                        batch_size,
         
     | 
| 
      
 953 
     | 
    
         
            +
                        callback_steps,
         
     | 
| 
      
 954 
     | 
    
         
            +
                        negative_prompt,
         
     | 
| 
      
 955 
     | 
    
         
            +
                        prompt_embeds,
         
     | 
| 
      
 956 
     | 
    
         
            +
                        negative_prompt_embeds,
         
     | 
| 
      
 957 
     | 
    
         
            +
                    )
         
     | 
| 
      
 958 
     | 
    
         
            +
             
     | 
| 
      
 959 
     | 
    
         
            +
                    # 2. Define call parameters
         
     | 
| 
      
 960 
     | 
    
         
            +
                    device = self._execution_device
         
     | 
| 
      
 961 
     | 
    
         
            +
             
     | 
| 
      
 962 
     | 
    
         
            +
                    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         
     | 
| 
      
 963 
     | 
    
         
            +
                    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         
     | 
| 
      
 964 
     | 
    
         
            +
                    # corresponds to doing no classifier free guidance.
         
     | 
| 
      
 965 
     | 
    
         
            +
                    do_classifier_free_guidance = guidance_scale > 1.0
         
     | 
| 
      
 966 
     | 
    
         
            +
             
     | 
| 
      
 967 
     | 
    
         
            +
                    # 3. Encode input prompt
         
     | 
| 
      
 968 
     | 
    
         
            +
                    prompt_embeds, negative_prompt_embeds = self.encode_prompt(
         
     | 
| 
      
 969 
     | 
    
         
            +
                        prompt,
         
     | 
| 
      
 970 
     | 
    
         
            +
                        do_classifier_free_guidance,
         
     | 
| 
      
 971 
     | 
    
         
            +
                        num_images_per_prompt=num_images_per_prompt,
         
     | 
| 
      
 972 
     | 
    
         
            +
                        device=device,
         
     | 
| 
      
 973 
     | 
    
         
            +
                        negative_prompt=negative_prompt,
         
     | 
| 
      
 974 
     | 
    
         
            +
                        prompt_embeds=prompt_embeds,
         
     | 
| 
      
 975 
     | 
    
         
            +
                        negative_prompt_embeds=negative_prompt_embeds,
         
     | 
| 
      
 976 
     | 
    
         
            +
                        clean_caption=clean_caption,
         
     | 
| 
      
 977 
     | 
    
         
            +
                    )
         
     | 
| 
      
 978 
     | 
    
         
            +
             
     | 
| 
      
 979 
     | 
    
         
            +
                    if do_classifier_free_guidance:
         
     | 
| 
      
 980 
     | 
    
         
            +
                        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
         
     | 
| 
      
 981 
     | 
    
         
            +
             
     | 
| 
      
 982 
     | 
    
         
            +
                    dtype = prompt_embeds.dtype
         
     | 
| 
      
 983 
     | 
    
         
            +
             
     | 
| 
      
 984 
     | 
    
         
            +
                    # 4. Prepare timesteps
         
     | 
| 
      
 985 
     | 
    
         
            +
                    if timesteps is not None:
         
     | 
| 
      
 986 
     | 
    
         
            +
                        self.scheduler.set_timesteps(timesteps=timesteps, device=device)
         
     | 
| 
      
 987 
     | 
    
         
            +
                        timesteps = self.scheduler.timesteps
         
     | 
| 
      
 988 
     | 
    
         
            +
                        num_inference_steps = len(timesteps)
         
     | 
| 
      
 989 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 990 
     | 
    
         
            +
                        self.scheduler.set_timesteps(num_inference_steps, device=device)
         
     | 
| 
      
 991 
     | 
    
         
            +
                        timesteps = self.scheduler.timesteps
         
     | 
| 
      
 992 
     | 
    
         
            +
             
     | 
| 
      
 993 
     | 
    
         
            +
                    timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
         
     | 
| 
      
 994 
     | 
    
         
            +
             
     | 
| 
      
 995 
     | 
    
         
            +
                    # 5. Prepare intermediate images
         
     | 
| 
      
 996 
     | 
    
         
            +
                    image = self.preprocess_image(image)
         
     | 
| 
      
 997 
     | 
    
         
            +
                    image = image.to(device=device, dtype=dtype)
         
     | 
| 
      
 998 
     | 
    
         
            +
             
     | 
| 
      
 999 
     | 
    
         
            +
                    mask_image = self.preprocess_mask_image(mask_image)
         
     | 
| 
      
 1000 
     | 
    
         
            +
                    mask_image = mask_image.to(device=device, dtype=dtype)
         
     | 
| 
      
 1001 
     | 
    
         
            +
             
     | 
| 
      
 1002 
     | 
    
         
            +
                    if mask_image.shape[0] == 1:
         
     | 
| 
      
 1003 
     | 
    
         
            +
                        mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, dim=0)
         
     | 
| 
      
 1004 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 1005 
     | 
    
         
            +
                        mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0)
         
     | 
| 
      
 1006 
     | 
    
         
            +
             
     | 
| 
      
 1007 
     | 
    
         
            +
                    noise_timestep = timesteps[0:1]
         
     | 
| 
      
 1008 
     | 
    
         
            +
                    noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt)
         
     | 
| 
      
 1009 
     | 
    
         
            +
             
     | 
| 
      
 1010 
     | 
    
         
            +
                    intermediate_images = self.prepare_intermediate_images(
         
     | 
| 
      
 1011 
     | 
    
         
            +
                        image, noise_timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator
         
     | 
| 
      
 1012 
     | 
    
         
            +
                    )
         
     | 
| 
      
 1013 
     | 
    
         
            +
             
     | 
| 
      
 1014 
     | 
    
         
            +
                    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         
     | 
| 
      
 1015 
     | 
    
         
            +
                    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
         
     | 
| 
      
 1016 
     | 
    
         
            +
             
     | 
| 
      
 1017 
     | 
    
         
            +
                    # HACK: see comment in `enable_model_cpu_offload`
         
     | 
| 
      
 1018 
     | 
    
         
            +
                    if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None:
         
     | 
| 
      
 1019 
     | 
    
         
            +
                        self.text_encoder_offload_hook.offload()
         
     | 
| 
      
 1020 
     | 
    
         
            +
             
     | 
| 
      
 1021 
     | 
    
         
            +
                    # 7. Denoising loop
         
     | 
| 
      
 1022 
     | 
    
         
            +
                    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         
     | 
| 
      
 1023 
     | 
    
         
            +
                    with self.progress_bar(total=num_inference_steps) as progress_bar:
         
     | 
| 
      
 1024 
     | 
    
         
            +
                        for i, t in enumerate(timesteps):
         
     | 
| 
      
 1025 
     | 
    
         
            +
                            model_input = (
         
     | 
| 
      
 1026 
     | 
    
         
            +
                                torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images
         
     | 
| 
      
 1027 
     | 
    
         
            +
                            )
         
     | 
| 
      
 1028 
     | 
    
         
            +
                            model_input = self.scheduler.scale_model_input(model_input, t)
         
     | 
| 
      
 1029 
     | 
    
         
            +
             
     | 
| 
      
 1030 
     | 
    
         
            +
                            # predict the noise residual
         
     | 
| 
      
 1031 
     | 
    
         
            +
                            noise_pred = self.unet(
         
     | 
| 
      
 1032 
     | 
    
         
            +
                                model_input,
         
     | 
| 
      
 1033 
     | 
    
         
            +
                                t,
         
     | 
| 
      
 1034 
     | 
    
         
            +
                                encoder_hidden_states=prompt_embeds,
         
     | 
| 
      
 1035 
     | 
    
         
            +
                                cross_attention_kwargs=cross_attention_kwargs,
         
     | 
| 
      
 1036 
     | 
    
         
            +
                            ).sample
         
     | 
| 
      
 1037 
     | 
    
         
            +
             
     | 
| 
      
 1038 
     | 
    
         
            +
                            # perform guidance
         
     | 
| 
      
 1039 
     | 
    
         
            +
                            if do_classifier_free_guidance:
         
     | 
| 
      
 1040 
     | 
    
         
            +
                                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         
     | 
| 
      
 1041 
     | 
    
         
            +
                                noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1)
         
     | 
| 
      
 1042 
     | 
    
         
            +
                                noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1)
         
     | 
| 
      
 1043 
     | 
    
         
            +
                                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
         
     | 
| 
      
 1044 
     | 
    
         
            +
                                noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
         
     | 
| 
      
 1045 
     | 
    
         
            +
             
     | 
| 
      
 1046 
     | 
    
         
            +
                            # compute the previous noisy sample x_t -> x_t-1
         
     | 
| 
      
 1047 
     | 
    
         
            +
                            prev_intermediate_images = intermediate_images
         
     | 
| 
      
 1048 
     | 
    
         
            +
             
     | 
| 
      
 1049 
     | 
    
         
            +
                            intermediate_images = self.scheduler.step(
         
     | 
| 
      
 1050 
     | 
    
         
            +
                                noise_pred, t, intermediate_images, **extra_step_kwargs
         
     | 
| 
      
 1051 
     | 
    
         
            +
                            ).prev_sample
         
     | 
| 
      
 1052 
     | 
    
         
            +
             
     | 
| 
      
 1053 
     | 
    
         
            +
                            intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images
         
     | 
| 
      
 1054 
     | 
    
         
            +
             
     | 
| 
      
 1055 
     | 
    
         
            +
                            # call the callback, if provided
         
     | 
| 
      
 1056 
     | 
    
         
            +
                            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
         
     | 
| 
      
 1057 
     | 
    
         
            +
                                progress_bar.update()
         
     | 
| 
      
 1058 
     | 
    
         
            +
                                if callback is not None and i % callback_steps == 0:
         
     | 
| 
      
 1059 
     | 
    
         
            +
                                    callback(i, t, intermediate_images)
         
     | 
| 
      
 1060 
     | 
    
         
            +
             
     | 
| 
      
 1061 
     | 
    
         
            +
                    image = intermediate_images
         
     | 
| 
      
 1062 
     | 
    
         
            +
             
     | 
| 
      
 1063 
     | 
    
         
            +
                    if output_type == "pil":
         
     | 
| 
      
 1064 
     | 
    
         
            +
                        # 8. Post-processing
         
     | 
| 
      
 1065 
     | 
    
         
            +
                        image = (image / 2 + 0.5).clamp(0, 1)
         
     | 
| 
      
 1066 
     | 
    
         
            +
                        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         
     | 
| 
      
 1067 
     | 
    
         
            +
             
     | 
| 
      
 1068 
     | 
    
         
            +
                        # 9. Run safety checker
         
     | 
| 
      
 1069 
     | 
    
         
            +
                        image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
         
     | 
| 
      
 1070 
     | 
    
         
            +
             
     | 
| 
      
 1071 
     | 
    
         
            +
                        # 10. Convert to PIL
         
     | 
| 
      
 1072 
     | 
    
         
            +
                        image = self.numpy_to_pil(image)
         
     | 
| 
      
 1073 
     | 
    
         
            +
             
     | 
| 
      
 1074 
     | 
    
         
            +
                        # 11. Apply watermark
         
     | 
| 
      
 1075 
     | 
    
         
            +
                        if self.watermarker is not None:
         
     | 
| 
      
 1076 
     | 
    
         
            +
                            self.watermarker.apply_watermark(image, self.unet.config.sample_size)
         
     | 
| 
      
 1077 
     | 
    
         
            +
                    elif output_type == "pt":
         
     | 
| 
      
 1078 
     | 
    
         
            +
                        nsfw_detected = None
         
     | 
| 
      
 1079 
     | 
    
         
            +
                        watermark_detected = None
         
     | 
| 
      
 1080 
     | 
    
         
            +
             
     | 
| 
      
 1081 
     | 
    
         
            +
                        if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None:
         
     | 
| 
      
 1082 
     | 
    
         
            +
                            self.unet_offload_hook.offload()
         
     | 
| 
      
 1083 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 1084 
     | 
    
         
            +
                        # 8. Post-processing
         
     | 
| 
      
 1085 
     | 
    
         
            +
                        image = (image / 2 + 0.5).clamp(0, 1)
         
     | 
| 
      
 1086 
     | 
    
         
            +
                        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         
     | 
| 
      
 1087 
     | 
    
         
            +
             
     | 
| 
      
 1088 
     | 
    
         
            +
                        # 9. Run safety checker
         
     | 
| 
      
 1089 
     | 
    
         
            +
                        image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
         
     | 
| 
      
 1090 
     | 
    
         
            +
             
     | 
| 
      
 1091 
     | 
    
         
            +
                    # Offload last model to CPU
         
     | 
| 
      
 1092 
     | 
    
         
            +
                    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
         
     | 
| 
      
 1093 
     | 
    
         
            +
                        self.final_offload_hook.offload()
         
     | 
| 
      
 1094 
     | 
    
         
            +
             
     | 
| 
      
 1095 
     | 
    
         
            +
                    if not return_dict:
         
     | 
| 
      
 1096 
     | 
    
         
            +
                        return (image, nsfw_detected, watermark_detected)
         
     | 
| 
      
 1097 
     | 
    
         
            +
             
     | 
| 
      
 1098 
     | 
    
         
            +
                    return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected)
         
     |