diffusers 0.19.3__py3-none-any.whl → 0.20.1__py3-none-any.whl

Files changed (114)
  1. diffusers/__init__.py +3 -1
  2. diffusers/commands/fp16_safetensors.py +2 -7
  3. diffusers/configuration_utils.py +23 -1
  4. diffusers/dependency_versions_table.py +1 -1
  5. diffusers/loaders.py +62 -64
  6. diffusers/models/__init__.py +1 -0
  7. diffusers/models/activations.py +2 -0
  8. diffusers/models/attention.py +45 -1
  9. diffusers/models/autoencoder_tiny.py +193 -0
  10. diffusers/models/controlnet.py +1 -1
  11. diffusers/models/embeddings.py +56 -0
  12. diffusers/models/lora.py +0 -6
  13. diffusers/models/modeling_flax_utils.py +28 -2
  14. diffusers/models/modeling_utils.py +33 -16
  15. diffusers/models/transformer_2d.py +26 -9
  16. diffusers/models/unet_1d.py +2 -2
  17. diffusers/models/unet_2d_blocks.py +106 -56
  18. diffusers/models/unet_2d_condition.py +20 -5
  19. diffusers/models/vae.py +106 -1
  20. diffusers/pipelines/__init__.py +1 -0
  21. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +10 -3
  22. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +10 -3
  23. diffusers/pipelines/audioldm/pipeline_audioldm.py +1 -1
  24. diffusers/pipelines/auto_pipeline.py +33 -43
  25. diffusers/pipelines/controlnet/multicontrolnet.py +4 -2
  26. diffusers/pipelines/controlnet/pipeline_controlnet.py +20 -4
  27. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +15 -7
  28. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +14 -4
  29. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +157 -10
  30. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -10
  31. diffusers/pipelines/deepfloyd_if/pipeline_if.py +1 -1
  32. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +1 -1
  33. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +1 -1
  34. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +1 -1
  35. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +1 -1
  36. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +1 -1
  37. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +43 -2
  38. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +44 -2
  39. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +1 -1
  40. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
  41. diffusers/pipelines/pipeline_flax_utils.py +41 -4
  42. diffusers/pipelines/pipeline_utils.py +60 -16
  43. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +2 -2
  44. diffusers/pipelines/stable_diffusion/__init__.py +1 -0
  45. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +81 -37
  46. diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +10 -3
  47. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +10 -3
  48. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +10 -3
  49. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +10 -3
  50. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +12 -5
  51. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py +832 -0
  52. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +10 -3
  53. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +10 -3
  54. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +10 -3
  55. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +9 -2
  56. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +17 -8
  57. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +10 -3
  58. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +10 -3
  59. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +10 -3
  60. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +10 -3
  61. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +10 -3
  62. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +10 -3
  63. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +10 -3
  64. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +10 -3
  65. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +3 -5
  66. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +75 -3
  67. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +76 -6
  68. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +1 -2
  69. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +10 -3
  70. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +10 -3
  71. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +11 -4
  72. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +1 -1
  73. diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +131 -28
  74. diffusers/schedulers/scheduling_consistency_models.py +70 -57
  75. diffusers/schedulers/scheduling_ddim.py +76 -71
  76. diffusers/schedulers/scheduling_ddim_inverse.py +76 -44
  77. diffusers/schedulers/scheduling_ddim_parallel.py +11 -8
  78. diffusers/schedulers/scheduling_ddpm.py +68 -67
  79. diffusers/schedulers/scheduling_ddpm_parallel.py +18 -15
  80. diffusers/schedulers/scheduling_deis_multistep.py +93 -85
  81. diffusers/schedulers/scheduling_dpmsolver_multistep.py +118 -120
  82. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +116 -109
  83. diffusers/schedulers/scheduling_dpmsolver_sde.py +57 -43
  84. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +122 -121
  85. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +54 -44
  86. diffusers/schedulers/scheduling_euler_discrete.py +63 -56
  87. diffusers/schedulers/scheduling_heun_discrete.py +57 -45
  88. diffusers/schedulers/scheduling_ipndm.py +27 -22
  89. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +54 -41
  90. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +52 -41
  91. diffusers/schedulers/scheduling_karras_ve.py +55 -45
  92. diffusers/schedulers/scheduling_lms_discrete.py +58 -52
  93. diffusers/schedulers/scheduling_pndm.py +77 -62
  94. diffusers/schedulers/scheduling_repaint.py +56 -38
  95. diffusers/schedulers/scheduling_sde_ve.py +62 -50
  96. diffusers/schedulers/scheduling_sde_vp.py +32 -11
  97. diffusers/schedulers/scheduling_unclip.py +3 -3
  98. diffusers/schedulers/scheduling_unipc_multistep.py +131 -91
  99. diffusers/schedulers/scheduling_utils.py +41 -35
  100. diffusers/schedulers/scheduling_utils_flax.py +8 -2
  101. diffusers/schedulers/scheduling_vq_diffusion.py +39 -68
  102. diffusers/utils/__init__.py +2 -2
  103. diffusers/utils/dummy_pt_objects.py +15 -0
  104. diffusers/utils/dummy_torch_and_transformers_objects.py +15 -0
  105. diffusers/utils/hub_utils.py +105 -2
  106. diffusers/utils/import_utils.py +0 -4
  107. diffusers/utils/pil_utils.py +19 -0
  108. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/METADATA +5 -7
  109. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/RECORD +113 -112
  110. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/WHEEL +1 -1
  111. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/entry_points.txt +0 -1
  112. diffusers/models/cross_attention.py +0 -94
  113. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/LICENSE +0 -0
  114. {diffusers-0.19.3.dist-info → diffusers-0.20.1.dist-info}/top_level.txt +0 -0
diffusers/pipelines/pipeline_flax_utils.py

@@ -23,14 +23,22 @@ import flax
  import numpy as np
  import PIL
  from flax.core.frozen_dict import FrozenDict
- from huggingface_hub import snapshot_download
+ from huggingface_hub import create_repo, snapshot_download
  from PIL import Image
  from tqdm.auto import tqdm

  from ..configuration_utils import ConfigMixin
  from ..models.modeling_flax_utils import FLAX_WEIGHTS_NAME, FlaxModelMixin
  from ..schedulers.scheduling_utils_flax import SCHEDULER_CONFIG_NAME, FlaxSchedulerMixin
- from ..utils import CONFIG_NAME, DIFFUSERS_CACHE, BaseOutput, http_user_agent, is_transformers_available, logging
+ from ..utils import (
+     CONFIG_NAME,
+     DIFFUSERS_CACHE,
+     BaseOutput,
+     PushToHubMixin,
+     http_user_agent,
+     is_transformers_available,
+     logging,
+ )


  if is_transformers_available():
@@ -90,7 +98,7 @@ class FlaxImagePipelineOutput(BaseOutput):
      images: Union[List[PIL.Image.Image], np.ndarray]


- class FlaxDiffusionPipeline(ConfigMixin):
+ class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin):
      r"""
      Base class for Flax-based pipelines.

@@ -139,7 +147,13 @@ class FlaxDiffusionPipeline(ConfigMixin):
              # set models
              setattr(self, name, module)

-     def save_pretrained(self, save_directory: Union[str, os.PathLike], params: Union[Dict, FrozenDict]):
+     def save_pretrained(
+         self,
+         save_directory: Union[str, os.PathLike],
+         params: Union[Dict, FrozenDict],
+         push_to_hub: bool = False,
+         **kwargs,
+     ):
          # TODO: handle inference_state
          """
          Save all saveable variables of the pipeline to a directory. A pipeline variable can be saved and loaded if its
@@ -149,6 +163,12 @@ class FlaxDiffusionPipeline(ConfigMixin):
          Arguments:
              save_directory (`str` or `os.PathLike`):
                  Directory to which to save. Will be created if it doesn't exist.
+             push_to_hub (`bool`, *optional*, defaults to `False`):
+                 Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                 repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                 namespace).
+             kwargs (`Dict[str, Any]`, *optional*):
+                 Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
          """
          self.save_config(save_directory)

@@ -157,6 +177,14 @@ class FlaxDiffusionPipeline(ConfigMixin):
          model_index_dict.pop("_diffusers_version")
          model_index_dict.pop("_module", None)

+         if push_to_hub:
+             commit_message = kwargs.pop("commit_message", None)
+             private = kwargs.pop("private", False)
+             create_pr = kwargs.pop("create_pr", False)
+             token = kwargs.pop("token", None)
+             repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+             repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
+
          for pipeline_component_name in model_index_dict.keys():
              sub_model = getattr(self, pipeline_component_name)
              if sub_model is None:
@@ -188,6 +216,15 @@ class FlaxDiffusionPipeline(ConfigMixin):
              else:
                  save_method(os.path.join(save_directory, pipeline_component_name))

+         if push_to_hub:
+             self._upload_folder(
+                 save_directory,
+                 repo_id,
+                 token=token,
+                 commit_message=commit_message,
+                 create_pr=create_pr,
+             )
+
      @classmethod
      def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
          r"""
diffusers/pipelines/pipeline_utils.py

@@ -28,7 +28,7 @@ from typing import Any, Callable, Dict, List, Optional, Union
  import numpy as np
  import PIL
  import torch
- from huggingface_hub import ModelCard, hf_hub_download, model_info, snapshot_download
+ from huggingface_hub import ModelCard, create_repo, hf_hub_download, model_info, snapshot_download
  from packaging import version
  from requests.exceptions import HTTPError
  from tqdm.auto import tqdm
@@ -52,7 +52,6 @@ from ..utils import (
      is_accelerate_available,
      is_accelerate_version,
      is_compiled_module,
-     is_safetensors_available,
      is_torch_version,
      is_transformers_available,
      logging,
@@ -67,7 +66,7 @@ if is_transformers_available():
      from transformers.utils import SAFE_WEIGHTS_NAME as TRANSFORMERS_SAFE_WEIGHTS_NAME
      from transformers.utils import WEIGHTS_NAME as TRANSFORMERS_WEIGHTS_NAME

- from ..utils import FLAX_WEIGHTS_NAME, ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME
+ from ..utils import FLAX_WEIGHTS_NAME, ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, PushToHubMixin


  if is_accelerate_available():
@@ -473,7 +472,7 @@ def load_sub_model(
      return loaded_sub_model


- class DiffusionPipeline(ConfigMixin):
+ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
      r"""
      Base class for all pipelines.

@@ -557,8 +556,10 @@ class DiffusionPipeline(ConfigMixin):
      def save_pretrained(
          self,
          save_directory: Union[str, os.PathLike],
-         safe_serialization: bool = False,
+         safe_serialization: bool = True,
          variant: Optional[str] = None,
+         push_to_hub: bool = False,
+         **kwargs,
      ):
          """
          Save all saveable variables of the pipeline to a directory. A pipeline variable can be saved and loaded if its
@@ -568,10 +569,16 @@
          Arguments:
              save_directory (`str` or `os.PathLike`):
                  Directory to save a pipeline to. Will be created if it doesn't exist.
-             safe_serialization (`bool`, *optional*, defaults to `False`):
+             safe_serialization (`bool`, *optional*, defaults to `True`):
                  Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
              variant (`str`, *optional*):
                  If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
+             push_to_hub (`bool`, *optional*, defaults to `False`):
+                 Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                 repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                 namespace).
+             kwargs (`Dict[str, Any]`, *optional*):
+                 Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
          """
          model_index_dict = dict(self.config)
          model_index_dict.pop("_class_name", None)
@@ -579,6 +586,14 @@
          model_index_dict.pop("_module", None)
          model_index_dict.pop("_name_or_path", None)

+         if push_to_hub:
+             commit_message = kwargs.pop("commit_message", None)
+             private = kwargs.pop("private", False)
+             create_pr = kwargs.pop("create_pr", False)
+             token = kwargs.pop("token", None)
+             repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+             repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
+
          expected_modules, optional_kwargs = self._get_signature_keys(self)

          def is_saveable_module(name, value):
@@ -642,6 +657,15 @@
          # finally save the config
          self.save_config(save_directory)

+         if push_to_hub:
+             self._upload_folder(
+                 save_directory,
+                 repo_id,
+                 token=token,
+                 commit_message=commit_message,
+                 create_pr=create_pr,
+             )
+
      def to(
          self,
          torch_device: Optional[Union[str, torch.device]] = None,
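With these changes, `save_pretrained` writes `.safetensors` files by default and can optionally upload the result. A short sketch (the directory and repo id are illustrative assumptions):

```py
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)

# safetensors is now the default; pass safe_serialization=False to keep .bin files
pipe.save_pretrained("my-sd-pipeline")

# or save and push in one call, using the kwargs popped in the diff above
pipe.save_pretrained(
    "my-sd-pipeline",
    push_to_hub=True,
    repo_id="my-user/my-sd-pipeline",  # hypothetical repo id
    commit_message="Upload pipeline",
    private=True,
)
```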
@@ -899,7 +923,7 @@ class DiffusionPipeline(ConfigMixin):
          offload_state_dict = kwargs.pop("offload_state_dict", False)
          low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
          variant = kwargs.pop("variant", None)
-         use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
+         use_safetensors = kwargs.pop("use_safetensors", None)
          load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)

          # 1. Download the checkpoints and configs
@@ -1311,14 +1335,9 @@
          use_onnx = kwargs.pop("use_onnx", None)
          load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)

-         if use_safetensors and not is_safetensors_available():
-             raise ValueError(
-                 "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
-             )
-
          allow_pickle = False
          if use_safetensors is None:
-             use_safetensors = is_safetensors_available()
+             use_safetensors = True
              allow_pickle = True

          allow_patterns = None
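With the availability check removed, leaving `use_safetensors` unset prefers `.safetensors` weights but keeps a pickle fallback (`allow_pickle = True`), while passing `use_safetensors=True` explicitly disables that fallback. A small sketch of the two call styles:

```py
from diffusers import StableDiffusionPipeline

# Default (use_safetensors=None): safetensors weights are preferred, but the loader
# may still fall back to .bin files if a repo ships no safetensors.
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# Explicit True: only safetensors files are accepted; loading fails if none exist.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", use_safetensors=True
)
```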
@@ -1375,7 +1394,7 @@
          # if the whole pipeline is cached we don't have to ping the Hub
          if revision in DEPRECATED_REVISION_ARGS and version.parse(
              version.parse(__version__).base_version
-         ) >= version.parse("0.20.0"):
+         ) >= version.parse("0.22.0"):
              warn_deprecated_model_variant(
                  pretrained_model_name, use_auth_token, variant, revision, model_filenames
              )
@@ -1669,8 +1688,16 @@
      def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
          r"""
          Enable sliced attention computation. When this option is enabled, the attention module splits the input tensor
-         in slices to compute attention in several steps. This is useful to save some memory in exchange for a small
-         speed decrease.
+         in slices to compute attention in several steps. For more than one attention head, the computation is performed
+         sequentially over each head. This is useful to save some memory in exchange for a small speed decrease.
+
+         <Tip warning={true}>
+
+         ⚠️ Don't enable attention slicing if you're already using `scaled_dot_product_attention` (SDPA) from PyTorch
+         2.0 or xFormers. These attention computations are already very memory efficient so you won't need to enable
+         this function. If you enable attention slicing with SDPA or xFormers, it can lead to serious slow downs!
+
+         </Tip>

          Args:
              slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
@@ -1678,6 +1705,23 @@
                  `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
                  provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
                  must be a multiple of `slice_size`.
+
+         Examples:
+
+         ```py
+         >>> import torch
+         >>> from diffusers import StableDiffusionPipeline
+
+         >>> pipe = StableDiffusionPipeline.from_pretrained(
+         ...     "runwayml/stable-diffusion-v1-5",
+         ...     torch_dtype=torch.float16,
+         ...     use_safetensors=True,
+         ... )
+
+         >>> prompt = "a photo of an astronaut riding a horse on mars"
+         >>> pipe.enable_attention_slicing()
+         >>> image = pipe(prompt).images[0]
+         ```
          """
          self.set_attention_slice(slice_size)

diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py

@@ -442,7 +442,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
          if do_classifier_free_guidance:
              uncond_tokens: List[str]
              if negative_prompt is None:
-                 uncond_tokens = [""]
+                 uncond_tokens = [""] * batch_size
              elif type(prompt) is not type(negative_prompt):
                  raise TypeError(
                      f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -471,7 +471,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):

              # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
              seq_len = uncond_embeddings.shape[1]
-             uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+             uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
              uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

              # For classifier free guidance, we need to do two forward passes.
diffusers/pipelines/stable_diffusion/__init__.py

@@ -45,6 +45,7 @@ else:
      from .pipeline_cycle_diffusion import CycleDiffusionPipeline
      from .pipeline_stable_diffusion import StableDiffusionPipeline
      from .pipeline_stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
+     from .pipeline_stable_diffusion_gligen import StableDiffusionGLIGENPipeline
      from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
      from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
      from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
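This export backs the new `pipeline_stable_diffusion_gligen.py` module (+832 lines in the file list above). A rough usage sketch for grounded generation; the checkpoint name and the `gligen_*` keyword arguments follow the pipeline's documented interface but are assumptions here rather than part of this diff:

```py
import torch
from diffusers import StableDiffusionGLIGENPipeline

pipe = StableDiffusionGLIGENPipeline.from_pretrained(
    "masterful/gligen-1-4-generation-text-box",  # assumed GLIGEN checkpoint
    torch_dtype=torch.float16,
).to("cuda")

image = pipe(
    prompt="a waterfall and a modern high speed train in a beautiful forest",
    gligen_phrases=["a waterfall", "a modern high speed train"],        # one phrase per box
    gligen_boxes=[[0.14, 0.21, 0.43, 0.77], [0.61, 0.34, 0.94, 0.80]],  # normalized xyxy
    gligen_scheduled_sampling_beta=1.0,
    num_inference_steps=50,
).images[0]
```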
diffusers/pipelines/stable_diffusion/convert_from_ckpt.py

@@ -50,7 +50,7 @@ from ...schedulers import (
      PNDMScheduler,
      UnCLIPScheduler,
  )
- from ...utils import is_accelerate_available, is_omegaconf_available, is_safetensors_available, logging
+ from ...utils import is_accelerate_available, is_omegaconf_available, logging
  from ...utils.import_utils import BACKENDS_MAPPING
  from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
  from ..paint_by_example import PaintByExampleImageEncoder
@@ -367,7 +367,7 @@ def create_diffusers_schedular(original_config):


  def create_ldm_bert_config(original_config):
-     bert_params = original_config.model.parms.cond_stage_config.params
+     bert_params = original_config.model.params.cond_stage_config.params
      config = LDMBertConfig(
          d_model=bert_params.n_embed,
          encoder_layers=bert_params.n_layer,
@@ -778,11 +778,13 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
  def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
      if text_encoder is None:
          config_name = "openai/clip-vit-large-patch14"
-         config = CLIPTextConfig.from_pretrained(config_name)
+         config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only)

          ctx = init_empty_weights if is_accelerate_available() else nullcontext
          with ctx():
              text_model = CLIPTextModel(config)
+     else:
+         text_model = text_encoder

      keys = list(checkpoint.keys())

@@ -832,8 +834,8 @@ protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
  textenc_pattern = re.compile("|".join(protected.keys()))


- def convert_paint_by_example_checkpoint(checkpoint):
-     config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14")
+ def convert_paint_by_example_checkpoint(checkpoint, local_files_only=False):
+     config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
      model = PaintByExampleImageEncoder(config)

      keys = list(checkpoint.keys())
@@ -900,13 +902,18 @@ def convert_paint_by_example_checkpoint(checkpoint):


  def convert_open_clip_checkpoint(
-     checkpoint, config_name, prefix="cond_stage_model.model.", has_projection=False, **config_kwargs
+     checkpoint,
+     config_name,
+     prefix="cond_stage_model.model.",
+     has_projection=False,
+     local_files_only=False,
+     **config_kwargs,
  ):
      # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
      # text_model = CLIPTextModelWithProjection.from_pretrained(
      #     "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
      # )
-     config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs)
+     config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only)

      ctx = init_empty_weights if is_accelerate_available() else nullcontext
      with ctx():
@@ -971,7 +978,7 @@ def convert_open_clip_checkpoint(
      return text_model


- def stable_unclip_image_encoder(original_config):
+ def stable_unclip_image_encoder(original_config, local_files_only=False):
      """
      Returns the image processor and clip image encoder for the img2img unclip pipeline.

@@ -989,13 +996,17 @@ def stable_unclip_image_encoder(original_config):

          if clip_model_name == "ViT-L/14":
              feature_extractor = CLIPImageProcessor()
-             image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+             image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+                 "openai/clip-vit-large-patch14", local_files_only=local_files_only
+             )
          else:
              raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}")

      elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder":
          feature_extractor = CLIPImageProcessor()
-         image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
+         image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+             "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", local_files_only=local_files_only
+         )
      else:
          raise NotImplementedError(
              f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}"
@@ -1070,7 +1081,9 @@ def convert_controlnet_checkpoint(
      if cross_attention_dim is not None:
          ctrlnet_config["cross_attention_dim"] = cross_attention_dim

-     controlnet = ControlNetModel(**ctrlnet_config)
+     ctx = init_empty_weights if is_accelerate_available() else nullcontext
+     with ctx():
+         controlnet = ControlNetModel(**ctrlnet_config)

      # Some controlnet ckpt files are distributed independently from the rest of the
      # model components i.e. https://huggingface.co/thibaud/controlnet-sd21/
@@ -1088,7 +1101,11 @@ def convert_controlnet_checkpoint(
          skip_extract_state_dict=skip_extract_state_dict,
      )

-     controlnet.load_state_dict(converted_ctrl_checkpoint)
+     if is_accelerate_available():
+         for param_name, param in converted_ctrl_checkpoint.items():
+             set_module_tensor_to_device(controlnet, param_name, "cpu", value=param)
+     else:
+         controlnet.load_state_dict(converted_ctrl_checkpoint)

      return controlnet

@@ -1116,6 +1133,7 @@ def download_from_original_stable_diffusion_ckpt(
      vae=None,
      text_encoder=None,
      tokenizer=None,
+     config_files=None,
  ) -> DiffusionPipeline:
      """
      Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
@@ -1175,6 +1193,13 @@ def download_from_original_stable_diffusion_ckpt(
              [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
              to use. If this parameter is `None`, the function will load a new instance of [CLIPTokenizer] by itself, if
              needed.
+         config_files (`Dict[str, str]`, *optional*, defaults to `None`):
+             A dictionary mapping from config file names to their contents. If this parameter is `None`, the function
+             will load the config files by itself, if needed. Valid keys are:
+                 - `v1`: Config file for Stable Diffusion v1
+                 - `v2`: Config file for Stable Diffusion v2
+                 - `xl`: Config file for Stable Diffusion XL
+                 - `xl_refiner`: Config file for Stable Diffusion XL Refiner
      return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
      """

@@ -1186,7 +1211,6 @@ def download_from_original_stable_diffusion_ckpt(
          StableDiffusionInpaintPipeline,
          StableDiffusionPipeline,
          StableDiffusionXLImg2ImgPipeline,
-         StableDiffusionXLPipeline,
          StableUnCLIPImg2ImgPipeline,
          StableUnCLIPPipeline,
      )
@@ -1203,9 +1227,6 @@ def download_from_original_stable_diffusion_ckpt(
      from omegaconf import OmegaConf

      if from_safetensors:
-         if not is_safetensors_available():
-             raise ValueError(BACKENDS_MAPPING["safetensors"][1])
-
          from safetensors.torch import load_file as safe_load

          checkpoint = safe_load(checkpoint_path, device="cpu")
@@ -1397,14 +1418,16 @@ def download_from_original_stable_diffusion_ckpt(
          else:
              vae.load_state_dict(converted_vae_checkpoint)
      elif vae is None:
-         vae = AutoencoderKL.from_pretrained(vae_path)
+         vae = AutoencoderKL.from_pretrained(vae_path, local_files_only=local_files_only)

      if model_type == "FrozenOpenCLIPEmbedder":
          config_name = "stabilityai/stable-diffusion-2"
          config_kwargs = {"subfolder": "text_encoder"}

          text_model = convert_open_clip_checkpoint(checkpoint, config_name, **config_kwargs)
-         tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
+         tokenizer = CLIPTokenizer.from_pretrained(
+             "stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only
+         )

          if stable_unclip is None:
              if controlnet:
@@ -1456,12 +1479,20 @@ def download_from_original_stable_diffusion_ckpt(
          elif stable_unclip == "txt2img":
              if stable_unclip_prior is None or stable_unclip_prior == "karlo":
                  karlo_model = "kakaobrain/karlo-v1-alpha"
-                 prior = PriorTransformer.from_pretrained(karlo_model, subfolder="prior")
-
-                 prior_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-                 prior_text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
-
-                 prior_scheduler = UnCLIPScheduler.from_pretrained(karlo_model, subfolder="prior_scheduler")
+                 prior = PriorTransformer.from_pretrained(
+                     karlo_model, subfolder="prior", local_files_only=local_files_only
+                 )
+
+                 prior_tokenizer = CLIPTokenizer.from_pretrained(
+                     "openai/clip-vit-large-patch14", local_files_only=local_files_only
+                 )
+                 prior_text_model = CLIPTextModelWithProjection.from_pretrained(
+                     "openai/clip-vit-large-patch14", local_files_only=local_files_only
+                 )
+
+                 prior_scheduler = UnCLIPScheduler.from_pretrained(
+                     karlo_model, subfolder="prior_scheduler", local_files_only=local_files_only
+                 )
                  prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
              else:
                  raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}")
@@ -1487,8 +1518,10 @@ def download_from_original_stable_diffusion_ckpt(
              raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}")
      elif model_type == "PaintByExample":
          vision_model = convert_paint_by_example_checkpoint(checkpoint)
-         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-         feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+         feature_extractor = AutoFeatureExtractor.from_pretrained(
+             "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+         )
          pipe = PaintByExamplePipeline(
              vae=vae,
              image_encoder=vision_model,
@@ -1501,11 +1534,19 @@ def download_from_original_stable_diffusion_ckpt(
          text_model = convert_ldm_clip_checkpoint(
              checkpoint, local_files_only=local_files_only, text_encoder=text_encoder
          )
-         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") if tokenizer is None else tokenizer
+         tokenizer = (
+             CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+             if tokenizer is None
+             else tokenizer
+         )

          if load_safety_checker:
-             safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
-             feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+             safety_checker = StableDiffusionSafetyChecker.from_pretrained(
+                 "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+             )
+             feature_extractor = AutoFeatureExtractor.from_pretrained(
+                 "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
+             )
          else:
              safety_checker = None
              feature_extractor = None
@@ -1533,9 +1574,13 @@ def download_from_original_stable_diffusion_ckpt(
          )
      elif model_type in ["SDXL", "SDXL-Refiner"]:
          if model_type == "SDXL":
-             tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+             tokenizer = CLIPTokenizer.from_pretrained(
+                 "openai/clip-vit-large-patch14", local_files_only=local_files_only
+             )
              text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
-             tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
+             tokenizer_2 = CLIPTokenizer.from_pretrained(
+                 "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only
+             )

              config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
              config_kwargs = {"projection_dim": 1280}
@@ -1543,7 +1588,7 @@ def download_from_original_stable_diffusion_ckpt(
                  checkpoint, config_name, prefix="conditioner.embedders.1.model.", has_projection=True, **config_kwargs
              )

-             pipe = StableDiffusionXLPipeline(
+             pipe = pipeline_class(
                  vae=vae,
                  text_encoder=text_encoder,
                  tokenizer=tokenizer,
@@ -1556,7 +1601,9 @@ def download_from_original_stable_diffusion_ckpt(
          else:
              tokenizer = None
              text_encoder = None
-             tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
+             tokenizer_2 = CLIPTokenizer.from_pretrained(
+                 "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only
+             )

              config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
              config_kwargs = {"projection_dim": 1280}
@@ -1578,7 +1625,7 @@ def download_from_original_stable_diffusion_ckpt(
      else:
          text_config = create_ldm_bert_config(original_config)
          text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
-         tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+         tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", local_files_only=local_files_only)
          pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)

      return pipe
@@ -1602,9 +1649,6 @@ def download_controlnet_from_original_ckpt(
      from omegaconf import OmegaConf

      if from_safetensors:
-         if not is_safetensors_available():
-             raise ValueError(BACKENDS_MAPPING["safetensors"][1])
-
          from safetensors import safe_open

          checkpoint = {}
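Taken together, the `local_files_only` plumbing above lets a single-file checkpoint be converted without touching the Hub, as long as the referenced CLIP and tokenizer configs are already in the local cache. A sketch under those assumptions (the file paths are hypothetical):

```py
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
    download_from_original_stable_diffusion_ckpt,
)

pipe = download_from_original_stable_diffusion_ckpt(
    checkpoint_path="./v1-5-pruned-emaonly.safetensors",  # hypothetical local checkpoint
    original_config_file="./v1-inference.yaml",           # hypothetical local YAML config
    from_safetensors=True,
    local_files_only=True,   # forwarded to every from_pretrained call patched above
    load_safety_checker=False,
)
pipe.save_pretrained("converted-pipeline")
```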
diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py

@@ -346,7 +346,14 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
              )
              prompt_embeds = prompt_embeds[0]

-         prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+         if self.text_encoder is not None:
+             prompt_embeds_dtype = self.text_encoder.dtype
+         elif self.unet is not None:
+             prompt_embeds_dtype = self.unet.dtype
+         else:
+             prompt_embeds_dtype = prompt_embeds.dtype
+
+         prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

          bs_embed, seq_len, _ = prompt_embeds.shape
          # duplicate text embeddings for each generation per prompt, using mps friendly method
@@ -402,7 +409,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
          # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
          seq_len = negative_prompt_embeds.shape[1]

-         negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+         negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

          negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
          negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
@@ -644,7 +651,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
              every step.
          cross_attention_kwargs (`dict`, *optional*):
              A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
-             [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+             [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

          Example:

diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py

@@ -336,7 +336,14 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
              )
              prompt_embeds = prompt_embeds[0]

-         prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+         if self.text_encoder is not None:
+             prompt_embeds_dtype = self.text_encoder.dtype
+         elif self.unet is not None:
+             prompt_embeds_dtype = self.unet.dtype
+         else:
+             prompt_embeds_dtype = prompt_embeds.dtype
+
+         prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

          bs_embed, seq_len, _ = prompt_embeds.shape
          # duplicate text embeddings for each generation per prompt, using mps friendly method
@@ -392,7 +399,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
          # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
          seq_len = negative_prompt_embeds.shape[1]

-         negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+         negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

          negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
          negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
@@ -585,7 +592,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
              every step.
          cross_attention_kwargs (`dict`, *optional*):
              A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
-             [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+             [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
          guidance_rescale (`float`, *optional*, defaults to 0.7):
              Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
              Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
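The dtype fallback above matters when a pipeline is instantiated without a text encoder and driven entirely by precomputed embeddings; previously `self.text_encoder.dtype` raised an `AttributeError` in that setup. A hedged sketch of the scenario (the random placeholder embeddings merely exercise the code path):

```py
import torch
from diffusers import StableDiffusionPipeline

# Load without a text encoder/tokenizer and drive the UNet with precomputed embeddings.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    text_encoder=None,
    tokenizer=None,
    torch_dtype=torch.float16,
).to("cuda")

# Placeholder embeddings of the expected shape [batch, seq_len, hidden_dim]; in practice
# these would come from an externally run CLIP text encoder.
prompt_embeds = torch.randn(1, 77, 768, dtype=torch.float16, device="cuda")
negative_prompt_embeds = torch.zeros_like(prompt_embeds)

# With the fix, the embeddings are cast to the UNet's dtype instead of failing on
# `self.text_encoder.dtype`.
image = pipe(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
).images[0]
```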