diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +20 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +23 -25
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +46 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
  229. diffusers/schedulers/scheduling_edm_euler.py +53 -30
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
  231. diffusers/schedulers/scheduling_euler_discrete.py +163 -67
  232. diffusers/schedulers/scheduling_heun_discrete.py +60 -38
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +27 -25
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. diffusers-0.27.1.dist-info/RECORD +0 -399
  267. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  268. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
  269. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
@@ -13,13 +13,14 @@
  # limitations under the License.

  import inspect
- from typing import Callable, Dict, List, Optional, Union
+ from typing import Any, Callable, Dict, List, Optional, Union

  import numpy as np
  import PIL.Image
  import torch
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
  from ...image_processor import PipelineImageInput, VaeImageProcessor
  from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
@@ -168,14 +169,18 @@ class StableDiffusionInstructPix2PixPipeline(
  num_images_per_prompt: Optional[int] = 1,
  eta: float = 0.0,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  ip_adapter_image: Optional[PipelineImageInput] = None,
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
- callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+ callback_on_step_end: Optional[
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+ ] = None,
  callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  **kwargs,
  ):
  r"""
@@ -184,7 +189,7 @@ class StableDiffusionInstructPix2PixPipeline(
  Args:
  prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
- image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+ image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
  `Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept
  image latents as `image`, but if passing latents directly it is not encoded again.
  num_inference_steps (`int`, *optional*, defaults to 100):
@@ -194,7 +199,7 @@ class StableDiffusionInstructPix2PixPipeline(
  A higher guidance scale value encourages the model to generate images closely linked to the text
  `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
  image_guidance_scale (`float`, *optional*, defaults to 1.5):
- Push the generated image towards the inital `image`. Image guidance scale is enabled by setting
+ Push the generated image towards the initial `image`. Image guidance scale is enabled by setting
  `image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
  linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
  value of at least `1`.
@@ -209,14 +214,14 @@ class StableDiffusionInstructPix2PixPipeline(
  generator (`torch.Generator`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor is generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
  provided, text embeddings are generated from the `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
  ip_adapter_image: (`PipelineImageInput`, *optional*):
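A change repeated throughout these hunks (and across most files in this release) is the switch from `torch.FloatTensor` to `torch.Tensor` in annotations and docstrings. The old type only matches float32 CPU tensors, so it was misleading for fp16/bf16 or CUDA inputs; a quick plain-PyTorch check (not from the diff) illustrates the difference:

    import torch

    x = torch.randn(2, 4, dtype=torch.float16)
    print(isinstance(x, torch.Tensor))       # True: covers any dtype/device
    print(isinstance(x, torch.FloatTensor))  # False: legacy float32-CPU-only type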
@@ -226,15 +231,18 @@ class StableDiffusionInstructPix2PixPipeline(
  return_dict (`bool`, *optional*, defaults to `True`):
  Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
  plain tuple.
- callback_on_step_end (`Callable`, *optional*):
- A function that calls at the end of each denoising steps during the inference. The function is called
- with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
- callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
- `callback_on_step_end_tensor_inputs`.
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
  callback_on_step_end_tensor_inputs (`List`, *optional*):
  The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
  `._callback_tensor_inputs` attribute of your pipeline class.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

  Examples:

@@ -289,6 +297,9 @@ class StableDiffusionInstructPix2PixPipeline(
  "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
  )

+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
  # 0. Check inputs
  self.check_inputs(
  prompt,
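For context on the callback plumbing above: the documented signature receives the pipeline, the step index, the timestep, and a `callback_kwargs` dict holding the tensors named in `callback_on_step_end_tensor_inputs`; in diffusers' convention the callback returns that dict so tensors can be modified mid-loop. A minimal sketch, assuming the well-known `timbrooks/instruct-pix2pix` checkpoint and an illustrative input path:

    import torch
    from diffusers import StableDiffusionInstructPix2PixPipeline
    from diffusers.utils import load_image

    pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
        "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
    ).to("cuda")
    init_image = load_image("input.png")  # any RGB image; path is illustrative

    def log_step(pipeline, step, timestep, callback_kwargs):
        # callback_kwargs carries the tensors requested below ("latents" here)
        print(f"step {step} t={timestep} latents={callback_kwargs['latents'].shape}")
        return callback_kwargs  # the returned dict feeds back into the loop

    result = pipe(
        "make it a watercolor painting",
        image=init_image,
        callback_on_step_end=log_step,
        callback_on_step_end_tensor_inputs=["latents"],
    ).images[0]

Per the hunk above, a `PipelineCallback` or `MultiPipelineCallbacks` instance can be passed instead of a plain function, in which case its `tensor_inputs` attribute overrides `callback_on_step_end_tensor_inputs`.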
@@ -296,6 +307,8 @@ class StableDiffusionInstructPix2PixPipeline(
  negative_prompt,
  prompt_embeds,
  negative_prompt_embeds,
+ ip_adapter_image,
+ ip_adapter_image_embeds,
  callback_on_step_end_tensor_inputs,
  )
  self._guidance_scale = guidance_scale
@@ -303,14 +316,6 @@ class StableDiffusionInstructPix2PixPipeline(

  device = self._execution_device

- if ip_adapter_image is not None:
- output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
- image_embeds, negative_image_embeds = self.encode_image(
- ip_adapter_image, device, num_images_per_prompt, output_hidden_state
- )
- if self.do_classifier_free_guidance:
- image_embeds = torch.cat([image_embeds, negative_image_embeds, negative_image_embeds])
-
  if image is None:
  raise ValueError("`image` input cannot be undefined.")

@@ -335,6 +340,14 @@ class StableDiffusionInstructPix2PixPipeline(
  negative_prompt_embeds=negative_prompt_embeds,
  )

+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+ image_embeds = self.prepare_ip_adapter_image_embeds(
+ ip_adapter_image,
+ ip_adapter_image_embeds,
+ device,
+ batch_size * num_images_per_prompt,
+ self.do_classifier_free_guidance,
+ )
  # 3. Preprocess image
  image = self.image_processor.preprocess(image)

@@ -406,6 +419,7 @@ class StableDiffusionInstructPix2PixPipeline(
  t,
  encoder_hidden_states=prompt_embeds,
  added_cond_kwargs=added_cond_kwargs,
+ cross_attention_kwargs=cross_attention_kwargs,
  return_dict=False,
  )[0]

@@ -468,8 +482,8 @@ class StableDiffusionInstructPix2PixPipeline(
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  ):
  r"""
  Encodes the prompt into text encoder hidden states.
@@ -487,10 +501,10 @@ class StableDiffusionInstructPix2PixPipeline(
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
  less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
@@ -635,6 +649,65 @@ class StableDiffusionInstructPix2PixPipeline(

  return image_embeds, uncond_image_embeds

+ def prepare_ip_adapter_image_embeds(
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+ ):
+ if ip_adapter_image_embeds is None:
+ if not isinstance(ip_adapter_image, list):
+ ip_adapter_image = [ip_adapter_image]
+
+ if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+ raise ValueError(
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+ )
+
+ image_embeds = []
+ for single_ip_adapter_image, image_proj_layer in zip(
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+ ):
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
+ single_ip_adapter_image, device, 1, output_hidden_state
+ )
+ single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+ single_negative_image_embeds = torch.stack(
+ [single_negative_image_embeds] * num_images_per_prompt, dim=0
+ )
+
+ if do_classifier_free_guidance:
+ single_image_embeds = torch.cat(
+ [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+ )
+ single_image_embeds = single_image_embeds.to(device)
+
+ image_embeds.append(single_image_embeds)
+ else:
+ repeat_dims = [1]
+ image_embeds = []
+ for single_image_embeds in ip_adapter_image_embeds:
+ if do_classifier_free_guidance:
+ (
+ single_image_embeds,
+ single_negative_image_embeds,
+ single_negative_image_embeds,
+ ) = single_image_embeds.chunk(3)
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ single_negative_image_embeds = single_negative_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+ )
+ single_image_embeds = torch.cat(
+ [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+ )
+ else:
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ image_embeds.append(single_image_embeds)
+
+ return image_embeds
+
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
  def run_safety_checker(self, image, device, dtype):
  if self.safety_checker is None:
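Two things stand out in the new `prepare_ip_adapter_image_embeds` above. First, it supports multiple IP Adapters, pairing each entry of `ip_adapter_image` with one entry of `image_projection_layers`. Second, the classifier-free-guidance concatenation is three-way (`[cond, negative, negative]`) because InstructPix2Pix denoises three guidance branches (text+image, image-only, unconditional), so a precomputed entry in `ip_adapter_image_embeds` is expected to split via `chunk(3)` along the batch dimension. A hedged sketch of precomputing the embeds once and reusing them across calls (assumes `pipe` and `init_image` from the earlier sketch, an IP Adapter already loaded via `pipe.load_ip_adapter(...)`, and `adapter_image` as a PIL image):

    # Encode the adapter image once; with CFG on, each returned tensor is
    # stacked 3x along dim 0, matching the chunk(3) layout expected above.
    image_embeds = pipe.prepare_ip_adapter_image_embeds(
        ip_adapter_image=[adapter_image],   # one entry per loaded IP Adapter
        ip_adapter_image_embeds=None,
        device="cuda",
        num_images_per_prompt=1,
        do_classifier_free_guidance=True,
    )

    # Later calls can skip re-encoding by passing the embeds back in;
    # check_inputs (below) requires a list of 3D or 4D tensors.
    edited = pipe(
        "make it a watercolor painting",
        image=init_image,
        ip_adapter_image_embeds=image_embeds,
    ).images[0]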
@@ -687,6 +760,8 @@ class StableDiffusionInstructPix2PixPipeline(
  negative_prompt=None,
  prompt_embeds=None,
  negative_prompt_embeds=None,
+ ip_adapter_image=None,
+ ip_adapter_image_embeds=None,
  callback_on_step_end_tensor_inputs=None,
  ):
  if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
@@ -728,9 +803,29 @@ class StableDiffusionInstructPix2PixPipeline(
  f" {negative_prompt_embeds.shape}."
  )

+ if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+ raise ValueError(
+ "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+ )
+
+ if ip_adapter_image_embeds is not None:
+ if not isinstance(ip_adapter_image_embeds, list):
+ raise ValueError(
+ f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+ )
+ elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+ raise ValueError(
+ f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+ )
+
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ shape = (
+ batch_size,
+ num_channels_latents,
+ int(height) // self.vae_scale_factor,
+ int(width) // self.vae_scale_factor,
+ )
  if isinstance(generator, list) and len(generator) != batch_size:
  raise ValueError(
  f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -221,7 +221,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
  )

  # verify batch size of prompt and image are same if image is a list or tensor
- if isinstance(image, list) or isinstance(image, torch.Tensor):
+ if isinstance(image, (list, torch.Tensor)):
  if isinstance(prompt, str):
  batch_size = 1
  else:
@@ -267,10 +267,10 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
  guidance_scale: float = 9.0,
  negative_prompt: Optional[Union[str, List[str]]] = None,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
  callback_steps: int = 1,
  ):
  r"""
@@ -279,7 +279,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
  Args:
  prompt (`str` or `List[str]`):
  The prompt or prompts to guide image upscaling.
- image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
  `Image` or tensor representing an image batch to be upscaled. If it's a tensor, it can be either a
  latent output from a Stable Diffusion model or an image tensor in the range `[-1, 1]`. It is considered
  a `latent` if `image.shape[1]` is `4`; otherwise, it is considered to be an image representation and
@@ -299,7 +299,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor is generated by sampling using the supplied random `generator`.
@@ -310,7 +310,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
  plain tuple.
  callback (`Callable`, *optional*):
  A function that calls every `callback_steps` steps during inference. The function is called with the
- following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
  callback_steps (`int`, *optional*, defaults to 1):
  The frequency at which the `callback` function is called. If not specified, the callback is called at
  every step.
@@ -176,8 +176,8 @@ class StableDiffusionUpscalePipeline(
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  **kwargs,
  ):
@@ -209,8 +209,8 @@ class StableDiffusionUpscalePipeline(
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  clip_skip: Optional[int] = None,
  ):
@@ -230,10 +230,10 @@ class StableDiffusionUpscalePipeline(
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
  less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
@@ -468,7 +468,7 @@ class StableDiffusionUpscalePipeline(
  )

  # verify batch size of prompt and image are same if image is a list or tensor or numpy array
- if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray):
+ if isinstance(image, (list, np.ndarray, torch.Tensor)):
  if prompt is not None and isinstance(prompt, str):
  batch_size = 1
  elif prompt is not None and isinstance(prompt, list):
@@ -542,12 +542,12 @@ class StableDiffusionUpscalePipeline(
  num_images_per_prompt: Optional[int] = 1,
  eta: float = 0.0,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
  callback_steps: int = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  clip_skip: int = None,
@@ -558,7 +558,7 @@ class StableDiffusionUpscalePipeline(
  Args:
  prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
- image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
  `Image` or tensor representing an image batch to be upscaled.
  num_inference_steps (`int`, *optional*, defaults to 50):
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -577,14 +577,14 @@ class StableDiffusionUpscalePipeline(
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor is generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
  provided, text embeddings are generated from the `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
  output_type (`str`, *optional*, defaults to `"pil"`):
@@ -594,7 +594,7 @@ class StableDiffusionUpscalePipeline(
  plain tuple.
  callback (`Callable`, *optional*):
  A function that calls every `callback_steps` steps during inference. The function is called with the
- following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
  callback_steps (`int`, *optional*, defaults to 1):
  The frequency at which the `callback` function is called. If not specified, the callback is called at
  every step.
@@ -76,7 +76,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
  prior_text_encoder ([`CLIPTextModelWithProjection`]):
  Frozen [`CLIPTextModelWithProjection`] text-encoder.
  prior ([`PriorTransformer`]):
- The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+ The canonical unCLIP prior to approximate the image embedding from the text embedding.
  prior_scheduler ([`KarrasDiffusionSchedulers`]):
  Scheduler used in the prior denoising process.
  image_normalizer ([`StableUnCLIPImageNormalizer`]):
@@ -257,8 +257,8 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  **kwargs,
  ):
@@ -290,8 +290,8 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  clip_skip: Optional[int] = None,
  ):
@@ -311,10 +311,10 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
  less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
@@ -588,7 +588,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
  self,
  image_embeds: torch.Tensor,
  noise_level: int,
- noise: Optional[torch.FloatTensor] = None,
+ noise: Optional[torch.Tensor] = None,
  generator: Optional[torch.Generator] = None,
  ):
  """
@@ -644,19 +644,19 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
  num_images_per_prompt: Optional[int] = 1,
  eta: float = 0.0,
  generator: Optional[torch.Generator] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
  callback_steps: int = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  noise_level: int = 0,
  # prior args
  prior_num_inference_steps: int = 25,
  prior_guidance_scale: float = 4.0,
- prior_latents: Optional[torch.FloatTensor] = None,
+ prior_latents: Optional[torch.Tensor] = None,
  clip_skip: Optional[int] = None,
  ):
  """
@@ -686,14 +686,14 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor is generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
  provided, text embeddings are generated from the `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
  output_type (`str`, *optional*, defaults to `"pil"`):
@@ -702,7 +702,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
  Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
  callback (`Callable`, *optional*):
  A function that calls every `callback_steps` steps during inference. The function is called with the
- following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
  callback_steps (`int`, *optional*, defaults to 1):
  The frequency at which the `callback` function is called. If not specified, the callback is called at
  every step.
@@ -718,7 +718,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
  prior_guidance_scale (`float`, *optional*, defaults to 4.0):
  A higher guidance scale value encourages the model to generate images closely linked to the text
  `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
- prior_latents (`torch.FloatTensor`, *optional*):
+ prior_latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
  embedding generation in the prior denoising process. Can be used to tweak the same generation with
  different prompts. If not provided, a latents tensor is generated by sampling using the supplied random
@@ -876,7 +876,12 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver

  # 11. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ shape = (
+ batch_size,
+ num_channels_latents,
+ int(height) // self.vae_scale_factor,
+ int(width) // self.vae_scale_factor,
+ )
  latents = self.prepare_latents(
  shape=shape,
  dtype=prompt_embeds.dtype,