diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +20 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +23 -25
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +46 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
  229. diffusers/schedulers/scheduling_edm_euler.py +53 -30
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
  231. diffusers/schedulers/scheduling_euler_discrete.py +163 -67
  232. diffusers/schedulers/scheduling_heun_discrete.py +60 -38
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +27 -25
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. diffusers-0.27.1.dist-info/RECORD +0 -399
  267. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  268. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
  269. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
@@ -191,7 +191,12 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
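The `int(...)` casts make the latent shape robust to float `height`/`width` values. A quick sketch of the failure mode being fixed (values illustrative, not from the diff):

```py
vae_scale_factor = 8
height = 512.0                   # can arrive as a float from upstream arithmetic
height // vae_scale_factor       # 64.0 -- a float dimension, which torch.randn() rejects
int(height) // vae_scale_factor  # 64   -- a valid tensor dimension
```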
@@ -219,10 +224,10 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         editing_prompt: Optional[Union[str, List[str]]] = None,
         editing_prompt_embeddings: Optional[torch.Tensor] = None,
@@ -263,7 +268,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
@@ -274,7 +279,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
                plain tuple.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
-               following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+               following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
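This release swaps `torch.FloatTensor` for `torch.Tensor` in annotations and docstrings across the codebase (visible in the per-file counts above). A minimal sketch of a step callback matching the updated signature; the pipeline setup is assumed:

```py
import torch

def log_step(step: int, timestep: int, latents: torch.Tensor) -> None:
    # invoked every `callback_steps` steps during denoising
    print(f"step={step} t={timestep} latents={tuple(latents.shape)}")

# images = pipe(prompt, callback=log_step, callback_steps=5).images
```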
@@ -69,7 +69,7 @@ class ShapEPipelineOutput(BaseOutput):
     Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].
 
     Args:
-        images (`torch.FloatTensor`)
+        images (`torch.Tensor`)
            A list of images for 3D rendering.
     """
 
@@ -187,7 +187,7 @@ class ShapEPipeline(DiffusionPipeline):
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         guidance_scale: float = 4.0,
         frame_size: int = 64,
         output_type: Optional[str] = "pil",  # pil, np, latent, mesh
@@ -207,7 +207,7 @@ class ShapEPipeline(DiffusionPipeline):
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
@@ -70,7 +70,7 @@ class ShapEPipelineOutput(BaseOutput):
     Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`].
 
     Args:
-        images (`torch.FloatTensor`)
+        images (`torch.Tensor`)
            A list of images for 3D rendering.
     """
 
@@ -86,7 +86,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
 
     Args:
         prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         image_encoder ([`~transformers.CLIPVisionModel`]):
            Frozen image-encoder.
         image_processor ([`~transformers.CLIPImageProcessor`]):
@@ -169,7 +169,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         guidance_scale: float = 4.0,
         frame_size: int = 64,
         output_type: Optional[str] = "pil",  # pil, np, latent, mesh
@@ -179,7 +179,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
        The call function to the pipeline for generation.
 
        Args:
-           image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+           image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image` or tensor representing an image batch to be used as the starting point. Can also accept image
                latents as image, but if passing latents directly it is not encoded again.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -190,7 +190,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
@@ -239,15 +239,15 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
 
         num_embeddings = self.prior.config.num_embeddings
         embedding_dim = self.prior.config.embedding_dim
-
-        latents = self.prepare_latents(
-            (batch_size, num_embeddings * embedding_dim),
-            image_embeds.dtype,
-            device,
-            generator,
-            latents,
-            self.scheduler,
-        )
+        if latents is None:
+            latents = self.prepare_latents(
+                (batch_size, num_embeddings * embedding_dim),
+                image_embeds.dtype,
+                device,
+                generator,
+                latents,
+                self.scheduler,
+            )
 
         # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim
         latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim)
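With the `if latents is None:` guard, caller-supplied latents are no longer regenerated inside the call, so the same starting noise can be replayed across invocations. A sketch under assumed names (`openai/shap-e-img2img` is the repo id this pipeline is usually published under; `image` is any input image prepared beforehand):

```py
import torch
from diffusers import ShapEImg2ImgPipeline

pipe = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img")
shape = (1, pipe.prior.config.num_embeddings * pipe.prior.config.embedding_dim)
latents = torch.randn(shape)

out_a = pipe(image, latents=latents, frame_size=64)
out_b = pipe(image, latents=latents, frame_size=64)  # starts from the same noise as out_a
```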
@@ -844,7 +844,7 @@ class ShapERenderer(ModelMixin, ConfigMixin):
        transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1],
        math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty).
 
-       args:
+       Args:
            rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples:
            number of ts to sample. prev_model_outputs: model outputs from the previous rendering step, including
 
@@ -100,8 +100,10 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
         )
         self.register_to_config(latent_dim_scale=latent_dim_scale)
 
-    def prepare_latents(self, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, scheduler):
-        batch_size, channels, height, width = image_embeddings.shape
+    def prepare_latents(
+        self, batch_size, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, scheduler
+    ):
+        _, channels, height, width = image_embeddings.shape
         latents_shape = (
             batch_size * num_images_per_prompt,
             4,
@@ -127,10 +129,10 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
         do_classifier_free_guidance,
         prompt=None,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
     ):
         if prompt_embeds is None:
             # get prompt text embeddings
@@ -283,18 +285,18 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
-        image_embeddings: Union[torch.FloatTensor, List[torch.FloatTensor]],
+        image_embeddings: Union[torch.Tensor, List[torch.Tensor]],
         prompt: Union[str, List[str]] = None,
         num_inference_steps: int = 10,
         guidance_scale: float = 0.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -304,7 +306,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
        Function invoked when calling the pipeline for generation.
 
        Args:
-           image_embedding (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+           image_embedding (`torch.Tensor` or `List[torch.Tensor]`):
                Image Embeddings either extracted from an image or generated by a Prior Model.
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
@@ -320,26 +322,26 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `decoder_guidance_scale` is less than `1`).
-           prompt_embeds (`torch.FloatTensor`, *optional*):
+           prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
-           prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-           negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
-           negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-               weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input
-               argument.
+               weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
+               input argument.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
@@ -383,7 +385,19 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
         )
         if isinstance(image_embeddings, list):
             image_embeddings = torch.cat(image_embeddings, dim=0)
-        batch_size = image_embeddings.shape[0]
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Compute the effective number of images per prompt
+        # We must account for the fact that the image embeddings from the prior can be generated with num_images_per_prompt > 1
+        # This results in a case where a single prompt is associated with multiple image embeddings
+        # Divide the number of image embeddings by the batch size to determine if this is the case.
+        num_images_per_prompt = num_images_per_prompt * (image_embeddings.shape[0] // batch_size)
 
         # 2. Encode caption
         if prompt_embeds is None and negative_prompt_embeds is None:
@@ -417,7 +431,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
 
         # 5. Prepare latents
         latents = self.prepare_latents(
-            image_embeddings, num_images_per_prompt, dtype, device, generator, latents, self.scheduler
+            batch_size, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, self.scheduler
         )
 
         # 6. Run denoising loop
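The effective-batch computation in the `@@ -383` hunk can be checked by hand. A worked example, assuming the prior was run with `num_images_per_prompt=2` on a single string prompt:

```py
# the prior produced 2 embeddings for 1 prompt -> image_embeddings.shape[0] == 2
batch_size = 1                                 # single string prompt
num_images_per_prompt = 1 * (2 // batch_size)  # == 2: one decode per prior embedding
```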
@@ -31,7 +31,10 @@ TEXT2IMAGE_EXAMPLE_DOC_STRING = """
        ```py
        >>> import torch
        >>> from diffusers import StableCascadeCombinedPipeline
-        >>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16)
+
+        >>> pipe = StableCascadeCombinedPipeline.from_pretrained(
+        ...     "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
+        ... )
        >>> pipe.enable_model_cpu_offload()
        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
        >>> images = pipe(prompt=prompt)
@@ -68,6 +71,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
     """
 
     _load_connected_pipes = True
+    _optional_components = ["prior_feature_extractor", "prior_image_encoder"]
 
     def __init__(
         self,
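Listing the prior's feature extractor and image encoder as `_optional_components` means the combined pipeline can be assembled without them. A hedged sketch (whether a given checkpoint ships these components varies):

```py
from diffusers import StableCascadeCombinedPipeline

pipe = StableCascadeCombinedPipeline.from_pretrained(
    "stabilityai/stable-cascade",
    prior_feature_extractor=None,  # optional as of 0.28.0
    prior_image_encoder=None,
)
```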
@@ -117,25 +121,25 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
        """
-        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
        """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
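The new `device` argument means offloading is no longer pinned to the default CUDA device addressed by `gpu_id`. A minimal sketch, assuming a second GPU is available:

```py
import torch
from diffusers import StableCascadeCombinedPipeline

pipe = StableCascadeCombinedPipeline.from_pretrained(
    "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
)
pipe.enable_model_cpu_offload(device="cuda:1")  # previously only gpu_id was accepted
```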
@@ -158,13 +162,13 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
         num_inference_steps: int = 12,
         decoder_guidance_scale: float = 0.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -183,17 +187,17 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
-           prompt_embeds (`torch.FloatTensor`, *optional*):
+           prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, text embeddings will be generated from `prompt` input argument.
-           prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, text embeddings will be generated from `prompt` input argument.
-           negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.*
                prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
-           negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.*
                prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
@@ -226,7 +230,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
@@ -242,7 +246,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
            prior_callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the
                list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in
-               the `._callback_tensor_inputs` attribute of your pipeine class.
+               the `._callback_tensor_inputs` attribute of your pipeline class.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -251,7 +255,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-               `._callback_tensor_inputs` attribute of your pipeine class.
+               `._callback_tensor_inputs` attribute of your pipeline class.
 
        Examples:
 
@@ -54,19 +54,19 @@ class StableCascadePriorPipelineOutput(BaseOutput):
     Output class for WuerstchenPriorPipeline.
 
     Args:
-        image_embeddings (`torch.FloatTensor` or `np.ndarray`)
+        image_embeddings (`torch.Tensor` or `np.ndarray`)
            Prior image embeddings for text prompt
-        prompt_embeds (`torch.FloatTensor`):
+        prompt_embeds (`torch.Tensor`):
            Text embeddings for the prompt.
-        negative_prompt_embeds (`torch.FloatTensor`):
+        negative_prompt_embeds (`torch.Tensor`):
            Text embeddings for the negative prompt.
     """
 
-    image_embeddings: Union[torch.FloatTensor, np.ndarray]
-    prompt_embeds: Union[torch.FloatTensor, np.ndarray]
-    prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray]
-    negative_prompt_embeds: Union[torch.FloatTensor, np.ndarray]
-    negative_prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray]
+    image_embeddings: Union[torch.Tensor, np.ndarray]
+    prompt_embeds: Union[torch.Tensor, np.ndarray]
+    prompt_embeds_pooled: Union[torch.Tensor, np.ndarray]
+    negative_prompt_embeds: Union[torch.Tensor, np.ndarray]
+    negative_prompt_embeds_pooled: Union[torch.Tensor, np.ndarray]
 
 
 class StableCascadePriorPipeline(DiffusionPipeline):
@@ -80,7 +80,8 @@ class StableCascadePriorPipeline(DiffusionPipeline):
         prior ([`StableCascadeUNet`]):
            The Stable Cascade prior to approximate the image embedding from the text and/or image embedding.
         text_encoder ([`CLIPTextModelWithProjection`]):
-            Frozen text-encoder ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
+            Frozen text-encoder
+            ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `image_encoder`.
         image_encoder ([`CLIPVisionModelWithProjection`]):
@@ -149,10 +150,10 @@ class StableCascadePriorPipeline(DiffusionPipeline):
         do_classifier_free_guidance,
         prompt=None,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
     ):
         if prompt_embeds is None:
             # get prompt text embeddings
@@ -373,14 +374,14 @@ class StableCascadePriorPipeline(DiffusionPipeline):
         timesteps: List[float] = None,
         guidance_scale: float = 4.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None,
-        image_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_pooled: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
         num_images_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pt",
         return_dict: bool = True,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -408,29 +409,29 @@ class StableCascadePriorPipeline(DiffusionPipeline):
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `decoder_guidance_scale` is less than `1`).
-           prompt_embeds (`torch.FloatTensor`, *optional*):
+           prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
-           prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-           negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
-           negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*):
+           negative_prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-               weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input
-               argument.
-           image_embeds (`torch.FloatTensor`, *optional*):
-               Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting.
-               If not provided, image embeddings will be generated from `image` input argument if existing.
+               weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
+               input argument.
+           image_embeds (`torch.Tensor`, *optional*):
+               Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. If
+               not provided, image embeddings will be generated from `image` input argument if existing.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
-           latents (`torch.FloatTensor`, *optional*):
+           latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
@@ -452,9 +453,9 @@ class StableCascadePriorPipeline(DiffusionPipeline):
        Examples:
 
        Returns:
-           [`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if
-           `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
-           generated image embeddings.
+           [`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if `return_dict` is
+           True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image
+           embeddings.
        """
 
        # 0. Define commonly used variables
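Taken together with the decoder changes above, the prior's output fields feed the decoder directly. A sketch using the checkpoint ids these pipelines are usually published under (dtype handling illustrative):

```py
import torch
from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline

prior = StableCascadePriorPipeline.from_pretrained(
    "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16
)
decoder = StableCascadeDecoderPipeline.from_pretrained(
    "stabilityai/stable-cascade", torch_dtype=torch.float16
)

prompt = "an astronaut riding a horse"
prior_out = prior(prompt=prompt, num_images_per_prompt=2)
# the decoder now derives batch_size from `prompt`, so the two embeddings per
# prompt are decoded without manual reshaping
images = decoder(
    image_embeddings=prior_out.image_embeddings.to(torch.float16),
    prompt=prompt,
).images
```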
@@ -113,7 +113,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     from .pipeline_stable_diffusion import (
         StableDiffusionPipeline,
         StableDiffusionPipelineOutput,
-        StableDiffusionSafetyChecker,
     )
     from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
     from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conversion script for the Stable Diffusion checkpoints."""
+"""Conversion script for the Stable Diffusion checkpoints."""
 
 import re
 from contextlib import nullcontext
@@ -557,7 +557,7 @@ def convert_ldm_unet_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )
 
-           output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+           output_block_list = {k: sorted(v) for k, v in sorted(output_block_list.items())}
            if ["conv.bias", "conv.weight"] in output_block_list.values():
                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
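Sorting the dict items as well as each value list makes the `index(...)` lookup that follows deterministic regardless of checkpoint key order. A small illustration in plain Python:

```py
blocks = {"2": ["conv.weight", "conv.bias"], "0": ["norm.weight"]}
{k: sorted(v) for k, v in sorted(blocks.items())}
# -> {'0': ['norm.weight'], '2': ['conv.bias', 'conv.weight']}
```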
@@ -1153,6 +1153,8 @@ def download_from_original_stable_diffusion_ckpt(
     controlnet: Optional[bool] = None,
     adapter: Optional[bool] = None,
     load_safety_checker: bool = True,
+    safety_checker: Optional[StableDiffusionSafetyChecker] = None,
+    feature_extractor: Optional[AutoFeatureExtractor] = None,
     pipeline_class: DiffusionPipeline = None,
     local_files_only=False,
     vae_path=None,
@@ -1205,6 +1207,12 @@ def download_from_original_stable_diffusion_ckpt(
            If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
        load_safety_checker (`bool`, *optional*, defaults to `True`):
            Whether to load the safety checker or not. Defaults to `True`.
+       safety_checker (`StableDiffusionSafetyChecker`, *optional*, defaults to `None`):
+           Safety checker to use. If this parameter is `None`, the function will load a new instance of
+           [StableDiffusionSafetyChecker] by itself, if needed.
+       feature_extractor (`AutoFeatureExtractor`, *optional*, defaults to `None`):
+           Feature extractor to use. If this parameter is `None`, the function will load a new instance of
+           [AutoFeatureExtractor] by itself, if needed.
        pipeline_class (`str`, *optional*, defaults to `None`):
            The pipeline class to use. Pass `None` to determine automatically.
        local_files_only (`bool`, *optional*, defaults to `False`):
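The new parameters let callers pass in already-loaded components instead of having the function instantiate its own. A hedged sketch (the `.ckpt` path is illustrative):

```py
from transformers import AutoFeatureExtractor
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
    download_from_original_stable_diffusion_ckpt,
)
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker

safety_checker = StableDiffusionSafetyChecker.from_pretrained(
    "CompVis/stable-diffusion-safety-checker"
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "CompVis/stable-diffusion-safety-checker"
)
pipe = download_from_original_stable_diffusion_ckpt(
    "v1-5-pruned-emaonly.ckpt",  # illustrative local checkpoint path
    safety_checker=safety_checker,
    feature_extractor=feature_extractor,
)
```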
@@ -1530,8 +1538,8 @@ def download_from_original_stable_diffusion_ckpt(
            unet=unet,
            scheduler=scheduler,
            controlnet=controlnet,
-           safety_checker=None,
-           feature_extractor=None,
+           safety_checker=safety_checker,
+           feature_extractor=feature_extractor,
        )
        if hasattr(pipe, "requires_safety_checker"):
            pipe.requires_safety_checker = False
@@ -1551,8 +1559,8 @@ def download_from_original_stable_diffusion_ckpt(
            unet=unet,
            scheduler=scheduler,
            low_res_scheduler=low_res_scheduler,
-           safety_checker=None,
-           feature_extractor=None,
+           safety_checker=safety_checker,
+           feature_extractor=feature_extractor,
        )
 
    else:
@@ -1562,8 +1570,8 @@ def download_from_original_stable_diffusion_ckpt(
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
-           safety_checker=None,
-           feature_extractor=None,
+           safety_checker=safety_checker,
+           feature_extractor=feature_extractor,
        )
        if hasattr(pipe, "requires_safety_checker"):
            pipe.requires_safety_checker = False
@@ -1684,9 +1692,6 @@ def download_from_original_stable_diffusion_ckpt(
        feature_extractor = AutoFeatureExtractor.from_pretrained(
            "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only
        )
-   else:
-       safety_checker = None
-       feature_extractor = None
 
    if controlnet:
        pipe = pipeline_class(
@@ -1838,6 +1843,8 @@ def download_controlnet_from_original_ckpt(
    while "state_dict" in checkpoint:
        checkpoint = checkpoint["state_dict"]
 
+   with open(original_config_file, "r") as f:
+       original_config_file = f.read()
    original_config = yaml.safe_load(original_config_file)
 
    if num_in_channels is not None:
@@ -288,7 +288,7 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
-           image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`):
+           image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.Tensor`):
                `Image`, or tensor representing an image batch which will be upscaled. *
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -329,7 +329,7 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
-               called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+               called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.