diffusers 0.27.2__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their respective public registries.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +19 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -18
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +18 -18
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -39
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +20 -26
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +42 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +103 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +23 -23
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +86 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +75 -54
  229. diffusers/schedulers/scheduling_edm_euler.py +50 -31
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +23 -29
  231. diffusers/schedulers/scheduling_euler_discrete.py +160 -68
  232. diffusers/schedulers/scheduling_heun_discrete.py +57 -39
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +19 -19
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +19 -19
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +24 -26
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +111 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/METADATA +47 -47
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/WHEEL +1 -1
  267. diffusers-0.27.2.dist-info/RECORD +0 -399
  268. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  269. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36

@@ -27,6 +27,8 @@ from transformers import (
  T5EncoderModel,
  T5Tokenizer,
  T5TokenizerFast,
+ VitsModel,
+ VitsTokenizer,
  )

  from ...models import AutoencoderKL
@@ -79,6 +81,37 @@ EXAMPLE_DOC_STRING = """
  >>> # save the best audio sample (index 0) as a .wav file
  >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
  ```
+ ```
+ #Using AudioLDM2 for Text To Speech
+ >>> import scipy
+ >>> import torch
+ >>> from diffusers import AudioLDM2Pipeline
+
+ >>> repo_id = "anhnct/audioldm2_gigaspeech"
+ >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+ >>> pipe = pipe.to("cuda")
+
+ >>> # define the prompts
+ >>> prompt = "A female reporter is speaking"
+ >>> transcript = "wish you have a good day"
+
+ >>> # set the seed for generator
+ >>> generator = torch.Generator("cuda").manual_seed(0)
+
+ >>> # run the generation
+ >>> audio = pipe(
+ ... prompt,
+ ... transcription=transcript,
+ ... num_inference_steps=200,
+ ... audio_length_in_s=10.0,
+ ... num_waveforms_per_prompt=2,
+ ... generator=generator,
+ ... max_new_tokens=512, #Must set max_new_tokens equa to 512 for TTS
+ ... ).audios
+
+ >>> # save the best audio sample (index 0) as a .wav file
+ >>> scipy.io.wavfile.write("tts.wav", rate=16000, data=audio[0])
+ ```
  """


@@ -116,20 +149,23 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  specifically the [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant. The
  text branch is used to encode the text prompt to a prompt embedding. The full audio-text model is used to
  rank generated waveforms against the text prompt by computing similarity scores.
- text_encoder_2 ([`~transformers.T5EncoderModel`]):
+ text_encoder_2 ([`~transformers.T5EncoderModel`, `~transformers.VitsModel`]):
  Second frozen text-encoder. AudioLDM2 uses the encoder of
  [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
- [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) variant.
+ [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) variant. Second frozen text-encoder use
+ for TTS. AudioLDM2 uses the encoder of
+ [Vits](https://huggingface.co/docs/transformers/model_doc/vits#transformers.VitsModel).
  projection_model ([`AudioLDM2ProjectionModel`]):
  A trained model used to linearly project the hidden-states from the first and second text encoder models
  and insert learned SOS and EOS token embeddings. The projected hidden-states from the two text encoders are
- concatenated to give the input to the language model.
+ concatenated to give the input to the language model. A Learned Position Embedding for the Vits
+ hidden-states
  language_model ([`~transformers.GPT2Model`]):
  An auto-regressive language model used to generate a sequence of hidden-states conditioned on the projected
  outputs from the two text encoders.
  tokenizer ([`~transformers.RobertaTokenizer`]):
  Tokenizer to tokenize text for the first frozen text-encoder.
- tokenizer_2 ([`~transformers.T5Tokenizer`]):
+ tokenizer_2 ([`~transformers.T5Tokenizer`, `~transformers.VitsTokenizer`]):
  Tokenizer to tokenize text for the second frozen text-encoder.
  feature_extractor ([`~transformers.ClapFeatureExtractor`]):
  Feature extractor to pre-process generated audio waveforms to log-mel spectrograms for automatic scoring.
@@ -146,11 +182,11 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  self,
  vae: AutoencoderKL,
  text_encoder: ClapModel,
- text_encoder_2: T5EncoderModel,
+ text_encoder_2: Union[T5EncoderModel, VitsModel],
  projection_model: AudioLDM2ProjectionModel,
  language_model: GPT2Model,
  tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast],
- tokenizer_2: Union[T5Tokenizer, T5TokenizerFast],
+ tokenizer_2: Union[T5Tokenizer, T5TokenizerFast, VitsTokenizer],
  feature_extractor: ClapFeatureExtractor,
  unet: AudioLDM2UNet2DConditionModel,
  scheduler: KarrasDiffusionSchedulers,
@@ -237,7 +273,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  Generates a sequence of hidden-states from the language model, conditioned on the embedding inputs.

  Parameters:
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  The sequence used as a prompt for the generation.
  max_new_tokens (`int`):
  Number of new tokens to generate.
@@ -246,7 +282,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  function of the model.

  Return:
- `inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ `inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
  The sequence of generated hidden-states.
  """
  max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
@@ -273,11 +309,12 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  device,
  num_waveforms_per_prompt,
  do_classifier_free_guidance,
+ transcription=None,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
- generated_prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ generated_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_generated_prompt_embeds: Optional[torch.Tensor] = None,
  attention_mask: Optional[torch.LongTensor] = None,
  negative_attention_mask: Optional[torch.LongTensor] = None,
  max_new_tokens: Optional[int] = None,
@@ -288,6 +325,8 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  Args:
  prompt (`str` or `List[str]`, *optional*):
  prompt to be encoded
+ transcription (`str` or `List[str]`):
+ transcription of text to speech
  device (`torch.device`):
  torch device
  num_waveforms_per_prompt (`int`):
@@ -298,18 +337,18 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  The prompt or prompts not to guide the audio generation. If not defined, one has to pass
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
  less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-computed text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.*
  prompt weighting. If not provided, text embeddings will be computed from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-computed negative text embeddings from the Flan T5 model. Can be used to easily tweak text inputs,
  *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
  `negative_prompt` input argument.
- generated_prompt_embeds (`torch.FloatTensor`, *optional*):
+ generated_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs,
  *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
  argument.
- negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
  inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
  `negative_prompt` input argument.
@@ -322,11 +361,11 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  max_new_tokens (`int`, *optional*, defaults to None):
  The number of new tokens to generate with the GPT2 language model.
  Returns:
- prompt_embeds (`torch.FloatTensor`):
+ prompt_embeds (`torch.Tensor`):
  Text embeddings from the Flan T5 model.
  attention_mask (`torch.LongTensor`):
  Attention mask to be applied to the `prompt_embeds`.
- generated_prompt_embeds (`torch.FloatTensor`):
+ generated_prompt_embeds (`torch.Tensor`):
  Text embeddings generated from the GPT2 langauge model.

  Example:
@@ -368,16 +407,26 @@ class AudioLDM2Pipeline(DiffusionPipeline):

  # Define tokenizers and text encoders
  tokenizers = [self.tokenizer, self.tokenizer_2]
- text_encoders = [self.text_encoder, self.text_encoder_2]
+ is_vits_text_encoder = isinstance(self.text_encoder_2, VitsModel)
+
+ if is_vits_text_encoder:
+ text_encoders = [self.text_encoder, self.text_encoder_2.text_encoder]
+ else:
+ text_encoders = [self.text_encoder, self.text_encoder_2]

  if prompt_embeds is None:
  prompt_embeds_list = []
  attention_mask_list = []

  for tokenizer, text_encoder in zip(tokenizers, text_encoders):
+ use_prompt = isinstance(
+ tokenizer, (RobertaTokenizer, RobertaTokenizerFast, T5Tokenizer, T5TokenizerFast)
+ )
  text_inputs = tokenizer(
- prompt,
- padding="max_length" if isinstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast)) else True,
+ prompt if use_prompt else transcription,
+ padding="max_length"
+ if isinstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast, VitsTokenizer))
+ else True,
  max_length=tokenizer.model_max_length,
  truncation=True,
  return_tensors="pt",
@@ -407,6 +456,18 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  prompt_embeds = prompt_embeds[:, None, :]
  # make sure that we attend to this single hidden-state
  attention_mask = attention_mask.new_ones((batch_size, 1))
+ elif is_vits_text_encoder:
+ # Add end_token_id and attention mask in the end of sequence phonemes
+ for text_input_id, text_attention_mask in zip(text_input_ids, attention_mask):
+ for idx, phoneme_id in enumerate(text_input_id):
+ if phoneme_id == 0:
+ text_input_id[idx] = 182
+ text_attention_mask[idx] = 1
+ break
+ prompt_embeds = text_encoder(
+ text_input_ids, attention_mask=attention_mask, padding_mask=attention_mask.unsqueeze(-1)
+ )
+ prompt_embeds = prompt_embeds[0]
  else:
  prompt_embeds = text_encoder(
  text_input_ids,
@@ -485,7 +546,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  uncond_tokens,
  padding="max_length",
  max_length=tokenizer.model_max_length
- if isinstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast))
+ if isinstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast, VitsTokenizer))
  else max_length,
  truncation=True,
  return_tensors="pt",
@@ -503,6 +564,15 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  negative_prompt_embeds = negative_prompt_embeds[:, None, :]
  # make sure that we attend to this single hidden-state
  negative_attention_mask = negative_attention_mask.new_ones((batch_size, 1))
+ elif is_vits_text_encoder:
+ negative_prompt_embeds = torch.zeros(
+ batch_size,
+ tokenizer.model_max_length,
+ text_encoder.config.hidden_size,
+ ).to(dtype=self.text_encoder_2.dtype, device=device)
+ negative_attention_mask = torch.zeros(batch_size, tokenizer.model_max_length).to(
+ dtype=self.text_encoder_2.dtype, device=device
+ )
  else:
  negative_prompt_embeds = text_encoder(
  uncond_input_ids,
@@ -623,6 +693,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  audio_length_in_s,
  vocoder_upsample_factor,
  callback_steps,
+ transcription=None,
  negative_prompt=None,
  prompt_embeds=None,
  negative_prompt_embeds=None,
@@ -690,6 +761,14 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  f"`attention_mask: {attention_mask.shape} != `prompt_embeds` {prompt_embeds.shape}"
  )

+ if transcription is None:
+ if self.text_encoder_2.config.model_type == "vits":
+ raise ValueError("Cannot forward without transcription. Please make sure to" " have transcription")
+ elif transcription is not None and (
+ not isinstance(transcription, str) and not isinstance(transcription, list)
+ ):
+ raise ValueError(f"`transcription` has to be of type `str` or `list` but is {type(transcription)}")
+
  if generated_prompt_embeds is not None and negative_generated_prompt_embeds is not None:
  if generated_prompt_embeds.shape != negative_generated_prompt_embeds.shape:
  raise ValueError(
@@ -711,8 +790,8 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  shape = (
  batch_size,
  num_channels_latents,
- height // self.vae_scale_factor,
- self.vocoder.config.model_in_dim // self.vae_scale_factor,
+ int(height) // self.vae_scale_factor,
+ int(self.vocoder.config.model_in_dim) // self.vae_scale_factor,
  )
  if isinstance(generator, list) and len(generator) != batch_size:
  raise ValueError(
@@ -734,6 +813,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  def __call__(
  self,
  prompt: Union[str, List[str]] = None,
+ transcription: Union[str, List[str]] = None,
  audio_length_in_s: Optional[float] = None,
  num_inference_steps: int = 200,
  guidance_scale: float = 3.5,
@@ -741,16 +821,16 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  num_waveforms_per_prompt: Optional[int] = 1,
  eta: float = 0.0,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
- generated_prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ generated_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_generated_prompt_embeds: Optional[torch.Tensor] = None,
  attention_mask: Optional[torch.LongTensor] = None,
  negative_attention_mask: Optional[torch.LongTensor] = None,
  max_new_tokens: Optional[int] = None,
  return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
  callback_steps: Optional[int] = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  output_type: Optional[str] = "np",
@@ -761,6 +841,8 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  Args:
  prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
+ transcription (`str` or `List[str]`, *optional*):\
+ The transcript for text to speech.
  audio_length_in_s (`int`, *optional*, defaults to 10.24):
  The length of the generated audio sample in seconds.
  num_inference_steps (`int`, *optional*, defaults to 200):
@@ -783,21 +865,21 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for spectrogram
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor is generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
  provided, text embeddings are generated from the `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
- generated_prompt_embeds (`torch.FloatTensor`, *optional*):
+ generated_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs,
  *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
  argument.
- negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
  inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
  `negative_prompt` input argument.
@@ -815,7 +897,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  plain tuple.
  callback (`Callable`, *optional*):
  A function that calls every `callback_steps` steps during inference. The function is called with the
- following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
  callback_steps (`int`, *optional*, defaults to 1):
  The frequency at which the `callback` function is called. If not specified, the callback is called at
  every step.
@@ -857,6 +939,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  audio_length_in_s,
  vocoder_upsample_factor,
  callback_steps,
+ transcription,
  negative_prompt,
  prompt_embeds,
  negative_prompt_embeds,
@@ -886,6 +969,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  device,
  num_waveforms_per_prompt,
  do_classifier_free_guidance,
+ transcription,
  negative_prompt,
  prompt_embeds=prompt_embeds,
  negative_prompt_embeds=negative_prompt_embeds,
diffusers/pipelines/auto_pipeline.py +21 -17

@@ -45,7 +45,8 @@ from .kandinsky2_2 import (
  )
  from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
  from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
- from .pixart_alpha import PixArtAlphaPipeline
+ from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
+ from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline
  from .stable_diffusion import (
  StableDiffusionImg2ImgPipeline,
  StableDiffusionInpaintPipeline,
@@ -70,8 +71,10 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
  ("stable-diffusion-controlnet", StableDiffusionControlNetPipeline),
  ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline),
  ("wuerstchen", WuerstchenCombinedPipeline),
+ ("cascade", StableCascadeCombinedPipeline),
  ("lcm", LatentConsistencyModelPipeline),
- ("pixart", PixArtAlphaPipeline),
+ ("pixart-alpha", PixArtAlphaPipeline),
+ ("pixart-sigma", PixArtSigmaPipeline),
  ]
  )

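The `AUTO_TEXT2IMAGE_PIPELINES_MAPPING` additions above mean the auto class can now resolve Stable Cascade and both PixArt variants. A minimal sketch of the new resolution path, assuming a diffusers-format PixArt-Sigma checkpoint (the repo id below is illustrative, not taken from this diff):

```python
import torch
from diffusers import AutoPipelineForText2Image

# With "pixart-sigma" registered in the text-to-image mapping, a PixArt-Sigma
# checkpoint should be routed to PixArtSigmaPipeline automatically.
pipe = AutoPipelineForText2Image.from_pretrained(
    "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",  # illustrative repo id
    torch_dtype=torch.float16,
).to("cuda")

print(type(pipe).__name__)  # expected: PixArtSigmaPipeline
image = pipe("an isometric render of a lighthouse at dusk").images[0]
```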
@@ -106,6 +109,7 @@ _AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict(
  ("kandinsky", KandinskyPipeline),
  ("kandinsky22", KandinskyV22Pipeline),
  ("wuerstchen", WuerstchenDecoderPipeline),
+ ("cascade", StableCascadeDecoderPipeline),
  ]
  )
  _AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict(
@@ -213,7 +217,7 @@ class AutoPipelineForText2Image(ConfigMixin):
  ```

  Parameters:
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
  Can be either:

  - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
@@ -230,9 +234,9 @@ class AutoPipelineForText2Image(ConfigMixin):
  cache_dir (`Union[str, os.PathLike]`, *optional*):
  Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
  is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
  proxies (`Dict[str, str]`, *optional*):
  A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
  'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -307,7 +311,7 @@ class AutoPipelineForText2Image(ConfigMixin):
  """
  cache_dir = kwargs.pop("cache_dir", None)
  force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
  proxies = kwargs.pop("proxies", None)
  token = kwargs.pop("token", None)
  local_files_only = kwargs.pop("local_files_only", False)
@@ -486,7 +490,7 @@ class AutoPipelineForImage2Image(ConfigMixin):
  ```

  Parameters:
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
  Can be either:

  - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
@@ -503,9 +507,9 @@ class AutoPipelineForImage2Image(ConfigMixin):
  cache_dir (`Union[str, os.PathLike]`, *optional*):
  Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
  is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
  proxies (`Dict[str, str]`, *optional*):
  A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
  'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -580,7 +584,7 @@ class AutoPipelineForImage2Image(ConfigMixin):
  """
  cache_dir = kwargs.pop("cache_dir", None)
  force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
  proxies = kwargs.pop("proxies", None)
  token = kwargs.pop("token", None)
  local_files_only = kwargs.pop("local_files_only", False)
@@ -762,7 +766,7 @@ class AutoPipelineForInpainting(ConfigMixin):
  ```

  Parameters:
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
  Can be either:

  - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
@@ -779,9 +783,9 @@ class AutoPipelineForInpainting(ConfigMixin):
  cache_dir (`Union[str, os.PathLike]`, *optional*):
  Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
  is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
  proxies (`Dict[str, str]`, *optional*):
  A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
  'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -856,7 +860,7 @@ class AutoPipelineForInpainting(ConfigMixin):
  """
  cache_dir = kwargs.pop("cache_dir", None)
  force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
  proxies = kwargs.pop("proxies", None)
  token = kwargs.pop("token", None)
  local_files_only = kwargs.pop("local_files_only", False)
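All three auto-pipeline loaders now pop `resume_download` with a default of `None` and document it as deprecated: interrupted downloads are resumed by default, and the flag is slated for removal in Diffusers v1. A minimal sketch of the updated calling convention (the checkpoint name is illustrative only):

```python
from diffusers import AutoPipelineForInpainting

# `resume_download` no longer needs to be passed; partially downloaded files are
# picked up automatically. Passing it still works for now but is deprecated.
pipe = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-inpainting"  # illustrative repo id
)
```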
diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1

@@ -298,7 +298,7 @@ class BlipImageProcessor(BaseImageProcessor):
  return encoded_outputs

  # Follows diffusers.VaeImageProcessor.postprocess
- def postprocess(self, sample: torch.FloatTensor, output_type: str = "pil"):
+ def postprocess(self, sample: torch.Tensor, output_type: str = "pil"):
  if output_type not in ["pt", "np", "pil"]:
  raise ValueError(
  f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']"
diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5

@@ -117,7 +117,7 @@ class Blip2VisionEmbeddings(nn.Module):

  self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

- def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
  batch_size = pixel_values.shape[0]
  target_dtype = self.patch_embedding.weight.dtype
  patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
@@ -376,7 +376,7 @@ class Blip2VisionModel(Blip2PreTrainedModel):
  @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig)
  def forward(
  self,
- pixel_values: Optional[torch.FloatTensor] = None,
+ pixel_values: Optional[torch.Tensor] = None,
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
@@ -524,15 +524,15 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
  return_dict=None,
  ):
  r"""
- encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
+ encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
  Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
  the model is configured as a decoder.
- encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`):
+ encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, `optional`):
  Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
  the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
  - 1 for tokens that are **not masked**,
  - 0 for tokens that are **masked**.
- past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of:
+ past_key_values (`tuple(tuple(torch.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of:
  shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
  value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
  used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1

@@ -186,7 +186,7 @@ class ContextCLIPTextEmbeddings(nn.Module):
  ctx_begin_pos: list,
  input_ids: Optional[torch.LongTensor] = None,
  position_ids: Optional[torch.LongTensor] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
  ) -> torch.Tensor:
  if ctx_embeddings is None:
  ctx_len = 0
diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2

@@ -191,7 +191,7 @@ class BlipDiffusionPipeline(DiffusionPipeline):
  reference_image: PIL.Image.Image,
  source_subject_category: List[str],
  target_subject_category: List[str],
- latents: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
  guidance_scale: float = 7.5,
  height: int = 512,
  width: int = 512,
@@ -215,7 +215,7 @@ class BlipDiffusionPipeline(DiffusionPipeline):
  The source subject category.
  target_subject_category (`List[str]`):
  The target subject category.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor will ge generated by random sampling.
diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5

@@ -105,7 +105,7 @@ class ConsistencyModelPipeline(DiffusionPipeline):
  return latents

  # Follows diffusers.VaeImageProcessor.postprocess
- def postprocess_image(self, sample: torch.FloatTensor, output_type: str = "pil"):
+ def postprocess_image(self, sample: torch.Tensor, output_type: str = "pil"):
  if output_type not in ["pt", "np", "pil"]:
  raise ValueError(
  f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']"
@@ -173,10 +173,10 @@ class ConsistencyModelPipeline(DiffusionPipeline):
  num_inference_steps: int = 1,
  timesteps: List[int] = None,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
  callback_steps: int = 1,
  ):
  r"""
@@ -195,7 +195,7 @@ class ConsistencyModelPipeline(DiffusionPipeline):
  generator (`torch.Generator`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor is generated by sampling using the supplied random `generator`.
@@ -205,7 +205,7 @@ class ConsistencyModelPipeline(DiffusionPipeline):
  Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
  callback (`Callable`, *optional*):
  A function that calls every `callback_steps` steps during inference. The function is called with the
- following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
  callback_steps (`int`, *optional*, defaults to 1):
  The frequency at which the `callback` function is called. If not specified, the callback is called at
  every step.
diffusers/pipelines/controlnet/multicontrolnet.py +4 -8

@@ -31,7 +31,7 @@ class MultiControlNetModel(ModelMixin):

  def forward(
  self,
- sample: torch.FloatTensor,
+ sample: torch.Tensor,
  timestep: Union[torch.Tensor, float, int],
  encoder_hidden_states: torch.Tensor,
  controlnet_cond: List[torch.tensor],
@@ -100,20 +100,16 @@ class MultiControlNetModel(ModelMixin):
  variant (`str`, *optional*):
  If specified, weights are saved in the format pytorch_model.<variant>.bin.
  """
- idx = 0
- model_path_to_save = save_directory
- for controlnet in self.nets:
+ for idx, controlnet in enumerate(self.nets):
+ suffix = "" if idx == 0 else f"_{idx}"
  controlnet.save_pretrained(
- model_path_to_save,
+ save_directory + suffix,
  is_main_process=is_main_process,
  save_function=save_function,
  safe_serialization=safe_serialization,
  variant=variant,
  )

- idx += 1
- model_path_to_save = model_path_to_save + f"_{idx}"
-
  @classmethod
  def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs):
  r"""
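The `save_pretrained` rewrite above swaps the manually incremented path for an enumerated suffix, so sub-models land in `save_directory`, `save_directory_1`, `save_directory_2`, and so on. A minimal sketch of just that naming logic, with stand-in strings instead of real `ControlNetModel` instances:

```python
# Mirrors the directory-naming loop from the refactored
# MultiControlNetModel.save_pretrained; `nets` is a stand-in list.
save_directory = "multi-controlnet"
nets = ["controlnet_a", "controlnet_b", "controlnet_c"]

for idx, controlnet in enumerate(nets):
    suffix = "" if idx == 0 else f"_{idx}"
    print(f"{controlnet} -> {save_directory + suffix}")

# controlnet_a -> multi-controlnet
# controlnet_b -> multi-controlnet_1
# controlnet_c -> multi-controlnet_2
```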