diffusers 0.27.2-py3-none-any.whl → 0.28.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (278)
  1. diffusers/__init__.py +26 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +33 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +8 -0
  21. diffusers/models/activations.py +23 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +475 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +363 -32
  35. diffusers/models/model_loading_utils.py +177 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_outputs.py +14 -0
  39. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  40. diffusers/models/modeling_utils.py +175 -99
  41. diffusers/models/normalization.py +2 -1
  42. diffusers/models/resnet.py +18 -23
  43. diffusers/models/transformer_temporal.py +3 -3
  44. diffusers/models/transformers/__init__.py +3 -0
  45. diffusers/models/transformers/dit_transformer_2d.py +240 -0
  46. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  47. diffusers/models/transformers/hunyuan_transformer_2d.py +427 -0
  48. diffusers/models/transformers/pixart_transformer_2d.py +336 -0
  49. diffusers/models/transformers/prior_transformer.py +7 -7
  50. diffusers/models/transformers/t5_film_transformer.py +17 -19
  51. diffusers/models/transformers/transformer_2d.py +292 -184
  52. diffusers/models/transformers/transformer_temporal.py +10 -10
  53. diffusers/models/unets/unet_1d.py +5 -5
  54. diffusers/models/unets/unet_1d_blocks.py +29 -29
  55. diffusers/models/unets/unet_2d.py +6 -6
  56. diffusers/models/unets/unet_2d_blocks.py +137 -128
  57. diffusers/models/unets/unet_2d_condition.py +19 -15
  58. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  59. diffusers/models/unets/unet_3d_blocks.py +79 -77
  60. diffusers/models/unets/unet_3d_condition.py +13 -9
  61. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  62. diffusers/models/unets/unet_kandinsky3.py +1 -1
  63. diffusers/models/unets/unet_motion_model.py +114 -14
  64. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  65. diffusers/models/unets/unet_stable_cascade.py +16 -13
  66. diffusers/models/upsampling.py +17 -20
  67. diffusers/models/vq_model.py +16 -15
  68. diffusers/pipelines/__init__.py +27 -3
  69. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  70. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  71. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  72. diffusers/pipelines/animatediff/__init__.py +2 -0
  73. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  74. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  75. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  76. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  77. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  78. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  79. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  80. diffusers/pipelines/auto_pipeline.py +21 -17
  81. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  82. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  83. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  84. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  85. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  86. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  87. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  88. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  89. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  90. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  91. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  92. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  93. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  94. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  95. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  96. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  97. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  98. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  99. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  100. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  101. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  102. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  103. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  104. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  105. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  106. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  107. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  108. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  109. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -18
  110. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  111. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  112. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  113. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  114. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  115. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  116. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  117. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  118. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  119. diffusers/pipelines/dit/pipeline_dit.py +7 -4
  120. diffusers/pipelines/free_init_utils.py +39 -38
  121. diffusers/pipelines/hunyuandit/__init__.py +48 -0
  122. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +881 -0
  123. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  124. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  125. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  126. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  127. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  128. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  129. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  130. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  131. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  132. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  133. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  134. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  135. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  136. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  137. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  138. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  139. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  140. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  141. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  142. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  143. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  144. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  145. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  146. diffusers/pipelines/marigold/__init__.py +50 -0
  147. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  148. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  149. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  150. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  151. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  152. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  153. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  154. diffusers/pipelines/pipeline_loading_utils.py +269 -23
  155. diffusers/pipelines/pipeline_utils.py +266 -37
  156. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  157. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +69 -79
  158. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  159. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  160. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  161. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  162. diffusers/pipelines/shap_e/renderer.py +1 -1
  163. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +18 -18
  164. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  165. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  166. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  167. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  168. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  169. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  172. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  173. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  174. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  175. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  176. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  177. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  178. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  179. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  180. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  181. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  182. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -39
  183. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  184. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  185. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  186. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  187. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  188. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  189. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  190. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  191. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  192. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  193. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  194. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  195. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  196. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  197. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  198. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  199. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  200. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  201. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  202. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  203. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  204. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  205. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  206. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  207. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  208. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  209. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  210. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  211. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  212. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  213. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  214. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  215. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  216. diffusers/schedulers/__init__.py +2 -2
  217. diffusers/schedulers/deprecated/__init__.py +1 -1
  218. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  219. diffusers/schedulers/scheduling_amused.py +5 -5
  220. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  221. diffusers/schedulers/scheduling_consistency_models.py +20 -26
  222. diffusers/schedulers/scheduling_ddim.py +22 -24
  223. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  224. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  225. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  226. diffusers/schedulers/scheduling_ddpm.py +20 -22
  227. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  228. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  229. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  230. diffusers/schedulers/scheduling_deis_multistep.py +42 -42
  231. diffusers/schedulers/scheduling_dpmsolver_multistep.py +103 -77
  232. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  233. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  234. diffusers/schedulers/scheduling_dpmsolver_sde.py +23 -23
  235. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +86 -65
  236. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +75 -54
  237. diffusers/schedulers/scheduling_edm_euler.py +50 -31
  238. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +23 -29
  239. diffusers/schedulers/scheduling_euler_discrete.py +160 -68
  240. diffusers/schedulers/scheduling_heun_discrete.py +57 -39
  241. diffusers/schedulers/scheduling_ipndm.py +8 -8
  242. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +19 -19
  243. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +19 -19
  244. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  245. diffusers/schedulers/scheduling_lcm.py +21 -23
  246. diffusers/schedulers/scheduling_lms_discrete.py +24 -26
  247. diffusers/schedulers/scheduling_pndm.py +20 -20
  248. diffusers/schedulers/scheduling_repaint.py +20 -20
  249. diffusers/schedulers/scheduling_sasolver.py +55 -54
  250. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  251. diffusers/schedulers/scheduling_tcd.py +39 -30
  252. diffusers/schedulers/scheduling_unclip.py +15 -15
  253. diffusers/schedulers/scheduling_unipc_multistep.py +111 -41
  254. diffusers/schedulers/scheduling_utils.py +14 -5
  255. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  256. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  257. diffusers/training_utils.py +56 -1
  258. diffusers/utils/__init__.py +7 -0
  259. diffusers/utils/doc_utils.py +1 -0
  260. diffusers/utils/dummy_pt_objects.py +75 -0
  261. diffusers/utils/dummy_torch_and_transformers_objects.py +105 -0
  262. diffusers/utils/dynamic_modules_utils.py +24 -11
  263. diffusers/utils/hub_utils.py +3 -2
  264. diffusers/utils/import_utils.py +91 -0
  265. diffusers/utils/loading_utils.py +2 -2
  266. diffusers/utils/logging.py +1 -1
  267. diffusers/utils/peft_utils.py +32 -5
  268. diffusers/utils/state_dict_utils.py +11 -2
  269. diffusers/utils/testing_utils.py +71 -6
  270. diffusers/utils/torch_utils.py +1 -0
  271. diffusers/video_processor.py +113 -0
  272. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/METADATA +7 -7
  273. diffusers-0.28.1.dist-info/RECORD +419 -0
  274. diffusers-0.27.2.dist-info/RECORD +0 -399
  275. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/LICENSE +0 -0
  276. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/WHEEL +0 -0
  277. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/entry_points.txt +0 -0
  278. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/top_level.txt +0 -0
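Two themes dominate the hunks excerpted below (the viewer omits per-file headers, but the `@@` context lines identify the classes): the I2VGen-XL pipeline migrates its frame post-processing to the new `VideoProcessor` introduced in `diffusers/video_processor.py` (entry 271 above), and the Kandinsky pipelines get a mechanical sweep of `torch.FloatTensor` → `torch.Tensor` in signatures and docstrings. To inspect the remaining 270-odd files, the whole comparison can be regenerated locally; here is a minimal standard-library sketch (illustrative only, not how this page was generated — it assumes both wheels sit in the working directory):

```python
# Minimal sketch: diff the Python sources of two wheels with the stdlib.
# Assumes both wheel files have been downloaded to the working directory,
# e.g. via `pip download diffusers==0.27.2 --no-deps`.
import difflib
import zipfile

def python_sources(wheel_path):
    """Map every .py member of the wheel to its decoded lines."""
    with zipfile.ZipFile(wheel_path) as wheel:
        return {
            name: wheel.read(name).decode("utf-8", errors="replace").splitlines()
            for name in wheel.namelist()
            if name.endswith(".py")
        }

old = python_sources("diffusers-0.27.2-py3-none-any.whl")
new = python_sources("diffusers-0.28.1-py3-none-any.whl")

for name in sorted(set(old) | set(new)):
    for line in difflib.unified_diff(
        old.get(name, []), new.get(name, []), fromfile=name, tofile=name, lineterm=""
    ):
        print(line)
```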
@@ -31,6 +31,7 @@ from ...utils import (
      replace_example_docstring,
  )
  from ...utils.torch_utils import randn_tensor
+ from ...video_processor import VideoProcessor
  from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin


@@ -43,10 +44,14 @@ EXAMPLE_DOC_STRING = """
          >>> from diffusers import I2VGenXLPipeline
          >>> from diffusers.utils import export_to_gif, load_image

-         >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
+         >>> pipeline = I2VGenXLPipeline.from_pretrained(
+         ...     "ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16"
+         ... )
          >>> pipeline.enable_model_cpu_offload()

-         >>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+         >>> image_url = (
+         ...     "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+         ... )
          >>> image = load_image(image_url).convert("RGB")

          >>> prompt = "Papers were floating in the air on a table in the library"
@@ -59,43 +64,22 @@ EXAMPLE_DOC_STRING = """
          ...     num_inference_steps=50,
          ...     negative_prompt=negative_prompt,
          ...     guidance_scale=9.0,
-         ...     generator=generator
+         ...     generator=generator,
          ... ).frames[0]
          >>> video_path = export_to_gif(frames, "i2v.gif")
          ```
  """


- # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
- def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
-     batch_size, channels, num_frames, height, width = video.shape
-     outputs = []
-     for batch_idx in range(batch_size):
-         batch_vid = video[batch_idx].permute(1, 0, 2, 3)
-         batch_output = processor.postprocess(batch_vid, output_type)
-
-         outputs.append(batch_output)
-
-     if output_type == "np":
-         outputs = np.stack(outputs)
-
-     elif output_type == "pt":
-         outputs = torch.stack(outputs)
-
-     elif not output_type == "pil":
-         raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
-
-     return outputs
-
-
  @dataclass
  class I2VGenXLPipelineOutput(BaseOutput):
      r"""
      Output class for image-to-video pipeline.

-      Args:
+     Args:
          frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-             List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised
+             List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+             denoised
      PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
      `(batch_size, num_frames, channels, height, width)`
      """
@@ -151,7 +135,7 @@ class I2VGenXLPipeline(
          )
          self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
          # `do_resize=False` as we do custom resizing.
-         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False)
+         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False)

      @property
      def guidance_scale(self):
@@ -170,8 +154,8 @@ class I2VGenXLPipeline(
          device,
          num_videos_per_prompt,
          negative_prompt=None,
-         prompt_embeds: Optional[torch.FloatTensor] = None,
-         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+         prompt_embeds: Optional[torch.Tensor] = None,
+         negative_prompt_embeds: Optional[torch.Tensor] = None,
          clip_skip: Optional[int] = None,
      ):
          r"""
@@ -190,10 +174,10 @@ class I2VGenXLPipeline(
                  The prompt or prompts not to guide the image generation. If not defined, one has to pass
                  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                  less than `1`).
-             prompt_embeds (`torch.FloatTensor`, *optional*):
+             prompt_embeds (`torch.Tensor`, *optional*):
                  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                  provided, text embeddings will be generated from `prompt` input argument.
-             negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+             negative_prompt_embeds (`torch.Tensor`, *optional*):
                  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                  argument.
@@ -337,8 +321,8 @@ class I2VGenXLPipeline(
          dtype = next(self.image_encoder.parameters()).dtype

          if not isinstance(image, torch.Tensor):
-             image = self.image_processor.pil_to_numpy(image)
-             image = self.image_processor.numpy_to_pt(image)
+             image = self.video_processor.pil_to_numpy(image)
+             image = self.video_processor.numpy_to_pt(image)

          # Normalize the image with CLIP training stats.
          image = self.feature_extractor(
@@ -450,7 +434,7 @@ class I2VGenXLPipeline(
              and not isinstance(image, list)
          ):
              raise ValueError(
-                 "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                 "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                  f" {type(image)}"
              )

@@ -529,9 +513,9 @@ class I2VGenXLPipeline(
          num_videos_per_prompt: Optional[int] = 1,
          decode_chunk_size: Optional[int] = 1,
          generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-         latents: Optional[torch.FloatTensor] = None,
-         prompt_embeds: Optional[torch.FloatTensor] = None,
-         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+         latents: Optional[torch.Tensor] = None,
+         prompt_embeds: Optional[torch.Tensor] = None,
+         negative_prompt_embeds: Optional[torch.Tensor] = None,
          output_type: Optional[str] = "pil",
          return_dict: bool = True,
          cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -543,7 +527,7 @@ class I2VGenXLPipeline(
          Args:
              prompt (`str` or `List[str]`, *optional*):
                  The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-             image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+             image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
                  Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
                  [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
              height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
@@ -551,7 +535,8 @@ class I2VGenXLPipeline(
              width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                  The width in pixels of the generated image.
              target_fps (`int`, *optional*):
-                 Frames per second. The rate at which the generated images shall be exported to a video after generation. This is also used as a "micro-condition" while generation.
+                 Frames per second. The rate at which the generated images shall be exported to a video after
+                 generation. This is also used as a "micro-condition" while generation.
              num_frames (`int`, *optional*):
                  The number of video frames to generate.
              num_inference_steps (`int`, *optional*):
@@ -568,20 +553,20 @@ class I2VGenXLPipeline(
              num_videos_per_prompt (`int`, *optional*):
                  The number of images to generate per prompt.
              decode_chunk_size (`int`, *optional*):
-                 The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency
-                 between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once
-                 for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
+                 The number of frames to decode at a time. The higher the chunk size, the higher the temporal
+                 consistency between frames, but also the higher the memory consumption. By default, the decoder will
+                 decode all frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
              generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                  generation deterministic.
-             latents (`torch.FloatTensor`, *optional*):
+             latents (`torch.Tensor`, *optional*):
                  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                  tensor is generated by sampling using the supplied random `generator`.
-             prompt_embeds (`torch.FloatTensor`, *optional*):
+             prompt_embeds (`torch.Tensor`, *optional*):
                  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                  provided, text embeddings are generated from the `prompt` input argument.
-             negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+             negative_prompt_embeds (`torch.Tensor`, *optional*):
                  Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
              output_type (`str`, *optional*, defaults to `"pil"`):
@@ -651,7 +636,7 @@ class I2VGenXLPipeline(

          # 3.2.2 Image latents.
          resized_image = _center_crop_wide(image, (width, height))
-         image = self.image_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype)
+         image = self.video_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype)
          image_latents = self.prepare_image_latents(
              image,
              device=device,
@@ -731,7 +716,7 @@ class I2VGenXLPipeline(
              video = latents
          else:
              video_tensor = self.decode_latents(latents, decode_chunk_size=decode_chunk_size)
-             video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+             video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

          # 9. Offload all models
          self.maybe_free_model_hooks()
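The hunks above retire the module-level `tensor2vid` helper in favor of the `VideoProcessor` added in this release; `postprocess_video` absorbs the per-batch permute/postprocess/stack logic that `tensor2vid` spelled out. A rough usage sketch, assuming the 0.28 API exactly as the hunks use it (the random tensor stands in for VAE-decoded frames):

```python
# Rough sketch of the replacement path shown above. VideoProcessor takes the
# same constructor kwargs the pipeline passes (vae_scale_factor, do_resize),
# and postprocess_video expects (batch, channels, frames, height, width).
import torch
from diffusers.video_processor import VideoProcessor

processor = VideoProcessor(vae_scale_factor=8, do_resize=False)
decoded = torch.rand(1, 3, 16, 256, 256)  # stand-in for decode_latents() output

frames = processor.postprocess_video(video=decoded, output_type="pil")
print(len(frames), len(frames[0]))  # 1 batch entry containing 16 PIL frames
```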
@@ -233,8 +233,8 @@ class KandinskyPipeline(DiffusionPipeline):
      def __call__(
          self,
          prompt: Union[str, List[str]],
-         image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-         negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
+         image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+         negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
          negative_prompt: Optional[Union[str, List[str]]] = None,
          height: int = 512,
          width: int = 512,
@@ -242,9 +242,9 @@ class KandinskyPipeline(DiffusionPipeline):
          guidance_scale: float = 4.0,
          num_images_per_prompt: int = 1,
          generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-         latents: Optional[torch.FloatTensor] = None,
+         latents: Optional[torch.Tensor] = None,
          output_type: Optional[str] = "pil",
-         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+         callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
          callback_steps: int = 1,
          return_dict: bool = True,
      ):
@@ -254,9 +254,9 @@ class KandinskyPipeline(DiffusionPipeline):
          Args:
              prompt (`str` or `List[str]`):
                  The prompt or prompts to guide the image generation.
-             image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+             image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                  The clip image embeddings for text prompt, that will be used to condition the image generation.
-             negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+             negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                  The clip image embeddings for negative text prompt, will be used to condition the image generation.
              negative_prompt (`str` or `List[str]`, *optional*):
                  The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
@@ -279,7 +279,7 @@ class KandinskyPipeline(DiffusionPipeline):
              generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                  to make generation deterministic.
-             latents (`torch.FloatTensor`, *optional*):
+             latents (`torch.Tensor`, *optional*):
                  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                  tensor will ge generated by sampling using the supplied random `generator`.
@@ -288,7 +288,7 @@ class KandinskyPipeline(DiffusionPipeline):
                  (`np.array`) or `"pt"` (`torch.Tensor`).
              callback (`Callable`, *optional*):
                  A function that calls every `callback_steps` steps during inference. The function is called with the
-                 following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                 following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
              callback_steps (`int`, *optional*, defaults to 1):
                  The frequency at which the `callback` function is called. If not specified, the callback is called at
                  every step.
@@ -129,7 +129,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
          movq ([`VQModel`]):
              MoVQ Decoder to generate the image from the latents.
          prior_prior ([`PriorTransformer`]):
-             The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+             The canonical unCLIP prior to approximate the image embedding from the text embedding.
          prior_image_encoder ([`CLIPVisionModelWithProjection`]):
              Frozen image-encoder.
          prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -143,6 +143,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):

      _load_connected_pipes = True
      model_cpu_offload_seq = "text_encoder->unet->movq->prior_prior->prior_image_encoder->prior_text_encoder"
+     _exclude_from_cpu_offload = ["prior_prior"]

      def __init__(
          self,
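Alongside the docstring fix, this hunk adds `_exclude_from_cpu_offload = ["prior_prior"]` (the img2img and inpaint combined pipelines below get the same line), so `enable_model_cpu_offload()` leaves the prior transformer out of the offload hook sequence instead of cycling it between CPU and accelerator. A hedged usage sketch; the repo id is illustrative:

```python
# Hedged sketch: nothing changes in user code, but after 0.28.1 the module
# named in _exclude_from_cpu_offload ("prior_prior") stays resident during
# offloaded inference. The repo id below is illustrative.
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()  # follows model_cpu_offload_seq, minus exclusions
image = pipe(prompt="a watercolor fox in a birch forest").images[0]
```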
@@ -225,9 +226,9 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
          prior_guidance_scale: float = 4.0,
          prior_num_inference_steps: int = 25,
          generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-         latents: Optional[torch.FloatTensor] = None,
+         latents: Optional[torch.Tensor] = None,
          output_type: Optional[str] = "pil",
-         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+         callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
          callback_steps: int = 1,
          return_dict: bool = True,
      ):
@@ -267,7 +268,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
              generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                  to make generation deterministic.
-             latents (`torch.FloatTensor`, *optional*):
+             latents (`torch.Tensor`, *optional*):
                  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                  tensor will ge generated by sampling using the supplied random `generator`.
@@ -276,7 +277,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
                  (`np.array`) or `"pt"` (`torch.Tensor`).
              callback (`Callable`, *optional*):
                  A function that calls every `callback_steps` steps during inference. The function is called with the
-                 following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                 following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
              callback_steps (`int`, *optional*, defaults to 1):
                  The frequency at which the `callback` function is called. If not specified, the callback is called at
                  every step.
@@ -346,7 +347,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
          movq ([`VQModel`]):
              MoVQ Decoder to generate the image from the latents.
          prior_prior ([`PriorTransformer`]):
-             The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+             The canonical unCLIP prior to approximate the image embedding from the text embedding.
          prior_image_encoder ([`CLIPVisionModelWithProjection`]):
              Frozen image-encoder.
          prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -360,6 +361,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):

      _load_connected_pipes = True
      model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->" "text_encoder->unet->movq"
+     _exclude_from_cpu_offload = ["prior_prior"]

      def __init__(
          self,
@@ -434,7 +436,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
      def __call__(
          self,
          prompt: Union[str, List[str]],
-         image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
+         image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
          negative_prompt: Optional[Union[str, List[str]]] = None,
          num_inference_steps: int = 100,
          guidance_scale: float = 4.0,
@@ -445,9 +447,9 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
          prior_guidance_scale: float = 4.0,
          prior_num_inference_steps: int = 25,
          generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-         latents: Optional[torch.FloatTensor] = None,
+         latents: Optional[torch.Tensor] = None,
          output_type: Optional[str] = "pil",
-         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+         callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
          callback_steps: int = 1,
          return_dict: bool = True,
      ):
@@ -457,7 +459,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
          Args:
              prompt (`str` or `List[str]`):
                  The prompt or prompts to guide the image generation.
-             image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+             image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                  `Image`, or tensor representing an image batch, that will be used as the starting point for the
                  process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                  again.
@@ -497,7 +499,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
              generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                  to make generation deterministic.
-             latents (`torch.FloatTensor`, *optional*):
+             latents (`torch.Tensor`, *optional*):
                  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                  tensor will ge generated by sampling using the supplied random `generator`.
@@ -506,7 +508,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
                  (`np.array`) or `"pt"` (`torch.Tensor`).
              callback (`Callable`, *optional*):
                  A function that calls every `callback_steps` steps during inference. The function is called with the
-                 following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                 following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
              callback_steps (`int`, *optional*, defaults to 1):
                  The frequency at which the `callback` function is called. If not specified, the callback is called at
                  every step.
@@ -586,7 +588,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
          movq ([`VQModel`]):
              MoVQ Decoder to generate the image from the latents.
          prior_prior ([`PriorTransformer`]):
-             The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+             The canonical unCLIP prior to approximate the image embedding from the text embedding.
          prior_image_encoder ([`CLIPVisionModelWithProjection`]):
              Frozen image-encoder.
          prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -600,6 +602,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):

      _load_connected_pipes = True
      model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->text_encoder->unet->movq"
+     _exclude_from_cpu_offload = ["prior_prior"]

      def __init__(
          self,
@@ -674,8 +677,8 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
      def __call__(
          self,
          prompt: Union[str, List[str]],
-         image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
-         mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
+         image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+         mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
          negative_prompt: Optional[Union[str, List[str]]] = None,
          num_inference_steps: int = 100,
          guidance_scale: float = 4.0,
@@ -685,9 +688,9 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
          prior_guidance_scale: float = 4.0,
          prior_num_inference_steps: int = 25,
          generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-         latents: Optional[torch.FloatTensor] = None,
+         latents: Optional[torch.Tensor] = None,
          output_type: Optional[str] = "pil",
-         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+         callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
          callback_steps: int = 1,
          return_dict: bool = True,
      ):
@@ -697,7 +700,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
          Args:
              prompt (`str` or `List[str]`):
                  The prompt or prompts to guide the image generation.
-             image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+             image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                  `Image`, or tensor representing an image batch, that will be used as the starting point for the
                  process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                  again.
@@ -736,7 +739,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
              generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                  to make generation deterministic.
-             latents (`torch.FloatTensor`, *optional*):
+             latents (`torch.Tensor`, *optional*):
                  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                  tensor will ge generated by sampling using the supplied random `generator`.
@@ -745,7 +748,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
                  (`np.array`) or `"pt"` (`torch.Tensor`).
              callback (`Callable`, *optional*):
                  A function that calls every `callback_steps` steps during inference. The function is called with the
-                 following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                 following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
              callback_steps (`int`, *optional*, defaults to 1):
                  The frequency at which the `callback` function is called. If not specified, the callback is called at
                  every step.
@@ -266,10 +266,10 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
      # add_noise method to overwrite the one in schedule because it use a different beta schedule for adding noise vs sampling
      def add_noise(
          self,
-         original_samples: torch.FloatTensor,
-         noise: torch.FloatTensor,
+         original_samples: torch.Tensor,
+         noise: torch.Tensor,
          timesteps: torch.IntTensor,
-     ) -> torch.FloatTensor:
+     ) -> torch.Tensor:
          betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32)
          alphas = 1.0 - betas
          alphas_cumprod = torch.cumprod(alphas, dim=0)
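The hunk above only loosens the annotations; for orientation, the override it touches is the standard DDPM forward-noising step computed against the hand-built linear beta schedule shown. The hunk truncates after `alphas_cumprod`, so the continuation below is an inferred sketch, not the verbatim method body:

```python
# Inferred sketch of the rest of add_noise: the classic DDPM q-sample,
# x_t = sqrt(abar_t) * x_0 + sqrt(1 - abar_t) * eps, using the linear
# schedule built in the hunk above. Not the verbatim diffusers body.
import torch

betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

def add_noise(original_samples, noise, timesteps):
    sqrt_abar = alphas_cumprod[timesteps] ** 0.5
    sqrt_one_minus_abar = (1.0 - alphas_cumprod[timesteps]) ** 0.5
    # Broadcast the per-timestep scalars over the sample dimensions.
    while sqrt_abar.dim() < original_samples.dim():
        sqrt_abar = sqrt_abar.unsqueeze(-1)
        sqrt_one_minus_abar = sqrt_one_minus_abar.unsqueeze(-1)
    return sqrt_abar * original_samples + sqrt_one_minus_abar * noise

x0 = torch.randn(2, 4, 64, 64)
noisy = add_noise(x0, torch.randn_like(x0), torch.tensor([10, 500]))
```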
@@ -295,9 +295,9 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
      def __call__(
          self,
          prompt: Union[str, List[str]],
-         image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
-         image_embeds: torch.FloatTensor,
-         negative_image_embeds: torch.FloatTensor,
+         image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+         image_embeds: torch.Tensor,
+         negative_image_embeds: torch.Tensor,
          negative_prompt: Optional[Union[str, List[str]]] = None,
          height: int = 512,
          width: int = 512,
@@ -307,7 +307,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
          num_images_per_prompt: int = 1,
          generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
          output_type: Optional[str] = "pil",
-         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+         callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
          callback_steps: int = 1,
          return_dict: bool = True,
      ):
@@ -317,12 +317,12 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
          Args:
              prompt (`str` or `List[str]`):
                  The prompt or prompts to guide the image generation.
-             image (`torch.FloatTensor`, `PIL.Image.Image`):
+             image (`torch.Tensor`, `PIL.Image.Image`):
                  `Image`, or tensor representing an image batch, that will be used as the starting point for the
                  process.
-             image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+             image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                  The clip image embeddings for text prompt, that will be used to condition the image generation.
-             negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+             negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                  The clip image embeddings for negative text prompt, will be used to condition the image generation.
              negative_prompt (`str` or `List[str]`, *optional*):
                  The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
@@ -356,7 +356,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
                  (`np.array`) or `"pt"` (`torch.Tensor`).
              callback (`Callable`, *optional*):
                  A function that calls every `callback_steps` steps during inference. The function is called with the
-                 following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                 following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
              callback_steps (`int`, *optional*, defaults to 1):
                  The frequency at which the `callback` function is called. If not specified, the callback is called at
                  every step.
@@ -398,10 +398,10 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
      def __call__(
          self,
          prompt: Union[str, List[str]],
-         image: Union[torch.FloatTensor, PIL.Image.Image],
-         mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
-         image_embeds: torch.FloatTensor,
-         negative_image_embeds: torch.FloatTensor,
+         image: Union[torch.Tensor, PIL.Image.Image],
+         mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray],
+         image_embeds: torch.Tensor,
+         negative_image_embeds: torch.Tensor,
          negative_prompt: Optional[Union[str, List[str]]] = None,
          height: int = 512,
          width: int = 512,
@@ -409,9 +409,9 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
          guidance_scale: float = 4.0,
          num_images_per_prompt: int = 1,
          generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-         latents: Optional[torch.FloatTensor] = None,
+         latents: Optional[torch.Tensor] = None,
          output_type: Optional[str] = "pil",
-         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+         callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
          callback_steps: int = 1,
          return_dict: bool = True,
      ):
@@ -421,10 +421,10 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
          Args:
              prompt (`str` or `List[str]`):
                  The prompt or prompts to guide the image generation.
-             image (`torch.FloatTensor`, `PIL.Image.Image` or `np.ndarray`):
+             image (`torch.Tensor`, `PIL.Image.Image` or `np.ndarray`):
                  `Image`, or tensor representing an image batch, that will be used as the starting point for the
                  process.
-             mask_image (`PIL.Image.Image`,`torch.FloatTensor` or `np.ndarray`):
+             mask_image (`PIL.Image.Image`,`torch.Tensor` or `np.ndarray`):
                  `Image`, or a tensor representing an image batch, to mask `image`. White pixels in the mask will be
                  repainted, while black pixels will be preserved. You can pass a pytorch tensor as mask only if the
                  image you passed is a pytorch tensor, and it should contain one color channel (L) instead of 3, so the
@@ -432,9 +432,9 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
                  image or numpy array, mask should also be a either PIL image or numpy array. If it is a PIL image, it
                  will be converted to a single channel (luminance) before use. If it is a nummpy array, the expected
                  shape is `(H, W)`.
-             image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+             image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                  The clip image embeddings for text prompt, that will be used to condition the image generation.
-             negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+             negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                  The clip image embeddings for negative text prompt, will be used to condition the image generation.
              negative_prompt (`str` or `List[str]`, *optional*):
                  The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
@@ -457,7 +457,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
              generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                  to make generation deterministic.
-             latents (`torch.FloatTensor`, *optional*):
+             latents (`torch.Tensor`, *optional*):
                  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                  tensor will ge generated by sampling using the supplied random `generator`.
@@ -466,7 +466,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
                  (`np.array`) or `"pt"` (`torch.Tensor`).
              callback (`Callable`, *optional*):
                  A function that calls every `callback_steps` steps during inference. The function is called with the
-                 following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                 following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
              callback_steps (`int`, *optional*, defaults to 1):
                  The frequency at which the `callback` function is called. If not specified, the callback is called at
                  every step.
@@ -115,14 +115,14 @@ class KandinskyPriorPipelineOutput(BaseOutput):
      Output class for KandinskyPriorPipeline.

      Args:
-         image_embeds (`torch.FloatTensor`)
+         image_embeds (`torch.Tensor`)
              clip image embeddings for text prompt
          negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`)
              clip image embeddings for unconditional tokens
      """

-     image_embeds: Union[torch.FloatTensor, np.ndarray]
-     negative_image_embeds: Union[torch.FloatTensor, np.ndarray]
+     image_embeds: Union[torch.Tensor, np.ndarray]
+     negative_image_embeds: Union[torch.Tensor, np.ndarray]


  class KandinskyPriorPipeline(DiffusionPipeline):
@@ -134,7 +134,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):

      Args:
          prior ([`PriorTransformer`]):
-             The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+             The canonical unCLIP prior to approximate the image embedding from the text embedding.
          image_encoder ([`CLIPVisionModelWithProjection`]):
              Frozen image-encoder.
          text_encoder ([`CLIPTextModelWithProjection`]):
@@ -173,12 +173,12 @@ class KandinskyPriorPipeline(DiffusionPipeline):
      @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
      def interpolate(
          self,
-         images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]],
+         images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]],
          weights: List[float],
          num_images_per_prompt: int = 1,
          num_inference_steps: int = 25,
          generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-         latents: Optional[torch.FloatTensor] = None,
+         latents: Optional[torch.Tensor] = None,
          negative_prior_prompt: Optional[str] = None,
          negative_prompt: str = "",
          guidance_scale: float = 4.0,
@@ -188,7 +188,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
          Function invoked when using the prior pipeline for interpolation.

          Args:
-             images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`):
+             images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`):
                  list of prompts and images to guide the image generation.
              weights: (`List[float]`):
                  list of weights for each condition in `images_and_prompts`
@@ -200,7 +200,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
              generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                  to make generation deterministic.
-             latents (`torch.FloatTensor`, *optional*):
+             latents (`torch.Tensor`, *optional*):
                  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                  tensor will ge generated by sampling using the supplied random `generator`.
@@ -403,7 +403,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
          num_images_per_prompt: int = 1,
          num_inference_steps: int = 25,
          generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-         latents: Optional[torch.FloatTensor] = None,
+         latents: Optional[torch.Tensor] = None,
          guidance_scale: float = 4.0,
          output_type: Optional[str] = "pt",
          return_dict: bool = True,
@@ -425,7 +425,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
              generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                  to make generation deterministic.
-             latents (`torch.FloatTensor`, *optional*):
+             latents (`torch.Tensor`, *optional*):
                  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                  tensor will ge generated by sampling using the supplied random `generator`.
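Taken together, the Kandinsky hunks are one mechanical change: every public annotation moves from `torch.FloatTensor` to `torch.Tensor`. A standalone illustration of why the old hint was too narrow (this is not diffusers code):

```python
# Standalone illustration (not diffusers code): torch.FloatTensor only matches
# float32 CPU tensors, so the old annotations were wrong for the fp16 (and
# CUDA) tensors these pipelines routinely receive.
import torch

x_fp32 = torch.zeros(2)                       # float32 on CPU
x_fp16 = torch.zeros(2, dtype=torch.float16)  # what fp16 pipelines pass around

print(isinstance(x_fp32, torch.FloatTensor))  # True
print(isinstance(x_fp16, torch.FloatTensor))  # False -> old hint too narrow
print(isinstance(x_fp16, torch.Tensor))       # True  -> new hint is accurate
```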