diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +20 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +23 -25
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +46 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
  229. diffusers/schedulers/scheduling_edm_euler.py +53 -30
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
  231. diffusers/schedulers/scheduling_euler_discrete.py +163 -67
  232. diffusers/schedulers/scheduling_heun_discrete.py +60 -38
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +27 -25
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. diffusers-0.27.1.dist-info/RECORD +0 -399
  267. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  268. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
  269. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py

@@ -197,7 +197,7 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
             and not isinstance(image, list)
         ):
             raise ValueError(
-                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                 f" {type(image)}"
             )
 
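Note: the `torch.FloatTensor` → `torch.Tensor` substitution above repeats across most hunks in this diff. `torch.FloatTensor` only describes float32 CPU tensors, while `torch.Tensor` is the general tensor type, so the new annotations also cover the fp16/bf16 and CUDA tensors these pipelines actually run with. A minimal illustration (not part of the diff):

```python
import torch

latents = torch.randn(1, 4, 64, 64, dtype=torch.float16)
print(isinstance(latents, torch.Tensor))       # True
print(isinstance(latents, torch.FloatTensor))  # False: wrong dtype for the legacy class
```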
@@ -214,7 +214,12 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
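Note: the new `int(...)` casts make `prepare_latents` tolerant of float `height`/`width` values (e.g. the result of a division upstream), which would otherwise put floats into the shape tuple and fail at tensor creation. A small sketch of the failure mode, with hypothetical values:

```python
import torch

vae_scale_factor = 8
height, width = 512.0, 512.0  # floats, e.g. from a prior computation

# Old shape math: 512.0 // 8 == 64.0, and torch.randn() rejects float sizes.
# torch.randn(1, 4, height // vae_scale_factor, width // vae_scale_factor)  # TypeError

# New shape math: casting first keeps every dimension an int.
shape = (1, 4, int(height) // vae_scale_factor, int(width) // vae_scale_factor)
print(torch.randn(shape).shape)  # torch.Size([1, 4, 64, 64])
```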
@@ -242,10 +247,10 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         **kwargs,
     ):
@@ -276,7 +281,7 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
@@ -287,7 +292,7 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.

diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py

@@ -300,7 +300,12 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -328,10 +333,10 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         **kwargs,
     ):
@@ -362,7 +367,7 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
@@ -373,7 +378,7 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.

diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py

@@ -169,10 +169,10 @@ class VQDiffusionPipeline(DiffusionPipeline):
         truncation_rate: float = 1.0,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
     ) -> Union[ImagePipelineOutput, Tuple]:
         """
@@ -196,7 +196,7 @@ class VQDiffusionPipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor` of shape (batch), *optional*):
+            latents (`torch.Tensor` of shape (batch), *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Must be valid embedding indices.If not provided, a latents tensor will be generated of
                 completely masked latent pixels.
@@ -206,7 +206,7 @@ class VQDiffusionPipeline(DiffusionPipeline):
                 Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -301,7 +301,7 @@ class VQDiffusionPipeline(DiffusionPipeline):
 
         return ImagePipelineOutput(images=image)
 
-    def truncate(self, log_p_x_0: torch.FloatTensor, truncation_rate: float) -> torch.FloatTensor:
+    def truncate(self, log_p_x_0: torch.Tensor, truncation_rate: float) -> torch.Tensor:
         """
         Truncates `log_p_x_0` such that for each column vector, the total cumulative probability is `truncation_rate`
         The lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to

diffusers/pipelines/dit/pipeline_dit.py

@@ -227,6 +227,9 @@ class DiTPipeline(DiffusionPipeline):
         if output_type == "pil":
             samples = self.numpy_to_pil(samples)
 
+        # Offload all models
+        self.maybe_free_model_hooks()
+
         if not return_dict:
             return (samples,)
 
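Note: `maybe_free_model_hooks()` is the standard `DiffusionPipeline` hook that re-offloads components to the CPU after a generation when `enable_model_cpu_offload()` is active (and is a no-op otherwise); this hunk brings `DiTPipeline` in line with the other pipelines. A sketch of the usage it supports, assuming a CUDA machine:

```python
import torch
from diffusers import DiTPipeline

pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()  # components move to the GPU only while in use

class_ids = pipe.get_label_ids(["white shark"])
# With the added call at the end of __call__, the transformer and VAE are
# returned to the CPU after generation instead of staying resident on the GPU.
image = pipe(class_labels=class_ids, num_inference_steps=25).images[0]
```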
diffusers/pipelines/free_init_utils.py

@@ -41,20 +41,20 @@ class FreeInitMixin:
             num_iters (`int`, *optional*, defaults to `3`):
                 Number of FreeInit noise re-initialization iterations.
             use_fast_sampling (`bool`, *optional*, defaults to `False`):
-                Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables
-                the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
+                Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables the
+                "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
             method (`str`, *optional*, defaults to `butterworth`):
-                Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the
-                FreeInit low pass filter.
+                Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the FreeInit low
+                pass filter.
             order (`int`, *optional*, defaults to `4`):
                 Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
                 whereas lower values lead to `gaussian` method behaviour.
             spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
-                Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in
-                the original implementation.
+                Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in the
+                original implementation.
             temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
-                Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in
-                the original implementation.
+                Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in the
+                original implementation.
         """
         self._free_init_num_iters = num_iters
         self._free_init_use_fast_sampling = use_fast_sampling
@@ -146,39 +146,40 @@ class FreeInitMixin:
     ):
         if free_init_iteration == 0:
             self._free_init_initial_noise = latents.detach().clone()
-            return latents, self.scheduler.timesteps
-
-        latent_shape = latents.shape
-
-        free_init_filter_shape = (1, *latent_shape[1:])
-        free_init_freq_filter = self._get_free_init_freq_filter(
-            shape=free_init_filter_shape,
-            device=device,
-            filter_type=self._free_init_method,
-            order=self._free_init_order,
-            spatial_stop_frequency=self._free_init_spatial_stop_frequency,
-            temporal_stop_frequency=self._free_init_temporal_stop_frequency,
-        )
-
-        current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1
-        diffuse_timesteps = torch.full((latent_shape[0],), current_diffuse_timestep).long()
-
-        z_t = self.scheduler.add_noise(
-            original_samples=latents, noise=self._free_init_initial_noise, timesteps=diffuse_timesteps.to(device)
-        ).to(dtype=torch.float32)
-
-        z_rand = randn_tensor(
-            shape=latent_shape,
-            generator=generator,
-            device=device,
-            dtype=torch.float32,
-        )
-        latents = self._apply_freq_filter(z_t, z_rand, low_pass_filter=free_init_freq_filter)
-        latents = latents.to(dtype)
+        else:
+            latent_shape = latents.shape
+
+            free_init_filter_shape = (1, *latent_shape[1:])
+            free_init_freq_filter = self._get_free_init_freq_filter(
+                shape=free_init_filter_shape,
+                device=device,
+                filter_type=self._free_init_method,
+                order=self._free_init_order,
+                spatial_stop_frequency=self._free_init_spatial_stop_frequency,
+                temporal_stop_frequency=self._free_init_temporal_stop_frequency,
+            )
+
+            current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1
+            diffuse_timesteps = torch.full((latent_shape[0],), current_diffuse_timestep).long()
+
+            z_t = self.scheduler.add_noise(
+                original_samples=latents, noise=self._free_init_initial_noise, timesteps=diffuse_timesteps.to(device)
+            ).to(dtype=torch.float32)
+
+            z_rand = randn_tensor(
+                shape=latent_shape,
+                generator=generator,
+                device=device,
+                dtype=torch.float32,
+            )
+            latents = self._apply_freq_filter(z_t, z_rand, low_pass_filter=free_init_freq_filter)
+            latents = latents.to(dtype)
 
         # Coarse-to-Fine Sampling for faster inference (can lead to lower quality)
         if self._free_init_use_fast_sampling:
-            num_inference_steps = int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1))
+            num_inference_steps = max(
+                1, int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1))
+            )
             self.scheduler.set_timesteps(num_inference_steps, device=device)
 
         return latents, self.scheduler.timesteps
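Note: the first FreeInit change is a behaviour-preserving `if`/`else` restructuring, but the `max(1, ...)` guard is a real fix: with coarse-to-fine sampling enabled, early iterations could previously round down to zero inference steps. Worked through for `num_inference_steps=2`, `num_iters=3`:

```python
num_inference_steps, num_iters = 2, 3
for it in range(num_iters):
    old = int(num_inference_steps / num_iters * (it + 1))
    print(it, old, max(1, old))
# 0 0 1   <- the old code asked the scheduler for 0 timesteps here
# 1 1 1
# 2 2 2
```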
diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py

@@ -31,6 +31,7 @@ from ...utils import (
     replace_example_docstring,
 )
 from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 
 
@@ -43,10 +44,14 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import I2VGenXLPipeline
         >>> from diffusers.utils import export_to_gif, load_image
 
-        >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
+        >>> pipeline = I2VGenXLPipeline.from_pretrained(
+        ...     "ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16"
+        ... )
         >>> pipeline.enable_model_cpu_offload()
 
-        >>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+        >>> image_url = (
+        ...     "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
+        ... )
         >>> image = load_image(image_url).convert("RGB")
 
         >>> prompt = "Papers were floating in the air on a table in the library"
@@ -59,43 +64,22 @@ EXAMPLE_DOC_STRING = """
         ...     num_inference_steps=50,
         ...     negative_prompt=negative_prompt,
         ...     guidance_scale=9.0,
-        ...     generator=generator
+        ...     generator=generator,
         ... ).frames[0]
         >>> video_path = export_to_gif(frames, "i2v.gif")
         ```
 """
 
 
-# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
-def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
-    batch_size, channels, num_frames, height, width = video.shape
-    outputs = []
-    for batch_idx in range(batch_size):
-        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
-        batch_output = processor.postprocess(batch_vid, output_type)
-
-        outputs.append(batch_output)
-
-    if output_type == "np":
-        outputs = np.stack(outputs)
-
-    elif output_type == "pt":
-        outputs = torch.stack(outputs)
-
-    elif not output_type == "pil":
-        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
-
-    return outputs
-
-
 @dataclass
 class I2VGenXLPipelineOutput(BaseOutput):
     r"""
     Output class for image-to-video pipeline.
 
-    Args:
+     Args:
         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised
         PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
         `(batch_size, num_frames, channels, height, width)`
     """
@@ -151,7 +135,7 @@ class I2VGenXLPipeline(
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         # `do_resize=False` as we do custom resizing.
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False)
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False)
 
     @property
     def guidance_scale(self):
@@ -170,8 +154,8 @@ class I2VGenXLPipeline(
         device,
         num_videos_per_prompt,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         clip_skip: Optional[int] = None,
     ):
         r"""
@@ -190,10 +174,10 @@ class I2VGenXLPipeline(
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -337,8 +321,8 @@ class I2VGenXLPipeline(
         dtype = next(self.image_encoder.parameters()).dtype
 
         if not isinstance(image, torch.Tensor):
-            image = self.image_processor.pil_to_numpy(image)
-            image = self.image_processor.numpy_to_pt(image)
+            image = self.video_processor.pil_to_numpy(image)
+            image = self.video_processor.numpy_to_pt(image)
 
         # Normalize the image with CLIP training stats.
         image = self.feature_extractor(
@@ -450,7 +434,7 @@ class I2VGenXLPipeline(
             and not isinstance(image, list)
         ):
             raise ValueError(
-                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                 f" {type(image)}"
             )
 
@@ -529,9 +513,9 @@ class I2VGenXLPipeline(
         num_videos_per_prompt: Optional[int] = 1,
         decode_chunk_size: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -543,7 +527,7 @@ class I2VGenXLPipeline(
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
                 Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
                 [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
             height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
@@ -551,7 +535,8 @@ class I2VGenXLPipeline(
             width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The width in pixels of the generated image.
             target_fps (`int`, *optional*):
-                Frames per second. The rate at which the generated images shall be exported to a video after generation. This is also used as a "micro-condition" while generation.
+                Frames per second. The rate at which the generated images shall be exported to a video after
+                generation. This is also used as a "micro-condition" while generation.
             num_frames (`int`, *optional*):
                 The number of video frames to generate.
             num_inference_steps (`int`, *optional*):
@@ -568,20 +553,20 @@ class I2VGenXLPipeline(
             num_videos_per_prompt (`int`, *optional*):
                 The number of images to generate per prompt.
             decode_chunk_size (`int`, *optional*):
-                The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency
-                between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once
-                for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
+                The number of frames to decode at a time. The higher the chunk size, the higher the temporal
+                consistency between frames, but also the higher the memory consumption. By default, the decoder will
+                decode all frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
@@ -651,7 +636,7 @@ class I2VGenXLPipeline(
 
         # 3.2.2 Image latents.
         resized_image = _center_crop_wide(image, (width, height))
-        image = self.image_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype)
+        image = self.video_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype)
         image_latents = self.prepare_image_latents(
             image,
             device=device,
@@ -731,7 +716,7 @@ class I2VGenXLPipeline(
             video = latents
         else:
            video_tensor = self.decode_latents(latents, decode_chunk_size=decode_chunk_size)
-            video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
 
         # 9. Offload all models
         self.maybe_free_model_hooks()
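Note: the I2VGen-XL hunks above replace the module-level `tensor2vid` helper with the new `VideoProcessor` class (see `diffusers/video_processor.py`, +113 lines, in the file list). A sketch of the replacement API as the pipeline uses it; the tensor layout `(batch, channels, num_frames, height, width)` matches what `tensor2vid` previously expected:

```python
import torch
from diffusers.video_processor import VideoProcessor

video_processor = VideoProcessor(vae_scale_factor=8, do_resize=False)

# Decoded video tensor in (batch, channels, num_frames, height, width) layout.
video_tensor = torch.rand(1, 3, 16, 256, 256)

# postprocess_video subsumes tensor2vid's per-batch permute-and-postprocess
# loop; output_type may be "np", "pt", or "pil" as before.
frames = video_processor.postprocess_video(video=video_tensor, output_type="pil")
print(len(frames), len(frames[0]))  # 1 batch entry, 16 PIL frames
```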
diffusers/pipelines/kandinsky/pipeline_kandinsky.py

@@ -233,8 +233,8 @@ class KandinskyPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         height: int = 512,
         width: int = 512,
@@ -242,9 +242,9 @@ class KandinskyPipeline(DiffusionPipeline):
         guidance_scale: float = 4.0,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -254,9 +254,9 @@ class KandinskyPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
@@ -279,7 +279,7 @@ class KandinskyPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -288,7 +288,7 @@ class KandinskyPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
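Note: with the annotation change, a user-supplied callback for these pipelines is typed against `torch.Tensor`. A minimal conforming callback, for illustration:

```python
import torch

def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Matches Callable[[int, int, torch.Tensor], None]; invoked every
    # `callback_steps` denoising steps with the current latents.
    print(f"step={step} t={timestep} latents={tuple(latents.shape)}")

# e.g. pipe(prompt, image_embeds=..., negative_image_embeds=...,
#           callback=log_latents, callback_steps=1)
```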