diffusers 0.27.2__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +19 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -18
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +18 -18
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -39
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +20 -26
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +42 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +103 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +23 -23
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +86 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +75 -54
  229. diffusers/schedulers/scheduling_edm_euler.py +50 -31
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +23 -29
  231. diffusers/schedulers/scheduling_euler_discrete.py +160 -68
  232. diffusers/schedulers/scheduling_heun_discrete.py +57 -39
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +19 -19
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +19 -19
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +24 -26
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +111 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/METADATA +47 -47
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/WHEEL +1 -1
  267. diffusers-0.27.2.dist-info/RECORD +0 -399
  268. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  269. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
@@ -140,6 +140,7 @@ def retrieve_timesteps(
     num_inference_steps: Optional[int] = None,
     device: Optional[Union[str, torch.device]] = None,
     timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
     """
@@ -150,19 +151,23 @@ def retrieve_timesteps(
         scheduler (`SchedulerMixin`):
             The scheduler to get timesteps from.
         num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used,
-            `timesteps` must be `None`.
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
         device (`str` or `torch.device`, *optional*):
             The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
         timesteps (`List[int]`, *optional*):
-            Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
-            timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
-            must be `None`.
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.

     Returns:
         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
         second element is the number of inference steps.
     """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
     if timesteps is not None:
         accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
         if not accepts_timesteps:
@@ -173,6 +178,16 @@ def retrieve_timesteps(
         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
         timesteps = scheduler.timesteps
         num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
     else:
         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
         timesteps = scheduler.timesteps
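Note on the hunks above: `retrieve_timesteps` now dispatches on exactly one of `num_inference_steps`, `timesteps`, or `sigmas`, and raises if both `timesteps` and `sigmas` are given. Below is a minimal sketch of driving the helper with a custom sigma schedule; it assumes a scheduler whose `set_timesteps` accepts `sigmas` in this release (the `EulerDiscreteScheduler` choice and the sigma values are illustrative assumptions, not taken from this diff).

    from diffusers import EulerDiscreteScheduler
    from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_xl_adapter import retrieve_timesteps

    scheduler = EulerDiscreteScheduler.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"
    )
    # A short, strictly decreasing noise schedule ending at 0.0 (placeholder values).
    custom_sigmas = [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0]
    timesteps, num_inference_steps = retrieve_timesteps(scheduler, device="cpu", sigmas=custom_sigmas)
    print(num_inference_steps, timesteps)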
@@ -281,10 +296,10 @@ class StableDiffusionXLAdapterPipeline(
         do_classifier_free_guidance: bool = True,
         negative_prompt: Optional[str] = None,
         negative_prompt_2: Optional[str] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -310,17 +325,17 @@ class StableDiffusionXLAdapterPipeline(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -700,7 +715,12 @@ class StableDiffusionXLAdapterPipeline(

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
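The `int(...)` casts introduced above make `prepare_latents` tolerant of non-integer `height`/`width` values (for example floats produced by an aspect-ratio computation), which would otherwise leak floats into the latent shape and break tensor creation. A standalone illustration of the failure mode, independent of the pipeline:

    import torch

    height, width, vae_scale_factor = 768.0, 1024.0, 8  # floats, e.g. from upstream resizing math
    bad_shape = (1, 4, height // vae_scale_factor, width // vae_scale_factor)             # contains 96.0, 128.0
    good_shape = (1, 4, int(height) // vae_scale_factor, int(width) // vae_scale_factor)  # contains 96, 128

    latents = torch.randn(good_shape)  # ok
    # torch.randn(bad_shape)           # TypeError: size must be a sequence of ints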
@@ -784,20 +804,22 @@ class StableDiffusionXLAdapterPipeline(
         return height, width

     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
         """
         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

         Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
             embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.

         Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
         """
         assert len(w.shape) == 1
         w = w * 1000.0
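The hunk above only tightens annotations and the docstring; the helper still produces a sinusoidal embedding of the guidance scale. A hedged standalone sketch of an equivalent embedding (written for illustration, not copied from the library code):

    import torch

    def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32) -> torch.Tensor:
        # w: 1-D tensor of guidance scales, scaled up before embedding as in the diff (`w = w * 1000.0`)
        assert w.ndim == 1
        w = w * 1000.0
        half_dim = embedding_dim // 2
        freqs = torch.exp(-torch.log(torch.tensor(10000.0)) * torch.arange(half_dim, dtype=dtype) / (half_dim - 1))
        args = w.to(dtype)[:, None] * freqs[None, :]
        emb = torch.cat([torch.sin(args), torch.cos(args)], dim=1)
        if embedding_dim % 2 == 1:
            emb = torch.nn.functional.pad(emb, (0, 1))  # zero-pad odd embedding dimensions
        return emb  # shape (len(w), embedding_dim), matching the documented return

    print(guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=8))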
@@ -834,6 +856,7 @@ class StableDiffusionXLAdapterPipeline(
         width: Optional[int] = None,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
+        sigmas: List[float] = None,
         denoising_end: Optional[float] = None,
         guidance_scale: float = 5.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -841,16 +864,16 @@ class StableDiffusionXLAdapterPipeline(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         guidance_rescale: float = 0.0,
@@ -874,9 +897,9 @@ class StableDiffusionXLAdapterPipeline(
             prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                 used in both text-encoders
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
                 The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the
-                type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be
+                type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be
                 accepted as an image. The control image is automatically resized to fit the output image.
             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image. Anything below 512 pixels won't work well for
@@ -893,6 +916,10 @@ class StableDiffusionXLAdapterPipeline(
                 Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             denoising_end (`float`, *optional*):
                 When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
                 completed before it is intentionally prematurely terminated. As a result, the returned sample will
@@ -921,30 +948,30 @@ class StableDiffusionXLAdapterPipeline(
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-                if `do_classifier_free_guidance` is set to `True`.
-                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -953,7 +980,7 @@ class StableDiffusionXLAdapterPipeline(
                 instead of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
@@ -1094,7 +1121,9 @@ class StableDiffusionXLAdapterPipeline(
         )

         # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )

         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
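With `sigmas` now threaded from `__call__` into `retrieve_timesteps`, a custom schedule can be passed directly at inference time. A hedged end-to-end sketch follows; the model ids, the blank control image, and the sigma values are placeholders chosen for illustration rather than values taken from this diff:

    import torch
    from PIL import Image
    from diffusers import EulerDiscreteScheduler, StableDiffusionXLAdapterPipeline, T2IAdapter

    adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16)
    pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", adapter=adapter, torch_dtype=torch.float16
    ).to("cuda")
    pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)

    control_image = Image.new("RGB", (1024, 1024))  # stand-in for a real Canny condition image
    image = pipe(
        prompt="a photo of a castle on a cliff",
        image=control_image,
        sigmas=[14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0],
        guidance_scale=7.5,
    ).images[0]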
@@ -15,9 +15,10 @@ class TextToVideoSDPipelineOutput(BaseOutput):
     """
     Output class for text-to-video pipelines.

-    Args:
+    Args:
         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised
             PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
             `(batch_size, num_frames, channels, height, width)`
     """
@@ -15,11 +15,9 @@
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Union

-import numpy as np
 import torch
 from transformers import CLIPTextModel, CLIPTokenizer

-from ...image_processor import VaeImageProcessor
 from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet3DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -33,6 +31,7 @@ from ...utils import (
     unscale_lora_layers,
 )
 from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from . import TextToVideoSDPipelineOutput

@@ -59,28 +58,6 @@ EXAMPLE_DOC_STRING = """
 """


-# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
-def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
-    batch_size, channels, num_frames, height, width = video.shape
-    outputs = []
-    for batch_idx in range(batch_size):
-        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
-        batch_output = processor.postprocess(batch_vid, output_type)
-
-        outputs.append(batch_output)
-
-    if output_type == "np":
-        outputs = np.stack(outputs)
-
-    elif output_type == "pt":
-        outputs = torch.stack(outputs)
-
-    elif not output_type == "pil":
-        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
-
-    return outputs
-
-
 class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
     r"""
     Pipeline for text-to-video generation.
@@ -127,7 +104,7 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
             scheduler=scheduler,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
@@ -137,8 +114,8 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         **kwargs,
     ):
@@ -170,8 +147,8 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -191,10 +168,10 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -465,12 +442,12 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
         negative_prompt: Optional[Union[str, List[str]]] = None,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "np",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: Optional[int] = None,
@@ -505,25 +482,25 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                 `(batch_size, num_channel, num_frames, height, width)`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             output_type (`str`, *optional*, defaults to `"np"`):
-                The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`.
+                The output format of the generated video. Choose between `torch.Tensor` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                 of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -652,7 +629,7 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve
             video = latents
         else:
             video_tensor = self.decode_latents(latents)
-            video = tensor2vid(video_tensor, self.image_processor, output_type)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

         # 9. Offload all models
         self.maybe_free_model_hooks()
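The removed module-level `tensor2vid` helper is superseded by the shared `VideoProcessor` introduced in this release (see the new `diffusers/video_processor.py` in the file list). A hedged sketch of the equivalent post-processing step on its own, assuming the `postprocess_video` signature used in the diff:

    import torch
    from diffusers.video_processor import VideoProcessor

    video_processor = VideoProcessor(do_resize=False, vae_scale_factor=8)
    # Stand-in for a decoded video tensor: (batch, channels, num_frames, height, width), values in [-1, 1].
    video_tensor = torch.rand(1, 3, 16, 64, 64) * 2.0 - 1.0
    frames = video_processor.postprocess_video(video=video_tensor, output_type="np")
    print(type(frames), getattr(frames, "shape", None))  # numpy frames stacked per batch item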
@@ -16,11 +16,9 @@ import inspect
 from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
-import PIL.Image
 import torch
 from transformers import CLIPTextModel, CLIPTokenizer

-from ...image_processor import VaeImageProcessor
 from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet3DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -34,6 +32,7 @@ from ...utils import (
     unscale_lora_layers,
 )
 from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from . import TextToVideoSDPipelineOutput

@@ -94,69 +93,6 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")


-# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
-def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
-    batch_size, channels, num_frames, height, width = video.shape
-    outputs = []
-    for batch_idx in range(batch_size):
-        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
-        batch_output = processor.postprocess(batch_vid, output_type)
-
-        outputs.append(batch_output)
-
-    if output_type == "np":
-        outputs = np.stack(outputs)
-
-    elif output_type == "pt":
-        outputs = torch.stack(outputs)
-
-    elif not output_type == "pil":
-        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
-
-    return outputs
-
-
-def preprocess_video(video):
-    supported_formats = (np.ndarray, torch.Tensor, PIL.Image.Image)
-
-    if isinstance(video, supported_formats):
-        video = [video]
-    elif not (isinstance(video, list) and all(isinstance(i, supported_formats) for i in video)):
-        raise ValueError(
-            f"Input is in incorrect format: {[type(i) for i in video]}. Currently, we only support {', '.join(supported_formats)}"
-        )
-
-    if isinstance(video[0], PIL.Image.Image):
-        video = [np.array(frame) for frame in video]
-
-    if isinstance(video[0], np.ndarray):
-        video = np.concatenate(video, axis=0) if video[0].ndim == 5 else np.stack(video, axis=0)
-
-        if video.dtype == np.uint8:
-            video = np.array(video).astype(np.float32) / 255.0
-
-        if video.ndim == 4:
-            video = video[None, ...]
-
-        video = torch.from_numpy(video.transpose(0, 4, 1, 2, 3))
-
-    elif isinstance(video[0], torch.Tensor):
-        video = torch.cat(video, axis=0) if video[0].ndim == 5 else torch.stack(video, axis=0)
-
-        # don't need any preprocess if the video is latents
-        channel = video.shape[1]
-        if channel == 4:
-            return video
-
-        # move channels before num_frames
-        video = video.permute(0, 2, 1, 3, 4)
-
-        # normalize video
-        video = 2.0 * video - 1.0
-
-    return video
-
-
 class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
     r"""
     Pipeline for text-guided video-to-video generation.
@@ -203,7 +139,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
             scheduler=scheduler,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
@@ -213,8 +149,8 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         **kwargs,
     ):
@@ -246,8 +182,8 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -267,10 +203,10 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -563,19 +499,19 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,
-        video: Union[List[np.ndarray], torch.FloatTensor] = None,
+        video: Union[List[np.ndarray], torch.Tensor] = None,
         strength: float = 0.6,
         num_inference_steps: int = 50,
         guidance_scale: float = 15.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "np",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: Optional[int] = None,
@@ -586,7 +522,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            video (`List[np.ndarray]` or `torch.FloatTensor`):
+            video (`List[np.ndarray]` or `torch.Tensor`):
                 `video` frames or tensor representing a video batch to be used as the starting point for the process.
                 Can also accept video latents as `image`, if passing latents directly, it will not be encoded again.
             strength (`float`, *optional*, defaults to 0.8):
@@ -610,25 +546,25 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                 `(batch_size, num_channel, num_frames, height, width)`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             output_type (`str`, *optional*, defaults to `"np"`):
-                The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`.
+                The output format of the generated video. Choose between `torch.Tensor` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                 of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -687,7 +623,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

         # 4. Preprocess video
-        video = preprocess_video(video)
+        video = self.video_processor.preprocess_video(video)

         # 5. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
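The input side moves to the same object: the deleted module-level `preprocess_video` is replaced by `self.video_processor.preprocess_video(...)`. A hedged standalone sketch, assuming `preprocess_video` still accepts a list of PIL frames the way the removed helper did:

    import numpy as np
    from PIL import Image
    from diffusers.video_processor import VideoProcessor

    video_processor = VideoProcessor(do_resize=False, vae_scale_factor=8)
    frames = [Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8)) for _ in range(8)]
    video = video_processor.preprocess_video(frames)
    print(video.shape)  # expected: (batch, channels, num_frames, height, width), float values in [-1, 1]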
@@ -749,7 +685,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv
             video = latents
         else:
             video_tensor = self.decode_latents(latents)
-            video = tensor2vid(video_tensor, self.image_processor, output_type)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

         # 10. Offload all models
         self.maybe_free_model_hooks()