diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +20 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +23 -25
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +46 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
  229. diffusers/schedulers/scheduling_edm_euler.py +53 -30
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
  231. diffusers/schedulers/scheduling_euler_discrete.py +163 -67
  232. diffusers/schedulers/scheduling_heun_discrete.py +60 -38
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +27 -25
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. diffusers-0.27.1.dist-info/RECORD +0 -399
  267. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  268. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
  269. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
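Most of the churn below is a mechanical sweep of `torch.FloatTensor` annotations to `torch.Tensor`, but the list above also ships entirely new modules: `diffusers/callbacks.py`, `diffusers/video_processor.py`, and the ControlNet-XS, Marigold, PixArt-Sigma, and AnimateDiff SDXL pipelines. A hedged smoke-test sketch for one of the new entry points, Marigold depth estimation; the checkpoint ID and input URL are illustrative assumptions, not pinned by this diff:

```python
import torch
from diffusers import MarigoldDepthPipeline
from diffusers.utils import load_image

# Checkpoint and URL are assumptions; any Marigold depth checkpoint should work.
pipe = MarigoldDepthPipeline.from_pretrained(
    "prs-eth/marigold-depth-lcm-v1-0", torch_dtype=torch.float16
).to("cuda")

image = load_image("https://example.com/room.jpg")  # placeholder input
depth = pipe(image)

# depth.prediction holds the raw depth map; the new Marigold image processor
# can render it for inspection:
vis = pipe.image_processor.visualize_depth(depth.prediction)
```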
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py

@@ -119,7 +119,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
         movq ([`VQModel`]):
             MoVQ Decoder to generate the image from the latents.
         prior_prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         prior_image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -135,6 +135,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
 
     model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
     _load_connected_pipes = True
+    _exclude_from_cpu_offload = ["prior_prior"]
 
     def __init__(
         self,
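`_exclude_from_cpu_offload` is an existing `DiffusionPipeline` class hook: submodules named there are skipped when offload hooks are installed. A minimal sketch of the pattern the hunk above applies, assuming a custom subclass; the exclusion here presumably avoids double-hooking `prior_prior`, which the nested prior pipeline already manages:

```python
from diffusers import DiffusionPipeline


class MyCombinedPipeline(DiffusionPipeline):
    # Submodules listed here are skipped by enable_model_cpu_offload /
    # enable_sequential_cpu_offload and stay wherever they were placed.
    _exclude_from_cpu_offload = ["prior_prior"]
```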
@@ -178,7 +179,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -186,8 +187,8 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
         Note that offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
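Both offload helpers now accept an explicit `device` alongside an optional `gpu_id` and forward both to the nested prior and decoder pipelines. A hedged usage sketch; the checkpoint ID is an assumption:

```python
import torch
from diffusers import AutoPipelineForText2Image

# Loads KandinskyV22CombinedPipeline under the hood; checkpoint is illustrative.
pipe = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)

# 0.27.x accepted only gpu_id=0; 0.28.0 also threads an explicit device
# through to pipe.prior_pipe and pipe.decoder_pipe.
pipe.enable_sequential_cpu_offload(device="cuda")
image = pipe("a cozy cabin in the woods").images[0]
```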
@@ -212,9 +213,9 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
         prior_guidance_scale: float = 4.0,
         prior_num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
         prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -258,7 +259,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -346,7 +347,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
         movq ([`VQModel`]):
             MoVQ Decoder to generate the image from the latents.
         prior_prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         prior_image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -362,6 +363,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
 
     model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
     _load_connected_pipes = True
+    _exclude_from_cpu_offload = ["prior_prior"]
 
     def __init__(
         self,
@@ -405,17 +407,17 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         """
-        self.prior_pipe.enable_model_cpu_offload()
-        self.decoder_pipe.enable_model_cpu_offload()
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -423,8 +425,8 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
         Note that offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
@@ -440,7 +442,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_inference_steps: int = 100,
         guidance_scale: float = 4.0,
@@ -451,9 +453,9 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
         prior_guidance_scale: float = 4.0,
         prior_num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
         prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -467,7 +469,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                 again.
@@ -507,7 +509,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -516,7 +518,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
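The annotation change dominating these hunks replaces `torch.FloatTensor` (which only describes fp32 CPU tensors) with the broader `torch.Tensor`. A minimal sketch of a step callback written against the new signature; the pipeline call is assumed to be one of the Kandinsky pipelines above:

```python
import torch


def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
    # torch.Tensor also covers fp16/bf16 and CUDA latents, which the old
    # torch.FloatTensor annotation never accurately described.
    print(f"step={step} t={timestep} dtype={latents.dtype} shape={tuple(latents.shape)}")


# image = pipe("a cat", callback=log_latents, callback_steps=5).images[0]
```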
@@ -594,7 +596,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
         movq ([`VQModel`]):
             MoVQ Decoder to generate the image from the latents.
         prior_prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         prior_image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -610,6 +612,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
 
     model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
     _load_connected_pipes = True
+    _exclude_from_cpu_offload = ["prior_prior"]
 
     def __init__(
         self,
@@ -653,7 +656,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -661,8 +664,8 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
         Note that offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
@@ -678,8 +681,8 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
-        mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_inference_steps: int = 100,
         guidance_scale: float = 4.0,
@@ -689,7 +692,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
         prior_guidance_scale: float = 4.0,
         prior_num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -704,7 +707,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                 again.
@@ -743,7 +746,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py

@@ -151,18 +151,18 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(
         self,
-        image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        hint: torch.FloatTensor,
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        hint: torch.Tensor,
         height: int = 512,
         width: int = 512,
         num_inference_steps: int = 100,
         guidance_scale: float = 4.0,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -172,11 +172,11 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            hint (`torch.FloatTensor`):
+            hint (`torch.Tensor`):
                 The controlnet condition.
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
@@ -199,7 +199,7 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -208,7 +208,7 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
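A hedged sketch of feeding this pipeline plain `torch.Tensor` inputs under the new annotations; the shapes (1280-dim prior embeddings, a 3-channel hint map in [0, 1]) are assumptions for illustration, and a real `hint` would come from e.g. a depth estimator:

```python
import torch

# Illustrative shapes only. image_embeds/negative_image_embeds normally come
# from KandinskyV22PriorPipeline; hint is a conditioning map scaled to [0, 1].
image_embeds = torch.randn(1, 1280, dtype=torch.float16)
negative_image_embeds = torch.randn(1, 1280, dtype=torch.float16)
hint = torch.rand(1, 3, 768, 768, dtype=torch.float16)

# images = pipe(image_embeds=image_embeds,
#               negative_image_embeds=negative_image_embeds,
#               hint=hint, height=768, width=768).images
```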
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py

@@ -206,10 +206,10 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(
         self,
-        image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
-        negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        hint: torch.FloatTensor,
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        hint: torch.Tensor,
         height: int = 512,
         width: int = 512,
         num_inference_steps: int = 100,
@@ -218,7 +218,7 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         output_type: Optional[str] = "pil",
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         return_dict: bool = True,
     ):
@@ -226,9 +226,9 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
         Function invoked when calling the pipeline for generation.
 
         Args:
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                 again.
@@ -238,9 +238,9 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
                 denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
                 be maximum and the denoising process will run for the full number of iterations specified in
                 `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
-            hint (`torch.FloatTensor`):
+            hint (`torch.Tensor`):
                 The controlnet condition.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             height (`int`, *optional*, defaults to 512):
                 The height in pixels of the generated image.
@@ -265,7 +265,7 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
                 (`np.array`) or `"pt"` (`torch.Tensor`).
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py

@@ -190,9 +190,9 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(
         self,
-        image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]],
-        negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
         height: int = 512,
         width: int = 512,
         num_inference_steps: int = 100,
@@ -210,9 +210,9 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
         Function invoked when calling the pipeline for generation.
 
         Args:
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                 again.
@@ -222,7 +222,7 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
                 denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
                 be maximum and the denoising process will run for the full number of iterations specified in
                 `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             height (`int`, *optional*, defaults to 512):
                 The height in pixels of the generated image.
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py

@@ -294,17 +294,17 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(
         self,
-        image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
-        mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
-        negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
+        image_embeds: Union[torch.Tensor, List[torch.Tensor]],
+        image: Union[torch.Tensor, PIL.Image.Image],
+        mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray],
+        negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]],
         height: int = 512,
         width: int = 512,
         num_inference_steps: int = 100,
         guidance_scale: float = 4.0,
         num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -315,7 +315,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
         Function invoked when calling the pipeline for generation.
 
         Args:
-            image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for text prompt, that will be used to condition the image generation.
             image (`PIL.Image.Image`):
                 `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
@@ -325,7 +325,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
                 black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single
                 channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3,
                 so the expected shape would be `(B, H, W, 1)`.
-            negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`):
+            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                 The clip image embeddings for negative text prompt, will be used to condition the image generation.
             height (`int`, *optional*, defaults to 512):
                 The height in pixels of the generated image.
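Per the docstring above, a tensor mask carries a single trailing luminance channel, shape `(B, H, W, 1)`, with white pixels repainted and black preserved. A minimal sketch assuming a 512x512 canvas:

```python
import torch

# Keep everything (0.0 = black = preserve), then mark one rectangle to repaint.
mask_image = torch.zeros(1, 512, 512, 1)
mask_image[:, 128:256, 128:384, :] = 1.0  # 1.0 = white = repaint this region
```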
@@ -345,7 +345,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py

@@ -90,7 +90,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
 
     Args:
         prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         text_encoder ([`CLIPTextModelWithProjection`]):
@@ -132,12 +132,12 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
     @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
     def interpolate(
         self,
-        images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]],
+        images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]],
         weights: List[float],
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         negative_prior_prompt: Optional[str] = None,
         negative_prompt: str = "",
         guidance_scale: float = 4.0,
@@ -147,7 +147,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
         Function invoked when using the prior pipeline for interpolation.
 
         Args:
-            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`):
+            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`):
                 list of prompts and images to guide the image generation.
             weights: (`List[float]`):
                 list of weights for each condition in `images_and_prompts`
@@ -159,7 +159,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
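A hedged sketch of the `interpolate()` entry point whose annotations change above; the checkpoint ID is an assumption:

```python
import torch
from diffusers import KandinskyV22PriorPipeline

# Checkpoint is illustrative.
prior = KandinskyV22PriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
).to("cuda")

# Strings, PIL images, and (now) plain torch.Tensor embeddings may be mixed
# in the same conditions list.
out = prior.interpolate(["a red fox", "a watercolor painting"], weights=[0.7, 0.3])
# out.image_embeds / out.negative_image_embeds feed a Kandinsky 2.2 decoder pipeline.
```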
@@ -376,7 +376,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         guidance_scale: float = 4.0,
         output_type: Optional[str] = "pt",  # pt only
         return_dict: bool = True,
@@ -400,7 +400,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py

@@ -108,7 +108,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
 
     Args:
         prior ([`PriorTransformer`]):
-            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
         image_encoder ([`CLIPVisionModelWithProjection`]):
             Frozen image-encoder.
         text_encoder ([`CLIPTextModelWithProjection`]):
@@ -156,12 +156,12 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
     @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
     def interpolate(
         self,
-        images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]],
+        images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]],
         weights: List[float],
         num_images_per_prompt: int = 1,
         num_inference_steps: int = 25,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         negative_prior_prompt: Optional[str] = None,
         negative_prompt: str = "",
         guidance_scale: float = 4.0,
@@ -171,7 +171,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
         Function invoked when using the prior pipeline for interpolation.
 
         Args:
-            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`):
+            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`):
                 list of prompts and images to guide the image generation.
             weights: (`List[float]`):
                 list of weights for each condition in `images_and_prompts`
@@ -183,7 +183,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -418,7 +418,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
                 Conceptually, indicates how much to transform the reference `emb`. Must be between 0 and 1. `image`
                 will be used as a starting point, adding more noise to it the larger the `strength`. The number of
                 denoising steps depends on the amount of noise initially added.
-            emb (`torch.FloatTensor`):
+            emb (`torch.Tensor`):
                 The image embedding.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py

@@ -35,10 +35,10 @@ DYNAMIC_MAP = {
 
 def convert_state_dict(unet_state_dict):
     """
-    Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
     Args:
-        unet_model (torch.nn.Module): The original U-Net model.
-        unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet model to match keys with.
+    Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model.
+    unet_model (torch.nn.Module): The original U-Net model. unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet
+    model to match keys with.
 
     Returns:
         OrderedDict: The converted state dictionary.
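A hedged usage sketch for the converter whose docstring is reflowed above; the checkpoint path is an assumption:

```python
import torch

from diffusers.pipelines.kandinsky3.convert_kandinsky3_unet import convert_state_dict

# Path is illustrative; load the original Kandinsky 3 U-Net weights, then
# rename keys into the layout diffusers' Kandinsky3UNet expects.
original_state_dict = torch.load("kandinsky3_unet.ckpt", map_location="cpu")
converted = convert_state_dict(original_state_dict)
# kandinsky3_unet.load_state_dict(converted)
```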