diffusers-0.17.1-py3-none-any.whl → diffusers-0.18.2-py3-none-any.whl

Files changed (120)
  1. diffusers/__init__.py +26 -1
  2. diffusers/configuration_utils.py +34 -29
  3. diffusers/dependency_versions_table.py +4 -0
  4. diffusers/image_processor.py +125 -12
  5. diffusers/loaders.py +169 -203
  6. diffusers/models/attention.py +24 -1
  7. diffusers/models/attention_flax.py +10 -5
  8. diffusers/models/attention_processor.py +3 -0
  9. diffusers/models/autoencoder_kl.py +114 -33
  10. diffusers/models/controlnet.py +131 -14
  11. diffusers/models/controlnet_flax.py +37 -26
  12. diffusers/models/cross_attention.py +17 -17
  13. diffusers/models/embeddings.py +67 -0
  14. diffusers/models/modeling_flax_utils.py +64 -56
  15. diffusers/models/modeling_utils.py +193 -104
  16. diffusers/models/prior_transformer.py +207 -37
  17. diffusers/models/resnet.py +26 -26
  18. diffusers/models/transformer_2d.py +36 -41
  19. diffusers/models/transformer_temporal.py +24 -21
  20. diffusers/models/unet_1d.py +31 -25
  21. diffusers/models/unet_2d.py +43 -30
  22. diffusers/models/unet_2d_blocks.py +210 -89
  23. diffusers/models/unet_2d_blocks_flax.py +12 -12
  24. diffusers/models/unet_2d_condition.py +172 -64
  25. diffusers/models/unet_2d_condition_flax.py +38 -24
  26. diffusers/models/unet_3d_blocks.py +34 -31
  27. diffusers/models/unet_3d_condition.py +101 -34
  28. diffusers/models/vae.py +5 -5
  29. diffusers/models/vae_flax.py +37 -34
  30. diffusers/models/vq_model.py +23 -14
  31. diffusers/pipelines/__init__.py +24 -1
  32. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +1 -1
  33. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -3
  34. diffusers/pipelines/consistency_models/__init__.py +1 -0
  35. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +337 -0
  36. diffusers/pipelines/controlnet/multicontrolnet.py +120 -1
  37. diffusers/pipelines/controlnet/pipeline_controlnet.py +59 -17
  38. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +60 -15
  39. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +60 -17
  40. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
  41. diffusers/pipelines/kandinsky/__init__.py +1 -1
  42. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +4 -6
  43. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +1 -0
  44. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -0
  45. diffusers/pipelines/kandinsky2_2/__init__.py +7 -0
  46. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +317 -0
  47. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +372 -0
  48. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +434 -0
  49. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +398 -0
  50. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +531 -0
  51. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +541 -0
  52. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +605 -0
  53. diffusers/pipelines/pipeline_flax_utils.py +2 -2
  54. diffusers/pipelines/pipeline_utils.py +124 -146
  55. diffusers/pipelines/shap_e/__init__.py +27 -0
  56. diffusers/pipelines/shap_e/camera.py +147 -0
  57. diffusers/pipelines/shap_e/pipeline_shap_e.py +390 -0
  58. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +349 -0
  59. diffusers/pipelines/shap_e/renderer.py +709 -0
  60. diffusers/pipelines/stable_diffusion/__init__.py +2 -0
  61. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +261 -66
  62. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -3
  63. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -3
  64. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -2
  65. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
  66. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +1 -1
  67. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  68. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +719 -0
  69. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +1 -1
  70. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +832 -0
  71. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +17 -7
  72. diffusers/pipelines/stable_diffusion_xl/__init__.py +26 -0
  73. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +823 -0
  74. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +896 -0
  75. diffusers/pipelines/stable_diffusion_xl/watermark.py +31 -0
  76. diffusers/pipelines/text_to_video_synthesis/__init__.py +2 -1
  77. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +5 -1
  78. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +771 -0
  79. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +92 -6
  80. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
  81. diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +209 -91
  82. diffusers/schedulers/__init__.py +3 -0
  83. diffusers/schedulers/scheduling_consistency_models.py +380 -0
  84. diffusers/schedulers/scheduling_ddim.py +28 -6
  85. diffusers/schedulers/scheduling_ddim_inverse.py +19 -4
  86. diffusers/schedulers/scheduling_ddim_parallel.py +642 -0
  87. diffusers/schedulers/scheduling_ddpm.py +53 -7
  88. diffusers/schedulers/scheduling_ddpm_parallel.py +604 -0
  89. diffusers/schedulers/scheduling_deis_multistep.py +66 -11
  90. diffusers/schedulers/scheduling_dpmsolver_multistep.py +55 -13
  91. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +19 -4
  92. diffusers/schedulers/scheduling_dpmsolver_sde.py +73 -11
  93. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +23 -7
  94. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +58 -9
  95. diffusers/schedulers/scheduling_euler_discrete.py +58 -8
  96. diffusers/schedulers/scheduling_heun_discrete.py +89 -14
  97. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +73 -11
  98. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +73 -11
  99. diffusers/schedulers/scheduling_lms_discrete.py +57 -8
  100. diffusers/schedulers/scheduling_pndm.py +46 -10
  101. diffusers/schedulers/scheduling_repaint.py +19 -4
  102. diffusers/schedulers/scheduling_sde_ve.py +5 -1
  103. diffusers/schedulers/scheduling_unclip.py +43 -4
  104. diffusers/schedulers/scheduling_unipc_multistep.py +48 -7
  105. diffusers/training_utils.py +1 -1
  106. diffusers/utils/__init__.py +2 -1
  107. diffusers/utils/dummy_pt_objects.py +60 -0
  108. diffusers/utils/dummy_torch_and_transformers_and_invisible_watermark_objects.py +32 -0
  109. diffusers/utils/dummy_torch_and_transformers_objects.py +180 -0
  110. diffusers/utils/hub_utils.py +1 -1
  111. diffusers/utils/import_utils.py +20 -3
  112. diffusers/utils/logging.py +15 -18
  113. diffusers/utils/outputs.py +3 -3
  114. diffusers/utils/testing_utils.py +15 -0
  115. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/METADATA +4 -2
  116. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/RECORD +120 -94
  117. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/WHEEL +1 -1
  118. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/LICENSE +0 -0
  119. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/entry_points.txt +0 -0
  120. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/top_level.txt +0 -0
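Most of the new surface area in this release comes from brand-new pipelines: Stable Diffusion XL (items 72–75), Kandinsky 2.2 (items 45–52), Shap-E (items 55–59), and consistency models (items 34–35, 83), plus the parallel DDIM/DDPM schedulers (items 86, 88). As a quick orientation, a minimal sketch of loading one of the new pipelines; the checkpoint ID is illustrative, and SDXL additionally needs the `invisible-watermark` package that item 108's dummy objects guard against:

```python
# Minimal sketch of one of the pipelines added in 0.18 (Stable Diffusion XL).
# Assumes a CUDA device and the `invisible-watermark` package are available;
# the checkpoint ID is illustrative, any SDXL-compatible checkpoint works.
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-0.9",
    torch_dtype=torch.float16,
).to("cuda")

image = pipe(prompt="An astronaut riding a green horse").images[0]
image.save("astronaut.png")
```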
diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py

@@ -38,12 +38,12 @@ def rearrange_4(tensor):
 
 class CrossFrameAttnProcessor:
     """
-    Cross frame attention processor. For each frame the self-attention is replaced with attention with first frame
+    Cross frame attention processor. Each frame attends the first frame.
 
     Args:
         batch_size: The number that represents actual batch size, other than the frames.
-            For example, using calling unet with a single prompt and num_images_per_prompt=1, batch_size should be
-            equal to 2, due to classifier-free guidance.
+            For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to
+            2, due to classifier-free guidance.
     """
 
     def __init__(self, batch_size=2):
@@ -63,7 +63,7 @@ class CrossFrameAttnProcessor:
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)
 
-        # Sparse Attention
+        # Cross Frame Attention
         if not is_cross_attention:
            video_length = key.size()[0] // self.batch_size
            first_frame_index = [0] * video_length
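Both processors lean on the `rearrange_3`/`rearrange_4` helpers visible in the hunk header above. A sketch of what those reshapes do, with shapes inferred from their call sites in this file:

```python
import torch

def rearrange_3(tensor, f):
    # (batch * frames, seq_len, channels) -> (batch, frames, seq_len, channels)
    bf, d, c = tensor.size()
    return torch.reshape(tensor, (bf // f, f, d, c))

def rearrange_4(tensor):
    # (batch, frames, seq_len, channels) -> (batch * frames, seq_len, channels)
    b, f, d, c = tensor.size()
    return torch.reshape(tensor, (b * f, d, c))
```

With the frame axis split out, `key[:, first_frame_index]` broadcasts frame 0's keys (and likewise values) over every frame, so the self-attention above effectively becomes attention against the first frame, which is the cross-frame trick the renamed comment now describes.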
@@ -95,6 +95,81 @@ class CrossFrameAttnProcessor:
         return hidden_states
 
 
+class CrossFrameAttnProcessor2_0:
+    """
+    Cross frame attention processor with scaled_dot_product attention of Pytorch 2.0.
+
+    Args:
+        batch_size: The number that represents actual batch size, other than the frames.
+            For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to
+            2, due to classifier-free guidance.
+    """
+
+    def __init__(self, batch_size=2):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        self.batch_size = batch_size
+
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        inner_dim = hidden_states.shape[-1]
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+
+        is_cross_attention = encoder_hidden_states is not None
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        # Cross Frame Attention
+        if not is_cross_attention:
+            video_length = key.size()[0] // self.batch_size
+            first_frame_index = [0] * video_length
+
+            # rearrange keys to have batch and frames in the 1st and 2nd dims respectively
+            key = rearrange_3(key, video_length)
+            key = key[:, first_frame_index]
+            # rearrange values to have batch and frames in the 1st and 2nd dims respectively
+            value = rearrange_3(value, video_length)
+            value = value[:, first_frame_index]
+
+            # rearrange back to original shape
+            key = rearrange_4(key)
+            value = rearrange_4(value)
+
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+
+
 @dataclass
 class TextToVideoPipelineOutput(BaseOutput):
     images: Union[List[PIL.Image.Image], np.ndarray]
@@ -227,7 +302,12 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline):
         super().__init__(
             vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
         )
-        self.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
+        processor = (
+            CrossFrameAttnProcessor2_0(batch_size=2)
+            if hasattr(F, "scaled_dot_product_attention")
+            else CrossFrameAttnProcessor(batch_size=2)
+        )
+        self.unet.set_attn_processor(processor)
 
     def forward_loop(self, x_t0, t0, t1, generator):
         """
@@ -338,6 +418,7 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline):
         callback_steps: Optional[int] = 1,
         t0: int = 44,
         t1: int = 47,
+        frame_ids: Optional[List[int]] = None,
     ):
         """
         Function invoked when calling the pipeline for generation.
@@ -399,6 +480,9 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline):
             t1 (`int`, *optional*, defaults to 47):
                 Timestep t0. Should be in the range [t0 + 1, num_inference_steps - 1]. See the
                 [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1.
+            frame_ids (`List[int]`, *optional*):
+                Indexes of the frames that are being generated. This is used when generating longer videos
+                chunk-by-chunk.
 
         Returns:
             [`~pipelines.text_to_video_synthesis.TextToVideoPipelineOutput`]:
@@ -407,7 +491,9 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline):
                 likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
         """
         assert video_length > 0
-        frame_ids = list(range(video_length))
+        if frame_ids is None:
+            frame_ids = list(range(video_length))
+        assert len(frame_ids) == video_length
 
         assert num_videos_per_prompt == 1
 
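Taken together, the new `frame_ids` argument and the relaxed assertion enable the chunk-by-chunk long-video generation the docstring mentions: each call renders a slice of the video, with frame 0 prepended so cross-frame attention keeps anchoring every chunk on the same first frame. A sketch of that pattern; the prompt, seed, chunk size, and checkpoint are illustrative:

```python
import torch
from diffusers import TextToVideoZeroPipeline

pipe = TextToVideoZeroPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

prompt = "a panda playing guitar in Times Square"  # illustrative
video_length, chunk_size, seed = 24, 8, 0
generator = torch.Generator(device="cuda")

result = []
chunk_starts = list(range(0, video_length, chunk_size - 1))
for i, start in enumerate(chunk_starts):
    end = video_length if i == len(chunk_starts) - 1 else chunk_starts[i + 1]
    # Prepend frame 0 so cross-frame attention anchors this chunk on the same first frame
    frame_ids = [0] + list(range(start, end))
    generator.manual_seed(seed)  # fixed seed keeps chunks temporally consistent
    output = pipe(prompt=prompt, video_length=len(frame_ids), generator=generator, frame_ids=frame_ids)
    result.append(output.images[1:])  # drop the duplicated anchor frame
```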
diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py

@@ -68,11 +68,11 @@ class ImageTextPipelineOutput(BaseOutput):
 
     Args:
         images (`List[PIL.Image.Image]` or `np.ndarray`)
-            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
-            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
         text (`List[str]` or `List[List[str]]`)
             List of generated text strings of length `batch_size` or a list of list of strings whose outer list has
-            length `batch_size`. Text generated by the diffusion pipeline.
+            length `batch_size`.
     """
 
     images: Optional[Union[List[PIL.Image.Image], np.ndarray]]
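The tightened docstring now just names the two fields; both are read directly off the call result. A sketch, assuming an already-loaded `UniDiffuserPipeline` bound to `pipe` (prompt and settings are illustrative):

```python
# ImageTextPipelineOutput carries both modalities; either field may be None
# depending on the generation mode the pipeline ran in.
sample = pipe(prompt="an impressionist painting of a harbor", num_inference_steps=20)
if sample.images is not None:
    sample.images[0].save("unidiffuser_sample.png")
if sample.text is not None:
    print(sample.text[0])
```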