diffusers 0.17.1__py3-none-any.whl → 0.18.2__py3-none-any.whl
- diffusers/__init__.py +26 -1
- diffusers/configuration_utils.py +34 -29
- diffusers/dependency_versions_table.py +4 -0
- diffusers/image_processor.py +125 -12
- diffusers/loaders.py +169 -203
- diffusers/models/attention.py +24 -1
- diffusers/models/attention_flax.py +10 -5
- diffusers/models/attention_processor.py +3 -0
- diffusers/models/autoencoder_kl.py +114 -33
- diffusers/models/controlnet.py +131 -14
- diffusers/models/controlnet_flax.py +37 -26
- diffusers/models/cross_attention.py +17 -17
- diffusers/models/embeddings.py +67 -0
- diffusers/models/modeling_flax_utils.py +64 -56
- diffusers/models/modeling_utils.py +193 -104
- diffusers/models/prior_transformer.py +207 -37
- diffusers/models/resnet.py +26 -26
- diffusers/models/transformer_2d.py +36 -41
- diffusers/models/transformer_temporal.py +24 -21
- diffusers/models/unet_1d.py +31 -25
- diffusers/models/unet_2d.py +43 -30
- diffusers/models/unet_2d_blocks.py +210 -89
- diffusers/models/unet_2d_blocks_flax.py +12 -12
- diffusers/models/unet_2d_condition.py +172 -64
- diffusers/models/unet_2d_condition_flax.py +38 -24
- diffusers/models/unet_3d_blocks.py +34 -31
- diffusers/models/unet_3d_condition.py +101 -34
- diffusers/models/vae.py +5 -5
- diffusers/models/vae_flax.py +37 -34
- diffusers/models/vq_model.py +23 -14
- diffusers/pipelines/__init__.py +24 -1
- diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +1 -1
- diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -3
- diffusers/pipelines/consistency_models/__init__.py +1 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +337 -0
- diffusers/pipelines/controlnet/multicontrolnet.py +120 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +59 -17
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +60 -15
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +60 -17
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
- diffusers/pipelines/kandinsky/__init__.py +1 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +4 -6
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +1 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -0
- diffusers/pipelines/kandinsky2_2/__init__.py +7 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +317 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +372 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +434 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +398 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +531 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +541 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +605 -0
- diffusers/pipelines/pipeline_flax_utils.py +2 -2
- diffusers/pipelines/pipeline_utils.py +124 -146
- diffusers/pipelines/shap_e/__init__.py +27 -0
- diffusers/pipelines/shap_e/camera.py +147 -0
- diffusers/pipelines/shap_e/pipeline_shap_e.py +390 -0
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +349 -0
- diffusers/pipelines/shap_e/renderer.py +709 -0
- diffusers/pipelines/stable_diffusion/__init__.py +2 -0
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +261 -66
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +719 -0
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +832 -0
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +17 -7
- diffusers/pipelines/stable_diffusion_xl/__init__.py +26 -0
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +823 -0
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +896 -0
- diffusers/pipelines/stable_diffusion_xl/watermark.py +31 -0
- diffusers/pipelines/text_to_video_synthesis/__init__.py +2 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +5 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +771 -0
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +92 -6
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
- diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +209 -91
- diffusers/schedulers/__init__.py +3 -0
- diffusers/schedulers/scheduling_consistency_models.py +380 -0
- diffusers/schedulers/scheduling_ddim.py +28 -6
- diffusers/schedulers/scheduling_ddim_inverse.py +19 -4
- diffusers/schedulers/scheduling_ddim_parallel.py +642 -0
- diffusers/schedulers/scheduling_ddpm.py +53 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +604 -0
- diffusers/schedulers/scheduling_deis_multistep.py +66 -11
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +55 -13
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +19 -4
- diffusers/schedulers/scheduling_dpmsolver_sde.py +73 -11
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +23 -7
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +58 -9
- diffusers/schedulers/scheduling_euler_discrete.py +58 -8
- diffusers/schedulers/scheduling_heun_discrete.py +89 -14
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +73 -11
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +73 -11
- diffusers/schedulers/scheduling_lms_discrete.py +57 -8
- diffusers/schedulers/scheduling_pndm.py +46 -10
- diffusers/schedulers/scheduling_repaint.py +19 -4
- diffusers/schedulers/scheduling_sde_ve.py +5 -1
- diffusers/schedulers/scheduling_unclip.py +43 -4
- diffusers/schedulers/scheduling_unipc_multistep.py +48 -7
- diffusers/training_utils.py +1 -1
- diffusers/utils/__init__.py +2 -1
- diffusers/utils/dummy_pt_objects.py +60 -0
- diffusers/utils/dummy_torch_and_transformers_and_invisible_watermark_objects.py +32 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +180 -0
- diffusers/utils/hub_utils.py +1 -1
- diffusers/utils/import_utils.py +20 -3
- diffusers/utils/logging.py +15 -18
- diffusers/utils/outputs.py +3 -3
- diffusers/utils/testing_utils.py +15 -0
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/METADATA +4 -2
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/RECORD +120 -94
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/WHEEL +1 -1
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/LICENSE +0 -0
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/entry_points.txt +0 -0
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/top_level.txt +0 -0
--- a/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
+++ b/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
@@ -38,12 +38,12 @@ def rearrange_4(tensor):
 
 class CrossFrameAttnProcessor:
     """
-    Cross frame attention processor.
+    Cross frame attention processor. Each frame attends the first frame.
 
     Args:
         batch_size: The number that represents actual batch size, other than the frames.
-            For example,
-
+            For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to
+            2, due to classifier-free guidance.
     """
 
     def __init__(self, batch_size=2):
@@ -63,7 +63,7 @@ class CrossFrameAttnProcessor:
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)
 
-        #
+        # Cross Frame Attention
         if not is_cross_attention:
             video_length = key.size()[0] // self.batch_size
             first_frame_index = [0] * video_length
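
The substitution above relies on two small reshape helpers from the same file (`rearrange_4` appears in the first hunk header; `rearrange_3` is its counterpart). A minimal sketch of the mechanics, assuming the `(batch * frames, seq_len, dim)` layout this processor sees and treating the helper bodies as reconstructions rather than verbatim source:

import torch

def rearrange_3(tensor, f):
    # (batch * frames, seq_len, dim) -> (batch, frames, seq_len, dim)
    F, D, C = tensor.size()
    return torch.reshape(tensor, (F // f, f, D, C))

def rearrange_4(tensor):
    # (batch, frames, seq_len, dim) -> (batch * frames, seq_len, dim)
    B, F, D, C = tensor.size()
    return torch.reshape(tensor, (B * F, D, C))

# With classifier-free guidance the effective batch is 2 (uncond + cond),
# so a stack of 8 frames arrives as 16 rows.
batch_size, video_length, seq_len, dim = 2, 8, 77, 320
key = torch.randn(batch_size * video_length, seq_len, dim)

# Every frame's key is replaced by the first frame's key of its batch entry.
first_frame_index = [0] * video_length
key = rearrange_3(key, video_length)   # (2, 8, 77, 320)
key = key[:, first_frame_index]        # all 8 frame slots now hold frame 0
key = rearrange_4(key)                 # back to (16, 77, 320)
assert torch.equal(key[1], key[0])     # frames in a batch entry share keys

The `batch_size=2` default in both processors comes from classifier-free guidance: the unet runs on the concatenation of the unconditional and conditional inputs, so each prompt contributes two rows per frame.
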
@@ -95,6 +95,81 @@ class CrossFrameAttnProcessor:
         return hidden_states
 
 
+class CrossFrameAttnProcessor2_0:
+    """
+    Cross frame attention processor with scaled_dot_product attention of Pytorch 2.0.
+
+    Args:
+        batch_size: The number that represents actual batch size, other than the frames.
+            For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to
+            2, due to classifier-free guidance.
+    """
+
+    def __init__(self, batch_size=2):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        self.batch_size = batch_size
+
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        inner_dim = hidden_states.shape[-1]
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+
+        is_cross_attention = encoder_hidden_states is not None
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        # Cross Frame Attention
+        if not is_cross_attention:
+            video_length = key.size()[0] // self.batch_size
+            first_frame_index = [0] * video_length
+
+            # rearrange keys to have batch and frames in the 1st and 2nd dims respectively
+            key = rearrange_3(key, video_length)
+            key = key[:, first_frame_index]
+            # rearrange values to have batch and frames in the 1st and 2nd dims respectively
+            value = rearrange_3(value, video_length)
+            value = value[:, first_frame_index]
+
+            # rearrange back to original shape
+            key = rearrange_4(key)
+            value = rearrange_4(value)
+
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+
+
 @dataclass
 class TextToVideoPipelineOutput(BaseOutput):
     images: Union[List[PIL.Image.Image], np.ndarray]
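
Because the processor reads only a few attributes from the `attn` module it is handed (`heads`, `norm_cross`, the `to_q`/`to_k`/`to_v` projections, and `to_out`), it can be smoke-tested outside a full UNet. A sketch under that assumption; `DummyAttn` is a hypothetical stand-in, not a diffusers class:

import torch
import torch.nn as nn
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import (
    CrossFrameAttnProcessor2_0,
)

class DummyAttn(nn.Module):
    """Hypothetical stand-in exposing only what the processor reads."""

    def __init__(self, dim=64, heads=8):
        super().__init__()
        self.heads = heads
        self.norm_cross = None  # skip the encoder-norm branch
        self.to_q = nn.Linear(dim, dim, bias=False)
        self.to_k = nn.Linear(dim, dim, bias=False)
        self.to_v = nn.Linear(dim, dim, bias=False)
        self.to_out = nn.ModuleList([nn.Linear(dim, dim), nn.Dropout(0.0)])

# 2 (CFG) x 8 frames of 64-dim tokens; encoder_hidden_states=None takes
# the self-attention path, which triggers the cross-frame substitution.
proc = CrossFrameAttnProcessor2_0(batch_size=2)
hidden = torch.randn(2 * 8, 77, 64)
out = proc(DummyAttn(), hidden)
assert out.shape == hidden.shape
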
@@ -227,7 +302,12 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline):
         super().__init__(
             vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
         )
-        self.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
+        processor = (
+            CrossFrameAttnProcessor2_0(batch_size=2)
+            if hasattr(F, "scaled_dot_product_attention")
+            else CrossFrameAttnProcessor(batch_size=2)
+        )
+        self.unet.set_attn_processor(processor)
 
     def forward_loop(self, x_t0, t0, t1, generator):
         """
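
From the caller's side this selection is automatic: constructing the pipeline on PyTorch 2.x installs the SDPA processor, while older PyTorch keeps the original implementation. A minimal usage sketch (the checkpoint ID is illustrative):

import torch
from diffusers import TextToVideoZeroPipeline

pipe = TextToVideoZeroPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# The constructor has already installed CrossFrameAttnProcessor2_0 (or the
# fallback) on every attention block of pipe.unet; no extra setup is needed.
frames = pipe(prompt="A panda is playing guitar on times square").images
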
@@ -338,6 +418,7 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline):
         callback_steps: Optional[int] = 1,
         t0: int = 44,
         t1: int = 47,
+        frame_ids: Optional[List[int]] = None,
     ):
         """
         Function invoked when calling the pipeline for generation.
@@ -399,6 +480,9 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline):
             t1 (`int`, *optional*, defaults to 47):
                 Timestep t0. Should be in the range [t0 + 1, num_inference_steps - 1]. See the
                 [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1.
+            frame_ids (`List[int]`, *optional*):
+                Indexes of the frames that are being generated. This is used when generating longer videos
+                chunk-by-chunk.
 
         Returns:
             [`~pipelines.text_to_video_synthesis.TextToVideoPipelineOutput`]:
@@ -407,7 +491,9 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline):
             likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
         """
         assert video_length > 0
-        frame_ids = list(range(video_length))
+        if frame_ids is None:
+            frame_ids = list(range(video_length))
+        assert len(frame_ids) == video_length
 
         assert num_videos_per_prompt == 1
 
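
The `frame_ids` default preserves the old behavior (one pass over `range(video_length)`), while passing explicit ids enables the chunk-by-chunk generation mentioned in the docstring: each chunk is prefixed with frame 0 so cross-frame attention keeps anchoring on the same first frame, and re-seeding the generator keeps chunks consistent. A sketch of that loop, following the pattern the diffusers docs use for this pipeline:

import numpy as np
import torch
from diffusers import TextToVideoZeroPipeline

pipe = TextToVideoZeroPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

prompt = "A panda is playing guitar on times square"
seed, video_length, chunk_size = 0, 24, 8
generator = torch.Generator(device="cuda")

result = []
chunk_ids = np.arange(0, video_length, chunk_size - 1)
for i in range(len(chunk_ids)):
    ch_start = chunk_ids[i]
    ch_end = video_length if i == len(chunk_ids) - 1 else chunk_ids[i + 1]
    # Prepend frame 0 so every chunk attends to the same anchor frame.
    frame_ids = [0] + list(range(ch_start, ch_end))
    # Re-seed so all chunks share the same latent trajectory.
    generator.manual_seed(seed)
    output = pipe(prompt=prompt, video_length=len(frame_ids),
                  generator=generator, frame_ids=frame_ids)
    result.append(output.images[1:])  # drop the duplicated anchor frame
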
--- a/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
+++ b/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
@@ -68,11 +68,11 @@ class ImageTextPipelineOutput(BaseOutput):
 
     Args:
         images (`List[PIL.Image.Image]` or `np.ndarray`)
-            List of denoised PIL images of length `batch_size` or
-            num_channels)`.
+            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
+            num_channels)`.
         text (`List[str]` or `List[List[str]]`)
             List of generated text strings of length `batch_size` or a list of list of strings whose outer list has
-            length `batch_size`.
+            length `batch_size`.
     """
 
     images: Optional[Union[List[PIL.Image.Image], np.ndarray]]