diffusers 0.17.1__py3-none-any.whl → 0.18.2__py3-none-any.whl
- diffusers/__init__.py +26 -1
- diffusers/configuration_utils.py +34 -29
- diffusers/dependency_versions_table.py +4 -0
- diffusers/image_processor.py +125 -12
- diffusers/loaders.py +169 -203
- diffusers/models/attention.py +24 -1
- diffusers/models/attention_flax.py +10 -5
- diffusers/models/attention_processor.py +3 -0
- diffusers/models/autoencoder_kl.py +114 -33
- diffusers/models/controlnet.py +131 -14
- diffusers/models/controlnet_flax.py +37 -26
- diffusers/models/cross_attention.py +17 -17
- diffusers/models/embeddings.py +67 -0
- diffusers/models/modeling_flax_utils.py +64 -56
- diffusers/models/modeling_utils.py +193 -104
- diffusers/models/prior_transformer.py +207 -37
- diffusers/models/resnet.py +26 -26
- diffusers/models/transformer_2d.py +36 -41
- diffusers/models/transformer_temporal.py +24 -21
- diffusers/models/unet_1d.py +31 -25
- diffusers/models/unet_2d.py +43 -30
- diffusers/models/unet_2d_blocks.py +210 -89
- diffusers/models/unet_2d_blocks_flax.py +12 -12
- diffusers/models/unet_2d_condition.py +172 -64
- diffusers/models/unet_2d_condition_flax.py +38 -24
- diffusers/models/unet_3d_blocks.py +34 -31
- diffusers/models/unet_3d_condition.py +101 -34
- diffusers/models/vae.py +5 -5
- diffusers/models/vae_flax.py +37 -34
- diffusers/models/vq_model.py +23 -14
- diffusers/pipelines/__init__.py +24 -1
- diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +1 -1
- diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -3
- diffusers/pipelines/consistency_models/__init__.py +1 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +337 -0
- diffusers/pipelines/controlnet/multicontrolnet.py +120 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +59 -17
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +60 -15
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +60 -17
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
- diffusers/pipelines/kandinsky/__init__.py +1 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +4 -6
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +1 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -0
- diffusers/pipelines/kandinsky2_2/__init__.py +7 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +317 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +372 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +434 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +398 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +531 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +541 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +605 -0
- diffusers/pipelines/pipeline_flax_utils.py +2 -2
- diffusers/pipelines/pipeline_utils.py +124 -146
- diffusers/pipelines/shap_e/__init__.py +27 -0
- diffusers/pipelines/shap_e/camera.py +147 -0
- diffusers/pipelines/shap_e/pipeline_shap_e.py +390 -0
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +349 -0
- diffusers/pipelines/shap_e/renderer.py +709 -0
- diffusers/pipelines/stable_diffusion/__init__.py +2 -0
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +261 -66
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +719 -0
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +832 -0
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +17 -7
- diffusers/pipelines/stable_diffusion_xl/__init__.py +26 -0
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +823 -0
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +896 -0
- diffusers/pipelines/stable_diffusion_xl/watermark.py +31 -0
- diffusers/pipelines/text_to_video_synthesis/__init__.py +2 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +5 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +771 -0
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +92 -6
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
- diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +209 -91
- diffusers/schedulers/__init__.py +3 -0
- diffusers/schedulers/scheduling_consistency_models.py +380 -0
- diffusers/schedulers/scheduling_ddim.py +28 -6
- diffusers/schedulers/scheduling_ddim_inverse.py +19 -4
- diffusers/schedulers/scheduling_ddim_parallel.py +642 -0
- diffusers/schedulers/scheduling_ddpm.py +53 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +604 -0
- diffusers/schedulers/scheduling_deis_multistep.py +66 -11
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +55 -13
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +19 -4
- diffusers/schedulers/scheduling_dpmsolver_sde.py +73 -11
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +23 -7
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +58 -9
- diffusers/schedulers/scheduling_euler_discrete.py +58 -8
- diffusers/schedulers/scheduling_heun_discrete.py +89 -14
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +73 -11
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +73 -11
- diffusers/schedulers/scheduling_lms_discrete.py +57 -8
- diffusers/schedulers/scheduling_pndm.py +46 -10
- diffusers/schedulers/scheduling_repaint.py +19 -4
- diffusers/schedulers/scheduling_sde_ve.py +5 -1
- diffusers/schedulers/scheduling_unclip.py +43 -4
- diffusers/schedulers/scheduling_unipc_multistep.py +48 -7
- diffusers/training_utils.py +1 -1
- diffusers/utils/__init__.py +2 -1
- diffusers/utils/dummy_pt_objects.py +60 -0
- diffusers/utils/dummy_torch_and_transformers_and_invisible_watermark_objects.py +32 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +180 -0
- diffusers/utils/hub_utils.py +1 -1
- diffusers/utils/import_utils.py +20 -3
- diffusers/utils/logging.py +15 -18
- diffusers/utils/outputs.py +3 -3
- diffusers/utils/testing_utils.py +15 -0
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/METADATA +4 -2
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/RECORD +120 -94
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/WHEEL +1 -1
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/LICENSE +0 -0
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/entry_points.txt +0 -0
- {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/top_level.txt +0 -0
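A trivial sketch, assuming one of the two wheels is installed: confirm which version is active before reading the per-file diffs below.

```python
import diffusers

print(diffusers.__version__)  # "0.17.1" before the upgrade, "0.18.2" after
```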
diffusers/models/unet_2d_condition_flax.py

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Tuple, Union
+from typing import Optional, Tuple, Union
 
 import flax
 import flax.linen as nn
@@ -35,9 +35,11 @@ from .unet_2d_blocks_flax import (
 @flax.struct.dataclass
 class FlaxUNet2DConditionOutput(BaseOutput):
     """
+    The output of [`FlaxUNet2DConditionModel`].
+
     Args:
         sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
-
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
     """
 
     sample: jnp.ndarray
@@ -46,17 +48,17 @@ class FlaxUNet2DConditionOutput(BaseOutput):
 @flax_register_to_config
 class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
     r"""
-
-
+    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+    shaped output.
 
-    This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for
-
+    This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for it's generic methods
+    implemented for all models (such as downloading or saving).
 
-
-    subclass. Use it as a regular Flax
+    This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+    subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matters related to its
     general usage and behavior.
 
-
+    Inherent JAX features such as the following are supported:
     - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
     - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
     - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
@@ -69,18 +71,18 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
             The number of channels in the input sample.
         out_channels (`int`, *optional*, defaults to 4):
             The number of channels in the output.
-        down_block_types (`Tuple[str]`, *optional*, defaults to `("
-            The tuple of downsample blocks to use.
-
-
-            The tuple of upsample blocks to use. The corresponding class names will be: "FlaxUpBlock2D",
-            "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D"
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D")`):
+            The tuple of downsample blocks to use.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D")`):
+            The tuple of upsample blocks to use.
         block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
         layers_per_block (`int`, *optional*, defaults to 2):
            The number of layers per block.
         attention_head_dim (`int` or `Tuple[int]`, *optional*, defaults to 8):
            The dimension of the attention heads.
+        num_attention_heads (`int` or `Tuple[int]`, *optional*):
+            The number of attention heads.
         cross_attention_dim (`int`, *optional*, defaults to 768):
            The dimension of the cross attention features.
         dropout (`float`, *optional*, defaults to 0):
@@ -89,8 +91,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
            Whether to flip the sin to cos in the time embedding.
         freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
         use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
-
-
+            Enable memory efficient attention as described [here](https://arxiv.org/abs/2112.05682).
     """
 
     sample_size: int = 32
@@ -107,6 +108,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
     block_out_channels: Tuple[int] = (320, 640, 1280, 1280)
     layers_per_block: int = 2
     attention_head_dim: Union[int, Tuple[int]] = 8
+    num_attention_heads: Optional[Union[int, Tuple[int]]] = None
     cross_attention_dim: int = 1280
     dropout: float = 0.0
     use_linear_projection: bool = False
@@ -131,6 +133,19 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         block_out_channels = self.block_out_channels
         time_embed_dim = block_out_channels[0] * 4
 
+        if self.num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = self.num_attention_heads or self.attention_head_dim
+
         # input
         self.conv_in = nn.Conv(
             block_out_channels[0],
@@ -150,9 +165,8 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         if isinstance(only_cross_attention, bool):
             only_cross_attention = (only_cross_attention,) * len(self.down_block_types)
 
-
-
-            attention_head_dim = (attention_head_dim,) * len(self.down_block_types)
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(self.down_block_types)
 
         # down
         down_blocks = []
@@ -168,7 +182,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
                 out_channels=output_channel,
                 dropout=self.dropout,
                 num_layers=self.layers_per_block,
-
+                num_attention_heads=num_attention_heads[i],
                 add_downsample=not is_final_block,
                 use_linear_projection=self.use_linear_projection,
                 only_cross_attention=only_cross_attention[i],
@@ -192,7 +206,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         self.mid_block = FlaxUNetMidBlock2DCrossAttn(
             in_channels=block_out_channels[-1],
             dropout=self.dropout,
-
+            num_attention_heads=num_attention_heads[-1],
             use_linear_projection=self.use_linear_projection,
             use_memory_efficient_attention=self.use_memory_efficient_attention,
             dtype=self.dtype,
@@ -201,7 +215,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         # up
         up_blocks = []
         reversed_block_out_channels = list(reversed(block_out_channels))
-
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
         only_cross_attention = list(reversed(only_cross_attention))
         output_channel = reversed_block_out_channels[0]
         for i, up_block_type in enumerate(self.up_block_types):
@@ -217,7 +231,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
                 out_channels=output_channel,
                 prev_output_channel=prev_output_channel,
                 num_layers=self.layers_per_block + 1,
-
+                num_attention_heads=reversed_num_attention_heads[i],
                 add_upsample=not is_final_block,
                 dropout=self.dropout,
                 use_linear_projection=self.use_linear_projection,
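A minimal sketch in plain Python (the helper name `resolve_num_attention_heads` is hypothetical) of the naming correction the hunks above add to `FlaxUNet2DConditionModel.setup()`: passing `num_attention_heads` is rejected for now, the long-standing `attention_head_dim` config value is reused under the corrected name, and the result is broadcast to one entry per down block.

```python
from typing import Optional, Tuple, Union


def resolve_num_attention_heads(
    attention_head_dim: Union[int, Tuple[int, ...]] = 8,
    num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
    num_down_blocks: int = 4,
) -> Tuple[int, ...]:
    if num_attention_heads is not None:
        # Mirrors the ValueError in the diff: the argument is reserved until diffusers v0.19.
        raise ValueError("Passing `num_attention_heads` is not yet supported; use `attention_head_dim`.")
    heads = num_attention_heads or attention_head_dim
    if isinstance(heads, int):
        heads = (heads,) * num_down_blocks
    return heads


print(resolve_num_attention_heads(attention_head_dim=8))                # (8, 8, 8, 8)
print(resolve_num_attention_heads(attention_head_dim=(5, 10, 20, 20)))  # (5, 10, 20, 20)
```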
diffusers/models/unet_3d_blocks.py

@@ -29,7 +29,7 @@ def get_down_block(
     add_downsample,
     resnet_eps,
     resnet_act_fn,
-
+    num_attention_heads,
     resnet_groups=None,
     cross_attention_dim=None,
     downsample_padding=None,
@@ -66,7 +66,7 @@ def get_down_block(
         resnet_groups=resnet_groups,
         downsample_padding=downsample_padding,
         cross_attention_dim=cross_attention_dim,
-
+        num_attention_heads=num_attention_heads,
         dual_cross_attention=dual_cross_attention,
         use_linear_projection=use_linear_projection,
         only_cross_attention=only_cross_attention,
@@ -86,7 +86,7 @@ def get_up_block(
     add_upsample,
     resnet_eps,
     resnet_act_fn,
-
+    num_attention_heads,
     resnet_groups=None,
     cross_attention_dim=None,
     dual_cross_attention=False,
@@ -122,7 +122,7 @@ def get_up_block(
         resnet_act_fn=resnet_act_fn,
         resnet_groups=resnet_groups,
         cross_attention_dim=cross_attention_dim,
-
+        num_attention_heads=num_attention_heads,
         dual_cross_attention=dual_cross_attention,
         use_linear_projection=use_linear_projection,
         only_cross_attention=only_cross_attention,
@@ -144,7 +144,7 @@ class UNetMidBlock3DCrossAttn(nn.Module):
         resnet_act_fn: str = "swish",
         resnet_groups: int = 32,
         resnet_pre_norm: bool = True,
-
+        num_attention_heads=1,
         output_scale_factor=1.0,
         cross_attention_dim=1280,
         dual_cross_attention=False,
@@ -154,7 +154,7 @@ class UNetMidBlock3DCrossAttn(nn.Module):
         super().__init__()
 
         self.has_cross_attention = True
-        self.
+        self.num_attention_heads = num_attention_heads
         resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
 
         # there is always at least one resnet
@@ -185,8 +185,8 @@ class UNetMidBlock3DCrossAttn(nn.Module):
         for _ in range(num_layers):
             attentions.append(
                 Transformer2DModel(
-                    in_channels //
-
+                    in_channels // num_attention_heads,
+                    num_attention_heads,
                     in_channels=in_channels,
                     num_layers=1,
                     cross_attention_dim=cross_attention_dim,
@@ -197,8 +197,8 @@ class UNetMidBlock3DCrossAttn(nn.Module):
             )
             temp_attentions.append(
                 TransformerTemporalModel(
-                    in_channels //
-
+                    in_channels // num_attention_heads,
+                    num_attention_heads,
                     in_channels=in_channels,
                     num_layers=1,
                     cross_attention_dim=cross_attention_dim,
@@ -250,10 +250,11 @@ class UNetMidBlock3DCrossAttn(nn.Module):
                 hidden_states,
                 encoder_hidden_states=encoder_hidden_states,
                 cross_attention_kwargs=cross_attention_kwargs,
-
+                return_dict=False,
+            )[0]
             hidden_states = temp_attn(
-                hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs
-            )
+                hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs, return_dict=False
+            )[0]
             hidden_states = resnet(hidden_states, temb)
             hidden_states = temp_conv(hidden_states, num_frames=num_frames)
 
@@ -273,7 +274,7 @@ class CrossAttnDownBlock3D(nn.Module):
         resnet_act_fn: str = "swish",
         resnet_groups: int = 32,
         resnet_pre_norm: bool = True,
-
+        num_attention_heads=1,
         cross_attention_dim=1280,
         output_scale_factor=1.0,
         downsample_padding=1,
@@ -290,7 +291,7 @@ class CrossAttnDownBlock3D(nn.Module):
         temp_convs = []
 
         self.has_cross_attention = True
-        self.
+        self.num_attention_heads = num_attention_heads
 
         for i in range(num_layers):
             in_channels = in_channels if i == 0 else out_channels
@@ -317,8 +318,8 @@ class CrossAttnDownBlock3D(nn.Module):
             )
             attentions.append(
                 Transformer2DModel(
-                    out_channels //
-
+                    out_channels // num_attention_heads,
+                    num_attention_heads,
                     in_channels=out_channels,
                     num_layers=1,
                     cross_attention_dim=cross_attention_dim,
@@ -330,8 +331,8 @@ class CrossAttnDownBlock3D(nn.Module):
             )
             temp_attentions.append(
                 TransformerTemporalModel(
-                    out_channels //
-
+                    out_channels // num_attention_heads,
+                    num_attention_heads,
                     in_channels=out_channels,
                     num_layers=1,
                     cross_attention_dim=cross_attention_dim,
@@ -377,10 +378,11 @@ class CrossAttnDownBlock3D(nn.Module):
                 hidden_states,
                 encoder_hidden_states=encoder_hidden_states,
                 cross_attention_kwargs=cross_attention_kwargs,
-
+                return_dict=False,
+            )[0]
             hidden_states = temp_attn(
-                hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs
-            )
+                hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs, return_dict=False
+            )[0]
 
             output_states += (hidden_states,)
 
@@ -486,7 +488,7 @@ class CrossAttnUpBlock3D(nn.Module):
         resnet_act_fn: str = "swish",
         resnet_groups: int = 32,
         resnet_pre_norm: bool = True,
-
+        num_attention_heads=1,
         cross_attention_dim=1280,
         output_scale_factor=1.0,
         add_upsample=True,
@@ -502,7 +504,7 @@ class CrossAttnUpBlock3D(nn.Module):
         temp_attentions = []
 
         self.has_cross_attention = True
-        self.
+        self.num_attention_heads = num_attention_heads
 
         for i in range(num_layers):
             res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
@@ -531,8 +533,8 @@ class CrossAttnUpBlock3D(nn.Module):
             )
             attentions.append(
                 Transformer2DModel(
-                    out_channels //
-
+                    out_channels // num_attention_heads,
+                    num_attention_heads,
                     in_channels=out_channels,
                     num_layers=1,
                     cross_attention_dim=cross_attention_dim,
@@ -544,8 +546,8 @@ class CrossAttnUpBlock3D(nn.Module):
             )
             temp_attentions.append(
                 TransformerTemporalModel(
-                    out_channels //
-
+                    out_channels // num_attention_heads,
+                    num_attention_heads,
                     in_channels=out_channels,
                     num_layers=1,
                     cross_attention_dim=cross_attention_dim,
@@ -590,10 +592,11 @@ class CrossAttnUpBlock3D(nn.Module):
                 hidden_states,
                 encoder_hidden_states=encoder_hidden_states,
                 cross_attention_kwargs=cross_attention_kwargs,
-
+                return_dict=False,
+            )[0]
             hidden_states = temp_attn(
-                hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs
-            )
+                hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs, return_dict=False
+            )[0]
 
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
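The repeated `return_dict=False` / `[0]` edits above reflect that the transformer sub-models return either an output dataclass or a plain tuple; the 3D blocks now request the tuple and take element 0 (the sample tensor). A small hedged demonstration with a tiny standalone `Transformer2DModel` (assumes torch and diffusers >= 0.18 are installed; the sizes are arbitrary):

```python
import torch
from diffusers.models import Transformer2DModel

# Tiny spatial transformer, just to show the two return forms.
model = Transformer2DModel(num_attention_heads=2, attention_head_dim=16, in_channels=32, num_layers=1)
model.eval()

x = torch.randn(1, 32, 8, 8)
with torch.no_grad():
    out_obj = model(x)                      # output dataclass with a `.sample` field
    out_tup = model(x, return_dict=False)   # plain tuple; element 0 is the sample tensor
print(out_obj.sample.shape, out_tup[0].shape)  # both torch.Size([1, 32, 8, 8])
```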
diffusers/models/unet_3d_condition.py

@@ -43,9 +43,11 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 @dataclass
 class UNet3DConditionOutput(BaseOutput):
     """
+    The output of [`UNet3DConditionModel`].
+
     Args:
         sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
-
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
     """
 
     sample: torch.FloatTensor
@@ -53,11 +55,11 @@ class UNet3DConditionOutput(BaseOutput):
 
 class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
     r"""
-
-
+    A conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+    shaped output.
 
-    This model inherits from [`ModelMixin`]. Check the superclass documentation for
-
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
 
     Parameters:
         sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
@@ -66,7 +68,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
         down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
             The tuple of downsample blocks to use.
-        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
            The tuple of upsample blocks to use.
         block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
@@ -75,10 +77,11 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
         act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
         norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
-            If `None`,
+            If `None`, normalization and activation layers is skipped in post-processing.
         norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
         cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
         attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        num_attention_heads (`int`, *optional*): The number of attention heads.
     """
 
     _supports_gradient_checkpointing = False
@@ -105,11 +108,25 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         norm_eps: float = 1e-5,
         cross_attention_dim: int = 1024,
         attention_head_dim: Union[int, Tuple[int]] = 64,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
     ):
         super().__init__()
 
         self.sample_size = sample_size
 
+        if num_attention_heads is not None:
+            raise NotImplementedError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+
         # Check inputs
         if len(down_block_types) != len(up_block_types):
             raise ValueError(
@@ -121,9 +138,9 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
                 f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
             )
 
-        if not isinstance(
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
             raise ValueError(
-                f"Must provide the same number of `
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
             )
 
         # input
@@ -156,8 +173,8 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         self.down_blocks = nn.ModuleList([])
         self.up_blocks = nn.ModuleList([])
 
-        if isinstance(
-
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
 
         # down
         output_channel = block_out_channels[0]
@@ -177,7 +194,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
                 resnet_act_fn=act_fn,
                 resnet_groups=norm_num_groups,
                 cross_attention_dim=cross_attention_dim,
-
+                num_attention_heads=num_attention_heads[i],
                 downsample_padding=downsample_padding,
                 dual_cross_attention=False,
             )
@@ -191,7 +208,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
             resnet_act_fn=act_fn,
             output_scale_factor=mid_block_scale_factor,
             cross_attention_dim=cross_attention_dim,
-
+            num_attention_heads=num_attention_heads[-1],
             resnet_groups=norm_num_groups,
             dual_cross_attention=False,
         )
@@ -201,7 +218,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
 
         # up
         reversed_block_out_channels = list(reversed(block_out_channels))
-
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
 
         output_channel = reversed_block_out_channels[0]
         for i, up_block_type in enumerate(up_block_types):
@@ -230,7 +247,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
                 resnet_act_fn=act_fn,
                 resnet_groups=norm_num_groups,
                 cross_attention_dim=cross_attention_dim,
-
+                num_attention_heads=reversed_num_attention_heads[i],
                 dual_cross_attention=False,
             )
             self.up_blocks.append(up_block)
@@ -281,13 +298,13 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         r"""
         Enable sliced attention computation.
 
-        When this option is enabled, the attention module
-
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
 
         Args:
             slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
-                When `"auto"`,
-                `"max"`, maximum amount of memory
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
                 provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
                 must be a multiple of `slice_size`.
         """
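The `def` line for the method documented above is not captured in this diff; in diffusers 0.18 the corresponding model-level method is `set_attention_slice`. A hedged usage sketch, assuming torch and diffusers >= 0.18 are installed and using the public damo-vilab text-to-video checkpoint (its UNet is a `UNet3DConditionModel`):

```python
import torch
from diffusers import UNet3DConditionModel

unet = UNet3DConditionModel.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", subfolder="unet", torch_dtype=torch.float16
)
# "auto" halves the input to the attention heads, "max" runs one slice at a time,
# and an int selects the number of slices, per the docstring above.
unet.set_attention_slice("auto")
```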
@@ -345,11 +362,15 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
     # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
     def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
         r"""
+        Sets the attention processor to use to compute attention.
+
         Parameters:
-
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                 The instantiated processor class or a dictionary of processor classes that will be set as the processor
-
-
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
 
         """
         count = len(self.attn_processors.keys())
@@ -373,6 +394,46 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         for name, module in self.named_children():
             fn_recursive_attn_processor(name, module, processor)
 
+    def enable_forward_chunking(self, chunk_size=None, dim=0):
+        """
+        Sets the attention processor to use [feed forward
+        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
+
+        Parameters:
+            chunk_size (`int`, *optional*):
+                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
+                over each tensor of dim=`dim`.
+            dim (`int`, *optional*, defaults to `0`):
+                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
+                or dim=1 (sequence length).
+        """
+        if dim not in [0, 1]:
+            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
+
+        # By default chunk size is 1
+        chunk_size = chunk_size or 1
+
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, chunk_size, dim)
+
+    def disable_forward_chunking(self):
+        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, None, 0)
+
     # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
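A hedged usage sketch for the new forward-chunking switch added above. It assumes a CUDA GPU, torch and diffusers >= 0.18, and the public damo-vilab text-to-video checkpoint; any pipeline whose `unet` is a `UNet3DConditionModel` exposes the same two calls.

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Chunk the feed-forward layers over the sequence dimension to lower peak memory.
pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
video_frames = pipe("a panda playing guitar", num_frames=16).frames
pipe.unet.disable_forward_chunking()
```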
@@ -398,21 +459,24 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         return_dict: bool = True,
     ) -> Union[UNet3DConditionOutput, Tuple]:
         r"""
+        The [`UNet3DConditionModel`] forward method.
+
         Args:
-            sample (`torch.FloatTensor`):
-
-
+            sample (`torch.FloatTensor`):
+                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
+            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.FloatTensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [
+                Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+                tuple.
             cross_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `
-                `self.processor` in
-                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
 
         Returns:
-            [`~models.
-
-
+            [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
+                a `tuple` is returned where the first element is the sample tensor.
         """
         # By default samples have to be AT least a multiple of the overall upsampling factor.
         # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
@@ -467,8 +531,11 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         sample = self.conv_in(sample)
 
         sample = self.transformer_in(
-            sample,
-
+            sample,
+            num_frames=num_frames,
+            cross_attention_kwargs=cross_attention_kwargs,
+            return_dict=False,
+        )[0]
 
         # 3. down
         down_block_res_samples = (sample,)