diffusers 0.31.0__py3-none-any.whl → 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +66 -5
- diffusers/callbacks.py +56 -3
- diffusers/configuration_utils.py +1 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +25 -17
- diffusers/loaders/__init__.py +22 -3
- diffusers/loaders/ip_adapter.py +538 -15
- diffusers/loaders/lora_base.py +124 -118
- diffusers/loaders/lora_conversion_utils.py +318 -3
- diffusers/loaders/lora_pipeline.py +1688 -368
- diffusers/loaders/peft.py +379 -0
- diffusers/loaders/single_file_model.py +71 -4
- diffusers/loaders/single_file_utils.py +519 -9
- diffusers/loaders/textual_inversion.py +3 -3
- diffusers/loaders/transformer_flux.py +181 -0
- diffusers/loaders/transformer_sd3.py +89 -0
- diffusers/loaders/unet.py +17 -4
- diffusers/models/__init__.py +47 -14
- diffusers/models/activations.py +22 -9
- diffusers/models/attention.py +13 -4
- diffusers/models/attention_flax.py +1 -1
- diffusers/models/attention_processor.py +2059 -281
- diffusers/models/autoencoders/__init__.py +5 -0
- diffusers/models/autoencoders/autoencoder_dc.py +620 -0
- diffusers/models/autoencoders/autoencoder_kl.py +2 -1
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +36 -27
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
- diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
- diffusers/models/autoencoders/vae.py +18 -5
- diffusers/models/controlnet.py +47 -802
- diffusers/models/controlnet_flux.py +29 -495
- diffusers/models/controlnet_sd3.py +25 -379
- diffusers/models/controlnet_sparsectrl.py +46 -718
- diffusers/models/controlnets/__init__.py +23 -0
- diffusers/models/controlnets/controlnet.py +872 -0
- diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
- diffusers/models/controlnets/controlnet_flux.py +536 -0
- diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
- diffusers/models/controlnets/controlnet_sd3.py +489 -0
- diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
- diffusers/models/controlnets/controlnet_union.py +832 -0
- diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
- diffusers/models/controlnets/multicontrolnet.py +183 -0
- diffusers/models/embeddings.py +838 -43
- diffusers/models/model_loading_utils.py +88 -6
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +74 -28
- diffusers/models/normalization.py +78 -13
- diffusers/models/transformers/__init__.py +5 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +2 -2
- diffusers/models/transformers/cogvideox_transformer_3d.py +46 -11
- diffusers/models/transformers/dit_transformer_2d.py +1 -1
- diffusers/models/transformers/latte_transformer_3d.py +4 -4
- diffusers/models/transformers/pixart_transformer_2d.py +1 -1
- diffusers/models/transformers/sana_transformer.py +488 -0
- diffusers/models/transformers/stable_audio_transformer.py +1 -1
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +422 -0
- diffusers/models/transformers/transformer_cogview3plus.py +1 -1
- diffusers/models/transformers/transformer_flux.py +30 -9
- diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
- diffusers/models/transformers/transformer_ltx.py +469 -0
- diffusers/models/transformers/transformer_mochi.py +499 -0
- diffusers/models/transformers/transformer_sd3.py +105 -17
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +8 -1
- diffusers/models/unets/unet_2d_blocks.py +88 -21
- diffusers/models/unets/unet_2d_condition.py +1 -1
- diffusers/models/unets/unet_3d_blocks.py +9 -7
- diffusers/models/unets/unet_motion_model.py +5 -5
- diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
- diffusers/models/unets/unet_stable_cascade.py +2 -2
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +8 -0
- diffusers/pipelines/__init__.py +34 -0
- diffusers/pipelines/allegro/__init__.py +48 -0
- diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
- diffusers/pipelines/allegro/pipeline_output.py +23 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +8 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1 -1
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +0 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +8 -8
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -8
- diffusers/pipelines/auto_pipeline.py +53 -6
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +50 -22
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +51 -20
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +69 -21
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +47 -21
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +1 -1
- diffusers/pipelines/controlnet/__init__.py +86 -80
- diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
- diffusers/pipelines/controlnet/pipeline_controlnet.py +11 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +3 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +5 -1
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +53 -19
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -8
- diffusers/pipelines/flux/__init__.py +13 -1
- diffusers/pipelines/flux/modeling_flux.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +204 -29
- diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +49 -27
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +40 -30
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +78 -56
- diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +33 -27
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +36 -29
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
- diffusers/pipelines/flux/pipeline_output.py +16 -0
- diffusers/pipelines/hunyuan_video/__init__.py +48 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
- diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +5 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
- diffusers/pipelines/kolors/text_encoder.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/ltx/__init__.py +50 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
- diffusers/pipelines/ltx/pipeline_output.py +20 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +1 -8
- diffusers/pipelines/mochi/__init__.py +48 -0
- diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
- diffusers/pipelines/mochi/pipeline_output.py +20 -0
- diffusers/pipelines/pag/__init__.py +7 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +6 -13
- diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +3 -0
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
- diffusers/pipelines/pipeline_flax_utils.py +1 -1
- diffusers/pipelines/pipeline_loading_utils.py +25 -4
- diffusers/pipelines/pipeline_utils.py +35 -6
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +6 -13
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +6 -13
- diffusers/pipelines/sana/__init__.py +47 -0
- diffusers/pipelines/sana/pipeline_output.py +21 -0
- diffusers/pipelines/sana/pipeline_sana.py +884 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +216 -20
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +62 -9
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +57 -8
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -8
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -8
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -8
- diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/quantizers/auto.py +14 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -1
- diffusers/quantizers/gguf/__init__.py +1 -0
- diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
- diffusers/quantizers/gguf/utils.py +456 -0
- diffusers/quantizers/quantization_config.py +280 -2
- diffusers/quantizers/torchao/__init__.py +15 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
- diffusers/schedulers/scheduling_ddpm.py +2 -6
- diffusers/schedulers/scheduling_ddpm_parallel.py +2 -6
- diffusers/schedulers/scheduling_deis_multistep.py +28 -9
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +35 -9
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +35 -8
- diffusers/schedulers/scheduling_dpmsolver_sde.py +4 -4
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +48 -10
- diffusers/schedulers/scheduling_euler_discrete.py +4 -4
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
- diffusers/schedulers/scheduling_heun_discrete.py +4 -4
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +4 -4
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +4 -4
- diffusers/schedulers/scheduling_lcm.py +2 -6
- diffusers/schedulers/scheduling_lms_discrete.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +1 -1
- diffusers/schedulers/scheduling_sasolver.py +28 -9
- diffusers/schedulers/scheduling_tcd.py +2 -6
- diffusers/schedulers/scheduling_unipc_multistep.py +53 -8
- diffusers/training_utils.py +16 -2
- diffusers/utils/__init__.py +5 -0
- diffusers/utils/constants.py +1 -0
- diffusers/utils/dummy_pt_objects.py +180 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dynamic_modules_utils.py +3 -3
- diffusers/utils/hub_utils.py +31 -39
- diffusers/utils/import_utils.py +67 -0
- diffusers/utils/peft_utils.py +3 -0
- diffusers/utils/testing_utils.py +56 -1
- diffusers/utils/torch_utils.py +3 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/METADATA +69 -69
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/RECORD +214 -162
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
@@ -17,6 +17,7 @@
 import importlib
 import inspect
 import os
+from array import array
 from collections import OrderedDict
 from pathlib import Path
 from typing import List, Optional, Union
@@ -25,8 +26,8 @@ import safetensors
 import torch
 from huggingface_hub.utils import EntryNotFoundError
 
-from ..quantizers.quantization_config import QuantizationMethod
 from ..utils import (
+    GGUF_FILE_EXTENSION,
     SAFE_WEIGHTS_INDEX_NAME,
     SAFETENSORS_FILE_EXTENSION,
     WEIGHTS_INDEX_NAME,
@@ -34,6 +35,8 @@ from ..utils import (
     _get_model_file,
     deprecate,
     is_accelerate_available,
+    is_gguf_available,
+    is_torch_available,
     is_torch_version,
     logging,
 )
@@ -140,6 +143,8 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[
     file_extension = os.path.basename(checkpoint_file).split(".")[-1]
     if file_extension == SAFETENSORS_FILE_EXTENSION:
         return safetensors.torch.load_file(checkpoint_file, device="cpu")
+    elif file_extension == GGUF_FILE_EXTENSION:
+        return load_gguf_checkpoint(checkpoint_file)
     else:
         weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {}
         return torch.load(
@@ -176,11 +181,12 @@ def load_model_dict_into_meta(
     hf_quantizer=None,
     keep_in_fp32_modules=None,
 ) -> List[str]:
+    if device is not None and not isinstance(device, (str, torch.device)):
+        raise ValueError(f"Expected device to have type `str` or `torch.device`, but got {type(device)=}.")
     if hf_quantizer is None:
         device = device or torch.device("cpu")
     dtype = dtype or torch.float32
     is_quantized = hf_quantizer is not None
-    is_quant_method_bnb = getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES
 
     accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
     empty_state_dict = model.state_dict()
@@ -211,17 +217,18 @@ def load_model_dict_into_meta(
         set_module_kwargs["dtype"] = dtype
 
         # bnb params are flattened.
+        # gguf quants have a different shape based on the type of quantization applied
         if empty_state_dict[param_name].shape != param.shape:
             if (
-                is_quant_method_bnb
+                is_quantized
                 and hf_quantizer.pre_quantized
                 and hf_quantizer.check_if_quantized_param(model, param, param_name, state_dict, param_device=device)
             ):
-                hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name].shape, param.shape)
-            elif not is_quant_method_bnb:
+                hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name], param)
+            else:
                 model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else ""
                 raise ValueError(
-                    f"Cannot load {model_name_or_path_str} because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."
+                    f"Cannot load {model_name_or_path_str} because {param_name} expected shape {empty_state_dict[param_name].shape}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."
                 )
 
         if is_quantized and (
@@ -396,3 +403,78 @@ def _fetch_index_file_legacy(
         index_file = None
 
     return index_file
+
+
+def _gguf_parse_value(_value, data_type):
+    if not isinstance(data_type, list):
+        data_type = [data_type]
+    if len(data_type) == 1:
+        data_type = data_type[0]
+        array_data_type = None
+    else:
+        if data_type[0] != 9:
+            raise ValueError("Received multiple types, therefore expected the first type to indicate an array.")
+        data_type, array_data_type = data_type
+
+    if data_type in [0, 1, 2, 3, 4, 5, 10, 11]:
+        _value = int(_value[0])
+    elif data_type in [6, 12]:
+        _value = float(_value[0])
+    elif data_type in [7]:
+        _value = bool(_value[0])
+    elif data_type in [8]:
+        _value = array("B", list(_value)).tobytes().decode()
+    elif data_type in [9]:
+        _value = _gguf_parse_value(_value, array_data_type)
+    return _value
+
+
+def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
+    """
+    Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed tokenizer and config
+    attributes.
+
+    Args:
+        gguf_checkpoint_path (`str`):
+            The path the to GGUF file to load
+        return_tensors (`bool`, defaults to `True`):
+            Whether to read the tensors from the file and return them. Not doing so is faster and only loads the
+            metadata in memory.
+    """
+
+    if is_gguf_available() and is_torch_available():
+        import gguf
+        from gguf import GGUFReader
+
+        from ..quantizers.gguf.utils import SUPPORTED_GGUF_QUANT_TYPES, GGUFParameter
+    else:
+        logger.error(
+            "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see "
+            "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions."
+        )
+        raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.")
+
+    reader = GGUFReader(gguf_checkpoint_path)
+
+    parsed_parameters = {}
+    for tensor in reader.tensors:
+        name = tensor.name
+        quant_type = tensor.tensor_type
+
+        # if the tensor is a torch supported dtype do not use GGUFParameter
+        is_gguf_quant = quant_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16]
+        if is_gguf_quant and quant_type not in SUPPORTED_GGUF_QUANT_TYPES:
+            _supported_quants_str = "\n".join([str(type) for type in SUPPORTED_GGUF_QUANT_TYPES])
+            raise ValueError(
+                (
+                    f"{name} has a quantization type: {str(quant_type)} which is unsupported."
+                    "\n\nCurrently the following quantization types are supported: \n\n"
+                    f"{_supported_quants_str}"
+                    "\n\nTo request support for this quantization type please open an issue here: https://github.com/huggingface/diffusers"
+                )
+            )
+
+        weights = torch.from_numpy(tensor.data.copy())
+        parsed_parameters[name] = GGUFParameter(weights, quant_type=quant_type) if is_gguf_quant else weights
+
+    return parsed_parameters
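
A minimal usage sketch of the GGUF loading path added above, assuming the `GGUFQuantizationConfig` entry point that ships in 0.32.0; the checkpoint URL is illustrative only:

    import torch
    from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig

    # load_gguf_checkpoint() parses the GGUF file; quantized tensors come back as GGUFParameter
    # objects and are dequantized on the fly by the GGUF quantizer backend.
    transformer = FluxTransformer2DModel.from_single_file(
        "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf",  # illustrative URL
        quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
        torch_dtype=torch.bfloat16,
    )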
@@ -530,7 +530,7 @@ class FlaxModelMixin(PushToHubMixin):
 
         if push_to_hub:
             commit_message = kwargs.pop("commit_message", None)
-            private = kwargs.pop("private", False)
+            private = kwargs.pop("private", None)
             create_pr = kwargs.pop("create_pr", False)
             token = kwargs.pop("token", None)
             repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
@@ -99,21 +99,39 @@ def get_parameter_device(parameter: torch.nn.Module) -> torch.device:
 
 
 def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    """
+    Returns the first found floating dtype in parameters if there is one, otherwise returns the last dtype it found.
+    """
+    last_dtype = None
+    for param in parameter.parameters():
+        last_dtype = param.dtype
+        if param.is_floating_point():
+            return param.dtype
+
+    for buffer in parameter.buffers():
+        last_dtype = buffer.dtype
+        if buffer.is_floating_point():
+            return buffer.dtype
+
+    if last_dtype is not None:
+        # if no floating dtype was found return whatever the first dtype is
+        return last_dtype
+
+    # For nn.DataParallel compatibility in PyTorch > 1.5
+    def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
+        tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
+        return tuples
+
+    gen = parameter._named_members(get_members_fn=find_tensor_attributes)
+    last_tuple = None
+    for tuple in gen:
+        last_tuple = tuple
+        if tuple[1].is_floating_point():
+            return tuple[1].dtype
+
+    if last_tuple is not None:
+        # fallback to the last dtype
+        return last_tuple[1].dtype
 
 
 class ModelMixin(torch.nn.Module, PushToHubMixin):
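
The rewritten `get_parameter_dtype` above prefers the first floating-point dtype instead of blindly taking the first parameter, which matters once quantized (non-floating) weights are present. A hedged sketch with an illustrative module, not taken from the diff:

    import torch
    import torch.nn as nn
    from diffusers.models.modeling_utils import get_parameter_dtype

    class Toy(nn.Module):
        def __init__(self):
            super().__init__()
            # e.g. a pre-quantized weight stored as int8 (non-floating)
            self.qweight = nn.Parameter(torch.zeros(4, 4, dtype=torch.int8), requires_grad=False)
            self.norm = nn.LayerNorm(4)  # float32 parameters

    print(get_parameter_dtype(Toy()))  # torch.float32: the first floating dtype wins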
@@ -208,6 +226,35 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
         """
         self.set_use_npu_flash_attention(False)
 
+    def set_use_xla_flash_attention(
+        self, use_xla_flash_attention: bool, partition_spec: Optional[Callable] = None
+    ) -> None:
+        # Recursively walk through all the children.
+        # Any children which exposes the set_use_xla_flash_attention method
+        # gets the message
+        def fn_recursive_set_flash_attention(module: torch.nn.Module):
+            if hasattr(module, "set_use_xla_flash_attention"):
+                module.set_use_xla_flash_attention(use_xla_flash_attention, partition_spec)
+
+            for child in module.children():
+                fn_recursive_set_flash_attention(child)
+
+        for module in self.children():
+            if isinstance(module, torch.nn.Module):
+                fn_recursive_set_flash_attention(module)
+
+    def enable_xla_flash_attention(self, partition_spec: Optional[Callable] = None):
+        r"""
+        Enable the flash attention pallals kernel for torch_xla.
+        """
+        self.set_use_xla_flash_attention(True, partition_spec)
+
+    def disable_xla_flash_attention(self):
+        r"""
+        Disable the flash attention pallals kernel for torch_xla.
+        """
+        self.set_use_xla_flash_attention(False)
+
     def set_use_memory_efficient_attention_xformers(
         self, valid: bool, attention_op: Optional[Callable] = None
     ) -> None:
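
The new hooks above only forward a flag to any attention processor that implements `set_use_xla_flash_attention`; a minimal sketch, where `model` stands in for any loaded `ModelMixin` subclass:

    # Hedged sketch: toggle the torch_xla flash-attention (Pallas) kernel on a loaded model.
    model.enable_xla_flash_attention(partition_spec=None)  # propagate the flag to attention processors
    model.disable_xla_flash_attention()                    # restore the default attention path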
@@ -338,7 +385,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
 
         if push_to_hub:
             commit_message = kwargs.pop("commit_message", None)
-            private = kwargs.pop("private", False)
+            private = kwargs.pop("private", None)
             create_pr = kwargs.pop("create_pr", False)
             token = kwargs.pop("token", None)
             repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
@@ -671,10 +718,12 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
             hf_quantizer = None
 
         if hf_quantizer is not None:
-            if device_map is not None:
+            is_bnb_quantization_method = hf_quantizer.quantization_config.quant_method.value == "bitsandbytes"
+            if is_bnb_quantization_method and device_map is not None:
                 raise NotImplementedError(
-                    "Currently, `device_map` is automatically inferred for quantized models. Support for providing `device_map` as an input will be added in the future."
+                    "Currently, `device_map` is automatically inferred for quantized bitsandbytes models. Support for providing `device_map` as an input will be added in the future."
                 )
+
             hf_quantizer.validate_environment(torch_dtype=torch_dtype, from_flax=from_flax, device_map=device_map)
             torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype)
 
@@ -771,7 +820,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
                 revision=revision,
                 subfolder=subfolder or "",
             )
-            if hf_quantizer is not None:
+            if hf_quantizer is not None and is_bnb_quantization_method:
                 model_file = _merge_sharded_checkpoints(sharded_ckpt_cached_folder, sharded_metadata)
                 logger.info("Merged sharded checkpoints as `hf_quantizer` is not None.")
                 is_sharded = False
@@ -829,14 +878,11 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
            if device_map is None and not is_sharded:
                # `torch.cuda.current_device()` is fine here when `hf_quantizer` is not None.
                # It would error out during the `validate_environment()` call above in the absence of cuda.
-               is_quant_method_bnb = (
-                   getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES
-               )
                if hf_quantizer is None:
                    param_device = "cpu"
                # TODO (sayakpaul, SunMarc): remove this after model loading refactor
-               elif is_quant_method_bnb:
-                   param_device = torch.cuda.current_device()
+               else:
+                   param_device = torch.device(torch.cuda.current_device())
                state_dict = load_state_dict(model_file, variant=variant)
                model._convert_deprecated_attention_blocks(state_dict)
 
@@ -1010,14 +1056,14 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
                     dtype_present_in_args = True
                     break
 
-
-        if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES:
+        if getattr(self, "is_quantized", False):
             if dtype_present_in_args:
                 raise ValueError(
-                    "
-                    "
+                    "Casting a quantized model to a new `dtype` is unsupported. To set the dtype of unquantized layers, please "
+                    "use the `torch_dtype` argument when loading the model using `from_pretrained` or `from_single_file`"
                 )
 
+        if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES:
             if getattr(self, "is_loaded_in_8bit", False):
                 raise ValueError(
                     "`.to` is not supported for `8-bit` bitsandbytes models. Please use the model as it is, since the"
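
A short sketch of what the widened check above means in practice; `quantized_model` is an illustrative stand-in for any quantized diffusers model:

    # With the new `is_quantized` guard, dtype casts are rejected for every quantization
    # backend (bitsandbytes, GGUF, torchao), not just bitsandbytes.
    quantized_model.to("cuda")          # still allowed: device move only
    quantized_model.to(torch.float16)   # raises ValueError: casting a quantized model is unsupported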
@@ -22,10 +22,7 @@ import torch.nn.functional as F
 
 from ..utils import is_torch_version
 from .activations import get_activation
-from .embeddings import (
-    CombinedTimestepLabelEmbeddings,
-    PixArtAlphaCombinedTimestepSizeEmbeddings,
-)
+from .embeddings import CombinedTimestepLabelEmbeddings, PixArtAlphaCombinedTimestepSizeEmbeddings
 
 
 class AdaLayerNorm(nn.Module):
@@ -266,6 +263,7 @@ class AdaLayerNormSingle(nn.Module):
         hidden_dtype: Optional[torch.dtype] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         # No modulation happening here.
+        added_cond_kwargs = added_cond_kwargs or {"resolution": None, "aspect_ratio": None}
         embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
         return self.linear(self.silu(embedded_timestep)), embedded_timestep
 
@@ -358,20 +356,21 @@ class LuminaLayerNormContinuous(nn.Module):
         out_dim: Optional[int] = None,
     ):
         super().__init__()
+
         # AdaLN
         self.silu = nn.SiLU()
         self.linear_1 = nn.Linear(conditioning_embedding_dim, embedding_dim, bias=bias)
+
         if norm_type == "layer_norm":
             self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine)
         else:
             raise ValueError(f"unknown norm_type {norm_type}")
-
+
+        self.linear_2 = None
         if out_dim is not None:
-            self.linear_2 = nn.Linear(
-                embedding_dim,
-                out_dim,
-                bias=bias,
-            )
+            self.linear_2 = nn.Linear(embedding_dim, out_dim, bias=bias)
 
     def forward(
         self,
@@ -486,20 +485,24 @@ else:
 
 
 class RMSNorm(nn.Module):
-    def __init__(self, dim, eps: float, elementwise_affine: bool = True):
+    def __init__(self, dim, eps: float, elementwise_affine: bool = True, bias: bool = False):
         super().__init__()
 
         self.eps = eps
+        self.elementwise_affine = elementwise_affine
 
         if isinstance(dim, numbers.Integral):
             dim = (dim,)
 
         self.dim = torch.Size(dim)
 
+        self.weight = None
+        self.bias = None
+
         if elementwise_affine:
             self.weight = nn.Parameter(torch.ones(dim))
-        else:
-            self.weight = None
+            if bias:
+                self.bias = nn.Parameter(torch.zeros(dim))
 
     def forward(self, hidden_states):
         input_dtype = hidden_states.dtype
@@ -511,12 +514,44 @@ class RMSNorm(nn.Module):
             if self.weight.dtype in [torch.float16, torch.bfloat16]:
                 hidden_states = hidden_states.to(self.weight.dtype)
             hidden_states = hidden_states * self.weight
+            if self.bias is not None:
+                hidden_states = hidden_states + self.bias
         else:
             hidden_states = hidden_states.to(input_dtype)
 
         return hidden_states
 
 
+# TODO: (Dhruv) This can be replaced with regular RMSNorm in Mochi once `_keep_in_fp32_modules` is supported
+# for sharded checkpoints, see: https://github.com/huggingface/diffusers/issues/10013
+class MochiRMSNorm(nn.Module):
+    def __init__(self, dim, eps: float, elementwise_affine: bool = True):
+        super().__init__()
+
+        self.eps = eps
+
+        if isinstance(dim, numbers.Integral):
+            dim = (dim,)
+
+        self.dim = torch.Size(dim)
+
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.ones(dim))
+        else:
+            self.weight = None
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
+
+        if self.weight is not None:
+            hidden_states = hidden_states * self.weight
+        hidden_states = hidden_states.to(input_dtype)
+
+        return hidden_states
+
+
 class GlobalResponseNorm(nn.Module):
     # Taken from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105
     def __init__(self, dim):
@@ -528,3 +563,33 @@ class GlobalResponseNorm(nn.Module):
         gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
         nx = gx / (gx.mean(dim=-1, keepdim=True) + 1e-6)
         return self.gamma * (x * nx) + self.beta + x
+
+
+class LpNorm(nn.Module):
+    def __init__(self, p: int = 2, dim: int = -1, eps: float = 1e-12):
+        super().__init__()
+
+        self.p = p
+        self.dim = dim
+        self.eps = eps
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return F.normalize(hidden_states, p=self.p, dim=self.dim, eps=self.eps)
+
+
+def get_normalization(
+    norm_type: str = "batch_norm",
+    num_features: Optional[int] = None,
+    eps: float = 1e-5,
+    elementwise_affine: bool = True,
+    bias: bool = True,
+) -> nn.Module:
+    if norm_type == "rms_norm":
+        norm = RMSNorm(num_features, eps=eps, elementwise_affine=elementwise_affine, bias=bias)
+    elif norm_type == "layer_norm":
+        norm = nn.LayerNorm(num_features, eps=eps, elementwise_affine=elementwise_affine, bias=bias)
+    elif norm_type == "batch_norm":
+        norm = nn.BatchNorm2d(num_features, eps=eps, affine=elementwise_affine)
+    else:
+        raise ValueError(f"{norm_type=} is not supported.")
+    return norm
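
A minimal sketch of the helpers added above, assuming they are imported from `diffusers.models.normalization`:

    import torch
    from diffusers.models.normalization import get_normalization

    # RMSNorm now accepts an optional learnable bias; get_normalization is a small factory
    # over rms_norm / layer_norm / batch_norm.
    norm = get_normalization("rms_norm", num_features=64, eps=1e-5, elementwise_affine=True, bias=True)
    out = norm(torch.randn(2, 16, 64))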
@@ -11,10 +11,15 @@ if is_torch_available():
     from .lumina_nextdit2d import LuminaNextDiT2DModel
     from .pixart_transformer_2d import PixArtTransformer2DModel
     from .prior_transformer import PriorTransformer
+    from .sana_transformer import SanaTransformer2DModel
     from .stable_audio_transformer import StableAudioDiTModel
     from .t5_film_transformer import T5FilmDecoder
     from .transformer_2d import Transformer2DModel
+    from .transformer_allegro import AllegroTransformer3DModel
     from .transformer_cogview3plus import CogView3PlusTransformer2DModel
     from .transformer_flux import FluxTransformer2DModel
+    from .transformer_hunyuan_video import HunyuanVideoTransformer3DModel
+    from .transformer_ltx import LTXVideoTransformer3DModel
+    from .transformer_mochi import MochiTransformer3DModel
     from .transformer_sd3 import SD3Transformer2DModel
     from .transformer_temporal import TransformerTemporalModel
@@ -466,7 +466,7 @@ class AuraFlowTransformer2DModel(ModelMixin, ConfigMixin):
 
         # MMDiT blocks.
         for index_block, block in enumerate(self.joint_transformer_blocks):
-            if self.training and self.gradient_checkpointing:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
 
                 def create_custom_forward(module, return_dict=None):
                     def custom_forward(*inputs):
@@ -497,7 +497,7 @@ class AuraFlowTransformer2DModel(ModelMixin, ConfigMixin):
         combined_hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
 
         for index_block, block in enumerate(self.single_transformer_blocks):
-            if self.training and self.gradient_checkpointing:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
 
                 def create_custom_forward(module, return_dict=None):
                     def custom_forward(*inputs):
@@ -170,6 +170,8 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
             Whether to flip the sin to cos in the time embedding.
         time_embed_dim (`int`, defaults to `512`):
             Output dimension of timestep embeddings.
+        ofs_embed_dim (`int`, defaults to `512`):
+            Output dimension of "ofs" embeddings used in CogVideoX-5b-I2B in version 1.5
         text_embed_dim (`int`, defaults to `4096`):
             Input dimension of text embeddings from the text encoder.
         num_layers (`int`, defaults to `30`):
@@ -177,7 +179,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         dropout (`float`, defaults to `0.0`):
             The dropout probability to use.
         attention_bias (`bool`, defaults to `True`):
-            Whether
+            Whether to use bias in the attention projection layers.
         sample_width (`int`, defaults to `90`):
             The width of the input latents.
         sample_height (`int`, defaults to `60`):
@@ -198,7 +200,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         timestep_activation_fn (`str`, defaults to `"silu"`):
             Activation function to use when generating the timestep embeddings.
         norm_elementwise_affine (`bool`, defaults to `True`):
-            Whether
+            Whether to use elementwise affine in normalization layers.
         norm_eps (`float`, defaults to `1e-5`):
             The epsilon value to use in normalization layers.
         spatial_interpolation_scale (`float`, defaults to `1.875`):
@@ -219,6 +221,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
         time_embed_dim: int = 512,
+        ofs_embed_dim: Optional[int] = None,
         text_embed_dim: int = 4096,
         num_layers: int = 30,
         dropout: float = 0.0,
@@ -227,6 +230,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         sample_height: int = 60,
         sample_frames: int = 49,
         patch_size: int = 2,
+        patch_size_t: Optional[int] = None,
         temporal_compression_ratio: int = 4,
         max_text_seq_length: int = 226,
         activation_fn: str = "gelu-approximate",
@@ -237,6 +241,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         temporal_interpolation_scale: float = 1.0,
         use_rotary_positional_embeddings: bool = False,
         use_learned_positional_embeddings: bool = False,
+        patch_bias: bool = True,
     ):
         super().__init__()
         inner_dim = num_attention_heads * attention_head_dim
@@ -251,10 +256,11 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         # 1. Patch embedding
         self.patch_embed = CogVideoXPatchEmbed(
             patch_size=patch_size,
+            patch_size_t=patch_size_t,
             in_channels=in_channels,
             embed_dim=inner_dim,
             text_embed_dim=text_embed_dim,
-            bias=True,
+            bias=patch_bias,
             sample_width=sample_width,
             sample_height=sample_height,
             sample_frames=sample_frames,
@@ -267,10 +273,19 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         )
         self.embedding_dropout = nn.Dropout(dropout)
 
-        # 2. Time embeddings
+        # 2. Time embeddings and ofs embedding(Only CogVideoX1.5-5B I2V have)
+
         self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
         self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
 
+        self.ofs_proj = None
+        self.ofs_embedding = None
+        if ofs_embed_dim:
+            self.ofs_proj = Timesteps(ofs_embed_dim, flip_sin_to_cos, freq_shift)
+            self.ofs_embedding = TimestepEmbedding(
+                ofs_embed_dim, ofs_embed_dim, timestep_activation_fn
+            )  # same as time embeddings, for ofs
+
         # 3. Define spatio-temporal transformers blocks
         self.transformer_blocks = nn.ModuleList(
             [
@@ -298,7 +313,15 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
             norm_eps=norm_eps,
             chunk_dim=1,
         )
-        self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels)
+
+        if patch_size_t is None:
+            # For CogVideox 1.0
+            output_dim = patch_size * patch_size * out_channels
+        else:
+            # For CogVideoX 1.5
+            output_dim = patch_size * patch_size * patch_size_t * out_channels
+
+        self.proj_out = nn.Linear(inner_dim, output_dim)
 
         self.gradient_checkpointing = False
 
@@ -411,6 +434,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         encoder_hidden_states: torch.Tensor,
         timestep: Union[int, float, torch.LongTensor],
         timestep_cond: Optional[torch.Tensor] = None,
+        ofs: Optional[Union[int, float, torch.LongTensor]] = None,
         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
@@ -442,6 +466,12 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         t_emb = t_emb.to(dtype=hidden_states.dtype)
         emb = self.time_embedding(t_emb, timestep_cond)
 
+        if self.ofs_embedding is not None:
+            ofs_emb = self.ofs_proj(ofs)
+            ofs_emb = ofs_emb.to(dtype=hidden_states.dtype)
+            ofs_emb = self.ofs_embedding(ofs_emb)
+            emb = emb + ofs_emb
+
         # 2. Patch embedding
         hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
         hidden_states = self.embedding_dropout(hidden_states)
@@ -452,7 +482,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
 
         # 3. Transformer blocks
         for i, block in enumerate(self.transformer_blocks):
-            if self.training and self.gradient_checkpointing:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
 
                 def create_custom_forward(module):
                     def custom_forward(*inputs):
@@ -491,12 +521,17 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         hidden_states = self.proj_out(hidden_states)
 
         # 5. Unpatchify
-        # Note: we use `-1` instead of `channels`:
-        # - It is okay to `channels` use for CogVideoX-2b and CogVideoX-5b (number of input channels is equal to output channels)
-        # - However, for CogVideoX-5b-I2V also takes concatenated input image latents (number of input channels is twice the output channels)
         p = self.config.patch_size
-        output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
-        output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
+        p_t = self.config.patch_size_t
+
+        if p_t is None:
+            output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
+            output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
+        else:
+            output = hidden_states.reshape(
+                batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
+            )
+            output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
 
         if USE_PEFT_BACKEND:
             # remove `lora_scale` from each PEFT layer
|
@@ -184,7 +184,7 @@ class DiTTransformer2DModel(ModelMixin, ConfigMixin):
|
|
184
184
|
|
185
185
|
# 2. Blocks
|
186
186
|
for block in self.transformer_blocks:
|
187
|
-
if
|
187
|
+
if torch.is_grad_enabled() and self.gradient_checkpointing:
|
188
188
|
|
189
189
|
def create_custom_forward(module, return_dict=None):
|
190
190
|
def custom_forward(*inputs):
|