diffusers 0.28.2__py3-none-any.whl → 0.29.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +15 -1
- diffusers/commands/env.py +1 -5
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +2 -1
- diffusers/loaders/__init__.py +2 -2
- diffusers/loaders/lora.py +406 -140
- diffusers/loaders/lora_conversion_utils.py +7 -1
- diffusers/loaders/single_file.py +13 -1
- diffusers/loaders/single_file_model.py +15 -8
- diffusers/loaders/single_file_utils.py +267 -17
- diffusers/loaders/unet.py +307 -272
- diffusers/models/__init__.py +7 -3
- diffusers/models/attention.py +125 -1
- diffusers/models/attention_processor.py +169 -1
- diffusers/models/autoencoders/__init__.py +1 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
- diffusers/models/autoencoders/autoencoder_kl.py +17 -6
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -2
- diffusers/models/autoencoders/consistency_decoder_vae.py +9 -9
- diffusers/models/autoencoders/vq_model.py +182 -0
- diffusers/models/controlnet_sd3.py +418 -0
- diffusers/models/controlnet_xs.py +6 -6
- diffusers/models/embeddings.py +112 -84
- diffusers/models/model_loading_utils.py +55 -0
- diffusers/models/modeling_utils.py +138 -20
- diffusers/models/normalization.py +11 -6
- diffusers/models/transformers/__init__.py +1 -0
- diffusers/models/transformers/dual_transformer_2d.py +5 -4
- diffusers/models/transformers/hunyuan_transformer_2d.py +149 -2
- diffusers/models/transformers/prior_transformer.py +5 -5
- diffusers/models/transformers/transformer_2d.py +2 -2
- diffusers/models/transformers/transformer_sd3.py +353 -0
- diffusers/models/transformers/transformer_temporal.py +12 -10
- diffusers/models/unets/unet_1d.py +3 -3
- diffusers/models/unets/unet_2d.py +3 -3
- diffusers/models/unets/unet_2d_condition.py +4 -15
- diffusers/models/unets/unet_3d_condition.py +5 -17
- diffusers/models/unets/unet_i2vgen_xl.py +4 -4
- diffusers/models/unets/unet_motion_model.py +4 -4
- diffusers/models/unets/unet_spatio_temporal_condition.py +3 -3
- diffusers/models/vq_model.py +8 -165
- diffusers/pipelines/__init__.py +11 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +4 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +4 -3
- diffusers/pipelines/auto_pipeline.py +8 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +4 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +4 -3
- diffusers/pipelines/controlnet_sd3/__init__.py +53 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +1062 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +4 -3
- diffusers/pipelines/deepfloyd_if/watermark.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +4 -3
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +4 -3
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +24 -5
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +4 -3
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +4 -3
- diffusers/pipelines/marigold/marigold_image_processing.py +35 -20
- diffusers/pipelines/pia/pipeline_pia.py +4 -3
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +1 -1
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +1 -1
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +17 -17
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +52 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_output.py +21 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +904 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +941 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +4 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +10 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +4 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +4 -3
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +4 -3
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +4 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +4 -3
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +4 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +4 -3
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -3
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +4 -3
- diffusers/schedulers/__init__.py +2 -0
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -3
- diffusers/schedulers/scheduling_edm_euler.py +2 -4
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +287 -0
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/training_utils.py +4 -4
- diffusers/utils/__init__.py +3 -0
- diffusers/utils/constants.py +2 -0
- diffusers/utils/dummy_pt_objects.py +60 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +45 -0
- diffusers/utils/dynamic_modules_utils.py +15 -13
- diffusers/utils/hub_utils.py +106 -0
- diffusers/utils/import_utils.py +0 -1
- diffusers/utils/logging.py +3 -1
- diffusers/utils/state_dict_utils.py +2 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/METADATA +3 -3
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/RECORD +112 -112
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/WHEEL +1 -1
- diffusers/models/dual_transformer_2d.py +0 -20
- diffusers/models/prior_transformer.py +0 -12
- diffusers/models/t5_film_transformer.py +0 -70
- diffusers/models/transformer_2d.py +0 -25
- diffusers/models/transformer_temporal.py +0 -34
- diffusers/models/unet_1d.py +0 -26
- diffusers/models/unet_1d_blocks.py +0 -203
- diffusers/models/unet_2d.py +0 -27
- diffusers/models/unet_2d_blocks.py +0 -375
- diffusers/models/unet_2d_condition.py +0 -25
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/LICENSE +0 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.28.2.dist-info → diffusers-0.29.1.dist-info}/top_level.txt +0 -0
diffusers/models/embeddings.py
CHANGED
@@ -123,7 +123,7 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
 
 
 class PatchEmbed(nn.Module):
-    """2D Image to Patch Embedding"""
+    """2D Image to Patch Embedding with support for SD3 cropping."""
 
     def __init__(
         self,
@@ -137,12 +137,14 @@ class PatchEmbed(nn.Module):
         bias=True,
         interpolation_scale=1,
         pos_embed_type="sincos",
+        pos_embed_max_size=None,  # For SD3 cropping
     ):
         super().__init__()
 
         num_patches = (height // patch_size) * (width // patch_size)
         self.flatten = flatten
         self.layer_norm = layer_norm
+        self.pos_embed_max_size = pos_embed_max_size
 
         self.proj = nn.Conv2d(
             in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
@@ -153,26 +155,55 @@ class PatchEmbed(nn.Module):
         self.norm = None
 
         self.patch_size = patch_size
-        # See:
-        # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
         self.height, self.width = height // patch_size, width // patch_size
         self.base_size = height // patch_size
         self.interpolation_scale = interpolation_scale
+
+        # Calculate positional embeddings based on max size or default
+        if pos_embed_max_size:
+            grid_size = pos_embed_max_size
+        else:
+            grid_size = int(num_patches**0.5)
+
         if pos_embed_type is None:
             self.pos_embed = None
         elif pos_embed_type == "sincos":
             pos_embed = get_2d_sincos_pos_embed(
-                embed_dim,
-                int(num_patches**0.5),
-                base_size=self.base_size,
-                interpolation_scale=self.interpolation_scale,
+                embed_dim, grid_size, base_size=self.base_size, interpolation_scale=self.interpolation_scale
             )
-            self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
+            persistent = True if pos_embed_max_size else False
+            self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=persistent)
         else:
             raise ValueError(f"Unsupported pos_embed_type: {pos_embed_type}")
 
+    def cropped_pos_embed(self, height, width):
+        """Crops positional embeddings for SD3 compatibility."""
+        if self.pos_embed_max_size is None:
+            raise ValueError("`pos_embed_max_size` must be set for cropping.")
+
+        height = height // self.patch_size
+        width = width // self.patch_size
+        if height > self.pos_embed_max_size:
+            raise ValueError(
+                f"Height ({height}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
+            )
+        if width > self.pos_embed_max_size:
+            raise ValueError(
+                f"Width ({width}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
+            )
+
+        top = (self.pos_embed_max_size - height) // 2
+        left = (self.pos_embed_max_size - width) // 2
+        spatial_pos_embed = self.pos_embed.reshape(1, self.pos_embed_max_size, self.pos_embed_max_size, -1)
+        spatial_pos_embed = spatial_pos_embed[:, top : top + height, left : left + width, :]
+        spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1])
+        return spatial_pos_embed
+
     def forward(self, latent):
-        height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+        if self.pos_embed_max_size is not None:
+            height, width = latent.shape[-2:]
+        else:
+            height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
 
         latent = self.proj(latent)
         if self.flatten:
@@ -181,20 +212,20 @@ class PatchEmbed(nn.Module):
             latent = self.norm(latent)
         if self.pos_embed is None:
             return latent.to(latent.dtype)
-
-        # Interpolate positional embeddings if needed.
-        # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161)
-        if self.height != height or self.width != width:
-            pos_embed = get_2d_sincos_pos_embed(
-                embed_dim=self.pos_embed.shape[-1],
-                grid_size=(height, width),
-                base_size=self.base_size,
-                interpolation_scale=self.interpolation_scale,
-            )
-            pos_embed = torch.from_numpy(pos_embed)
-            pos_embed = pos_embed.float().unsqueeze(0).to(latent.device)
+        # Interpolate or crop positional embeddings as needed
+        if self.pos_embed_max_size:
+            pos_embed = self.cropped_pos_embed(height, width)
         else:
-            pos_embed = self.pos_embed
+            if self.height != height or self.width != width:
+                pos_embed = get_2d_sincos_pos_embed(
+                    embed_dim=self.pos_embed.shape[-1],
+                    grid_size=(height, width),
+                    base_size=self.base_size,
+                    interpolation_scale=self.interpolation_scale,
+                )
+                pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).to(latent.device)
+            else:
+                pos_embed = self.pos_embed
 
         return (latent + pos_embed).to(latent.dtype)
 
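The cropping path above is easy to sanity-check in isolation. Below is a minimal sketch (not part of the diff) of the same center-crop arithmetic: a toy `pos_embed` table whose channels encode their own (row, col) coordinates stands in for the real sincos buffer, so the crop offsets are directly visible.

```python
import torch

# Toy stand-in for PatchEmbed's buffer: (1, max*max, 2), channel = (row, col).
pos_embed_max_size, patch_size = 8, 2
grid = torch.stack(
    torch.meshgrid(
        torch.arange(pos_embed_max_size), torch.arange(pos_embed_max_size), indexing="ij"
    ),
    dim=-1,
).float()
pos_embed = grid.reshape(1, pos_embed_max_size * pos_embed_max_size, 2)

def cropped_pos_embed(height, width):
    # Same arithmetic as the new PatchEmbed.cropped_pos_embed: latent pixels
    # -> patch counts, then a centered crop out of the max-size grid.
    height, width = height // patch_size, width // patch_size
    top = (pos_embed_max_size - height) // 2
    left = (pos_embed_max_size - width) // 2
    spatial = pos_embed.reshape(1, pos_embed_max_size, pos_embed_max_size, -1)
    spatial = spatial[:, top : top + height, left : left + width, :]
    return spatial.reshape(1, -1, spatial.shape[-1])

out = cropped_pos_embed(8, 12)  # an 8x12 latent -> 4x6 patches
print(out.shape)   # torch.Size([1, 24, 2])
print(out[0, 0])   # tensor([2., 1.]) -> crop starts at row 2, col 1
```

Registering the buffer with `persistent=True` when `pos_embed_max_size` is set fits this scheme: the full max-size table travels with the checkpoint, and each resolution takes a centered slice rather than recomputing embeddings.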
@@ -626,6 +657,25 @@ class CombinedTimestepLabelEmbeddings(nn.Module):
         return conditioning
 
 
+class CombinedTimestepTextProjEmbeddings(nn.Module):
+    def __init__(self, embedding_dim, pooled_projection_dim):
+        super().__init__()
+
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+        self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
+
+    def forward(self, timestep, pooled_projection):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))  # (N, D)
+
+        pooled_projections = self.text_embedder(pooled_projection)
+
+        conditioning = timesteps_emb + pooled_projections
+
+        return conditioning
+
+
 class HunyuanDiTAttentionPool(nn.Module):
     # Copied from https://github.com/Tencent/HunyuanDiT/blob/cb709308d92e6c7e8d59d0dff41b74d35088db6a/hydit/modules/poolers.py#L6
 
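For orientation, the new SD3 conditioning block just projects both signals into the same embedding space and sums them. A shape-level sketch with toy stand-ins (plain `Linear`+`SiLU` modules in place of `Timesteps`/`TimestepEmbedding`/`PixArtAlphaTextProjection`; the dimensions are illustrative, not SD3's actual config):

```python
import torch
import torch.nn as nn

class ToyCombinedEmbed(nn.Module):
    """Shape-level stand-in for CombinedTimestepTextProjEmbeddings."""

    def __init__(self, embedding_dim=1536, pooled_projection_dim=2048):
        super().__init__()
        # Stand-in for Timesteps(256) followed by TimestepEmbedding(256 -> embedding_dim).
        self.timestep_embedder = nn.Sequential(nn.Linear(256, embedding_dim), nn.SiLU())
        # Stand-in for PixArtAlphaTextProjection(pooled_dim -> embedding_dim, act_fn="silu").
        self.text_embedder = nn.Sequential(nn.Linear(pooled_projection_dim, embedding_dim), nn.SiLU())

    def forward(self, timesteps_proj, pooled_projection):
        # Both branches land in the same space; the conditioning is their sum.
        return self.timestep_embedder(timesteps_proj) + self.text_embedder(pooled_projection)

m = ToyCombinedEmbed()
cond = m(torch.randn(2, 256), torch.randn(2, 2048))
print(cond.shape)  # torch.Size([2, 1536])
```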
@@ -1001,6 +1051,8 @@ class PixArtAlphaTextProjection(nn.Module):
         self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
         if act_fn == "gelu_tanh":
             self.act_1 = nn.GELU(approximate="tanh")
+        elif act_fn == "silu":
+            self.act_1 = nn.SiLU()
         elif act_fn == "silu_fp32":
             self.act_1 = FP32SiLU()
         else:
@@ -1014,6 +1066,39 @@ class PixArtAlphaTextProjection(nn.Module):
         return hidden_states
 
 
+class IPAdapterPlusImageProjectionBlock(nn.Module):
+    def __init__(
+        self,
+        embed_dims: int = 768,
+        dim_head: int = 64,
+        heads: int = 16,
+        ffn_ratio: float = 4,
+    ) -> None:
+        super().__init__()
+        from .attention import FeedForward
+
+        self.ln0 = nn.LayerNorm(embed_dims)
+        self.ln1 = nn.LayerNorm(embed_dims)
+        self.attn = Attention(
+            query_dim=embed_dims,
+            dim_head=dim_head,
+            heads=heads,
+            out_bias=False,
+        )
+        self.ff = nn.Sequential(
+            nn.LayerNorm(embed_dims),
+            FeedForward(embed_dims, embed_dims, activation_fn="gelu", mult=ffn_ratio, bias=False),
+        )
+
+    def forward(self, x, latents, residual):
+        encoder_hidden_states = self.ln0(x)
+        latents = self.ln1(latents)
+        encoder_hidden_states = torch.cat([encoder_hidden_states, latents], dim=-2)
+        latents = self.attn(latents, encoder_hidden_states) + residual
+        latents = self.ff(latents) + latents
+        return latents
+
+
 class IPAdapterPlusImageProjection(nn.Module):
     """Resampler of IP-Adapter Plus.
 
@@ -1042,8 +1127,6 @@ class IPAdapterPlusImageProjection(nn.Module):
         ffn_ratio: float = 4,
     ) -> None:
         super().__init__()
-        from .attention import FeedForward  # Lazy import to avoid circular import
-
         self.latents = nn.Parameter(torch.randn(1, num_queries, hidden_dims) / hidden_dims**0.5)
 
         self.proj_in = nn.Linear(embed_dims, hidden_dims)
@@ -1051,26 +1134,9 @@ class IPAdapterPlusImageProjection(nn.Module):
         self.proj_out = nn.Linear(hidden_dims, output_dims)
         self.norm_out = nn.LayerNorm(output_dims)
 
-        self.layers = nn.ModuleList([])
-        for _ in range(depth):
-            self.layers.append(
-                nn.ModuleList(
-                    [
-                        nn.LayerNorm(hidden_dims),
-                        nn.LayerNorm(hidden_dims),
-                        Attention(
-                            query_dim=hidden_dims,
-                            dim_head=dim_head,
-                            heads=heads,
-                            out_bias=False,
-                        ),
-                        nn.Sequential(
-                            nn.LayerNorm(hidden_dims),
-                            FeedForward(hidden_dims, hidden_dims, activation_fn="gelu", mult=ffn_ratio, bias=False),
-                        ),
-                    ]
-                )
-            )
+        self.layers = nn.ModuleList(
+            [IPAdapterPlusImageProjectionBlock(hidden_dims, dim_head, heads, ffn_ratio) for _ in range(depth)]
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Forward pass.
@@ -1084,52 +1150,14 @@ class IPAdapterPlusImageProjection(nn.Module):
 
         x = self.proj_in(x)
 
-        for ln0, ln1, attn, ff in self.layers:
+        for block in self.layers:
             residual = latents
-
-            encoder_hidden_states = ln0(x)
-            latents = ln1(latents)
-            encoder_hidden_states = torch.cat([encoder_hidden_states, latents], dim=-2)
-            latents = attn(latents, encoder_hidden_states) + residual
-            latents = ff(latents) + latents
+            latents = block(x, latents, residual)
 
         latents = self.proj_out(latents)
         return self.norm_out(latents)
 
 
-class IPAdapterPlusImageProjectionBlock(nn.Module):
-    def __init__(
-        self,
-        embed_dims: int = 768,
-        dim_head: int = 64,
-        heads: int = 16,
-        ffn_ratio: float = 4,
-    ) -> None:
-        super().__init__()
-        from .attention import FeedForward
-
-        self.ln0 = nn.LayerNorm(embed_dims)
-        self.ln1 = nn.LayerNorm(embed_dims)
-        self.attn = Attention(
-            query_dim=embed_dims,
-            dim_head=dim_head,
-            heads=heads,
-            out_bias=False,
-        )
-        self.ff = nn.Sequential(
-            nn.LayerNorm(embed_dims),
-            FeedForward(embed_dims, embed_dims, activation_fn="gelu", mult=ffn_ratio, bias=False),
-        )
-
-    def forward(self, x, latents, residual):
-        encoder_hidden_states = self.ln0(x)
-        latents = self.ln1(latents)
-        encoder_hidden_states = torch.cat([encoder_hidden_states, latents], dim=-2)
-        latents = self.attn(latents, encoder_hidden_states) + residual
-        latents = self.ff(latents) + latents
-        return latents
-
-
 class IPAdapterFaceIDPlusImageProjection(nn.Module):
     """FacePerceiverResampler of IP-Adapter Plus.
 
diffusers/models/model_loading_utils.py
CHANGED
@@ -18,13 +18,19 @@ import importlib
 import inspect
 import os
 from collections import OrderedDict
+from pathlib import Path
 from typing import List, Optional, Union
 
 import safetensors
 import torch
+from huggingface_hub.utils import EntryNotFoundError
 
 from ..utils import (
+    SAFE_WEIGHTS_INDEX_NAME,
     SAFETENSORS_FILE_EXTENSION,
+    WEIGHTS_INDEX_NAME,
+    _add_variant,
+    _get_model_file,
     is_accelerate_available,
     is_torch_version,
     logging,
@@ -175,3 +181,52 @@ def _load_state_dict_into_model(model_to_load, state_dict: OrderedDict) -> List[
     load(model_to_load)
 
     return error_msgs
+
+
+def _fetch_index_file(
+    is_local,
+    pretrained_model_name_or_path,
+    subfolder,
+    use_safetensors,
+    cache_dir,
+    variant,
+    force_download,
+    resume_download,
+    proxies,
+    local_files_only,
+    token,
+    revision,
+    user_agent,
+    commit_hash,
+):
+    if is_local:
+        index_file = Path(
+            pretrained_model_name_or_path,
+            subfolder or "",
+            _add_variant(SAFE_WEIGHTS_INDEX_NAME if use_safetensors else WEIGHTS_INDEX_NAME, variant),
+        )
+    else:
+        index_file_in_repo = Path(
+            subfolder or "",
+            _add_variant(SAFE_WEIGHTS_INDEX_NAME if use_safetensors else WEIGHTS_INDEX_NAME, variant),
+        ).as_posix()
+        try:
+            index_file = _get_model_file(
+                pretrained_model_name_or_path,
+                weights_name=index_file_in_repo,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                resume_download=resume_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                subfolder=subfolder,
+                user_agent=user_agent,
+                commit_hash=commit_hash,
+            )
+            index_file = Path(index_file)
+        except (EntryNotFoundError, EnvironmentError):
+            index_file = None
+
+    return index_file
diffusers/models/modeling_utils.py
CHANGED
@@ -16,6 +16,7 @@
 
 import inspect
 import itertools
+import json
 import os
 import re
 from collections import OrderedDict
@@ -25,7 +26,7 @@ from typing import Any, Callable, List, Optional, Tuple, Union
 
 import safetensors
 import torch
-from huggingface_hub import create_repo
+from huggingface_hub import create_repo, split_torch_state_dict_into_shards
 from huggingface_hub.utils import validate_hf_hub_args
 from torch import Tensor, nn
 
@@ -33,9 +34,12 @@ from .. import __version__
 from ..utils import (
     CONFIG_NAME,
     FLAX_WEIGHTS_NAME,
+    SAFE_WEIGHTS_INDEX_NAME,
     SAFETENSORS_WEIGHTS_NAME,
+    WEIGHTS_INDEX_NAME,
     WEIGHTS_NAME,
     _add_variant,
+    _get_checkpoint_shard_files,
     _get_model_file,
     deprecate,
     is_accelerate_available,
@@ -49,6 +53,7 @@ from ..utils.hub_utils import (
 )
 from .model_loading_utils import (
     _determine_device_map,
+    _fetch_index_file,
     _load_state_dict_into_model,
     load_model_dict_into_meta,
     load_state_dict,
@@ -57,6 +62,8 @@ from .model_loading_utils import (
 
 logger = logging.get_logger(__name__)
 
+_REGEX_SHARD = re.compile(r"(.*?)-\d{5}-of-\d{5}")
+
 
 if is_torch_version(">=", "1.9.0"):
     _LOW_CPU_MEM_USAGE_DEFAULT = True
@@ -263,6 +270,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
         save_function: Optional[Callable] = None,
         safe_serialization: bool = True,
         variant: Optional[str] = None,
+        max_shard_size: Union[int, str] = "10GB",
         push_to_hub: bool = False,
         **kwargs,
     ):
@@ -285,6 +293,13 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
                 Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
             variant (`str`, *optional*):
                 If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
+            max_shard_size (`int` or `str`, defaults to `"10GB"`):
+                The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
+                lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5GB"`).
+                If expressed as an integer, the unit is bytes. Note that this limit will be decreased after a certain
+                period of time (starting from Oct 2024) to allow users to upgrade to the latest version of `diffusers`.
+                This is to establish a common default size for this argument across different libraries in the Hugging
+                Face ecosystem (`transformers`, and `accelerate`, for example).
             push_to_hub (`bool`, *optional*, defaults to `False`):
                 Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
                 repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
@@ -296,6 +311,14 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
             return
 
+        weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
+        weights_name = _add_variant(weights_name, variant)
+        weight_name_split = weights_name.split(".")
+        if len(weight_name_split) in [2, 3]:
+            weights_name_pattern = weight_name_split[0] + "{suffix}." + ".".join(weight_name_split[1:])
+        else:
+            raise ValueError(f"Invalid {weights_name} provided.")
+
         os.makedirs(save_directory, exist_ok=True)
 
         if push_to_hub:
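The `{suffix}` pattern built above is what `huggingface_hub.split_torch_state_dict_into_shards` fills in per shard (empty for a single file, `-00001-of-00002`-style otherwise). A small sketch reproducing the string logic, with a hypothetical `build_pattern` helper:

```python
def build_pattern(weights_name: str) -> str:
    # Same split/join logic as the new save_pretrained code above.
    parts = weights_name.split(".")
    if len(parts) not in (2, 3):
        raise ValueError(f"Invalid {weights_name} provided.")
    return parts[0] + "{suffix}." + ".".join(parts[1:])

print(build_pattern("diffusion_pytorch_model.safetensors"))
# diffusion_pytorch_model{suffix}.safetensors
print(build_pattern("diffusion_pytorch_model.fp16.safetensors"))
# diffusion_pytorch_model{suffix}.fp16.safetensors
```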
@@ -317,18 +340,58 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
         # Save the model
         state_dict = model_to_save.state_dict()
 
-        weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
-        weights_name = _add_variant(weights_name, variant)
-
         # Save the model
-        if safe_serialization:
-            safetensors.torch.save_file(
-                state_dict, Path(save_directory, weights_name).as_posix(), metadata={"format": "pt"}
+        state_dict_split = split_torch_state_dict_into_shards(
+            state_dict, max_shard_size=max_shard_size, filename_pattern=weights_name_pattern
+        )
+
+        # Clean the folder from a previous save
+        if is_main_process:
+            for filename in os.listdir(save_directory):
+                if filename in state_dict_split.filename_to_tensors.keys():
+                    continue
+                full_filename = os.path.join(save_directory, filename)
+                if not os.path.isfile(full_filename):
+                    continue
+                weights_without_ext = weights_name_pattern.replace(".bin", "").replace(".safetensors", "")
+                weights_without_ext = weights_without_ext.replace("{suffix}", "")
+                filename_without_ext = filename.replace(".bin", "").replace(".safetensors", "")
+                # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005
+                if (
+                    filename.startswith(weights_without_ext)
+                    and _REGEX_SHARD.fullmatch(filename_without_ext) is not None
+                ):
+                    os.remove(full_filename)
+
+        for filename, tensors in state_dict_split.filename_to_tensors.items():
+            shard = {tensor: state_dict[tensor] for tensor in tensors}
+            filepath = os.path.join(save_directory, filename)
+            if safe_serialization:
+                # At some point we will need to deal better with save_function (used for TPU and other distributed
+                # joyfulness), but for now this enough.
+                safetensors.torch.save_file(shard, filepath, metadata={"format": "pt"})
+            else:
+                torch.save(shard, filepath)
+
+        if state_dict_split.is_sharded:
+            index = {
+                "metadata": state_dict_split.metadata,
+                "weight_map": state_dict_split.tensor_to_filename,
+            }
+            save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
+            save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant))
+            # Save the index as well
+            with open(save_index_file, "w", encoding="utf-8") as f:
+                content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+                f.write(content)
+            logger.info(
+                f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
+                f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where each parameters has been saved in the "
+                f"index located at {save_index_file}."
             )
         else:
-            torch.save(state_dict, Path(save_directory, weights_name).as_posix())
-
-        logger.info(f"Model weights saved in {Path(save_directory, weights_name).as_posix()}")
+            path_to_weights = os.path.join(save_directory, weights_name)
+            logger.info(f"Model weights saved in {path_to_weights}")
 
         if push_to_hub:
             # Create a new empty model card and eventually tag it
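Taken together, `save_pretrained` now shards automatically once the state dict exceeds `max_shard_size`, writing the shards plus a `*.index.json` weight map. A hedged usage sketch (the repo id is illustrative; with the default `"10GB"` most models still save as a single file):

```python
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet"
)
# Force sharding with a deliberately small limit.
unet.save_pretrained("./unet-sharded", max_shard_size="1GB")
# ./unet-sharded/ should now contain shards named like
# diffusion_pytorch_model-00001-of-0000N.safetensors plus the index JSON.
```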
@@ -399,7 +462,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
             device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
                 A map that specifies where each submodule should go. It doesn't need to be defined for each
                 parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
-                same device.
+                same device. Defaults to `None`, meaning that the model will be loaded on CPU.
 
                 Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
                 more information about each option see [designing a device
@@ -566,6 +629,32 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
             **kwargs,
         )
 
+        # Determine if we're loading from a directory of sharded checkpoints.
+        is_sharded = False
+        index_file = None
+        is_local = os.path.isdir(pretrained_model_name_or_path)
+        index_file = _fetch_index_file(
+            is_local=is_local,
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
+            subfolder=subfolder or "",
+            use_safetensors=use_safetensors,
+            cache_dir=cache_dir,
+            variant=variant,
+            force_download=force_download,
+            resume_download=resume_download,
+            proxies=proxies,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            user_agent=user_agent,
+            commit_hash=commit_hash,
+        )
+        if index_file is not None and index_file.is_file():
+            is_sharded = True
+
+        if is_sharded and from_flax:
+            raise ValueError("Loading of sharded checkpoints is not supported when `from_flax=True`.")
+
         # load model
         model_file = None
         if from_flax:
@@ -590,7 +679,21 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
 
             model = load_flax_checkpoint_in_pytorch_model(model, model_file)
         else:
-            if use_safetensors:
+            if is_sharded:
+                sharded_ckpt_cached_folder, sharded_metadata = _get_checkpoint_shard_files(
+                    pretrained_model_name_or_path,
+                    index_file,
+                    cache_dir=cache_dir,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    user_agent=user_agent,
+                    revision=revision,
+                    subfolder=subfolder or "",
+                )
+
+            elif use_safetensors and not is_sharded:
                 try:
                     model_file = _get_model_file(
                         pretrained_model_name_or_path,
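On the loading side nothing changes for callers: when `from_pretrained` finds an index file it fetches the shards itself and hands the cached folder to accelerate, as the rest of this hunk shows. A minimal sketch, continuing the save example above:

```python
from diffusers import UNet2DConditionModel

# The index file in ./unet-sharded triggers the sharded path: the shards are
# resolved via _get_checkpoint_shard_files and dispatched by accelerate.
unet = UNet2DConditionModel.from_pretrained("./unet-sharded")
```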
@@ -606,11 +709,16 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
                         user_agent=user_agent,
                         commit_hash=commit_hash,
                     )
+
                 except IOError as e:
+                    logger.error(f"An error occurred while trying to fetch {pretrained_model_name_or_path}: {e}")
                     if not allow_pickle:
-                        raise
-
-            if model_file is None:
+                        raise
+                    logger.warning(
+                        "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead."
+                    )
+
+            if model_file is None and not is_sharded:
                 model_file = _get_model_file(
                     pretrained_model_name_or_path,
                     weights_name=_add_variant(WEIGHTS_NAME, variant),
@@ -632,7 +740,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
         model = cls.from_config(config, **unused_kwargs)
 
         # if device_map is None, load the state dict and move the params from meta device to the cpu
-        if device_map is None:
+        if device_map is None and not is_sharded:
             param_device = "cpu"
             state_dict = load_state_dict(model_file, variant=variant)
             model._convert_deprecated_attention_blocks(state_dict)
@@ -666,17 +774,22 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
             else:  # else let accelerate handle loading and dispatching.
                 # Load weights and dispatch according to the device_map
                 # by default the device_map is None and the weights are loaded on the CPU
+                force_hook = True
                 device_map = _determine_device_map(model, device_map, max_memory, torch_dtype)
+                if device_map is None and is_sharded:
+                    # we load the parameters on the cpu
+                    device_map = {"": "cpu"}
+                    force_hook = False
                 try:
                     accelerate.load_checkpoint_and_dispatch(
                         model,
-                        model_file,
+                        model_file if not is_sharded else sharded_ckpt_cached_folder,
                         device_map,
                         max_memory=max_memory,
                         offload_folder=offload_folder,
                         offload_state_dict=offload_state_dict,
                         dtype=torch_dtype,
-                        force_hooks=True,
+                        force_hooks=force_hook,
                         strict=True,
                     )
                 except AttributeError as e:
@@ -700,12 +813,14 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
                     model._temp_convert_self_to_deprecated_attention_blocks()
                     accelerate.load_checkpoint_and_dispatch(
                         model,
-                        model_file,
+                        model_file if not is_sharded else sharded_ckpt_cached_folder,
                         device_map,
                         max_memory=max_memory,
                         offload_folder=offload_folder,
                         offload_state_dict=offload_state_dict,
                         dtype=torch_dtype,
+                        force_hook=force_hook,
+                        strict=True,
                     )
                     model._undo_temp_convert_self_to_deprecated_attention_blocks()
                 else:
@@ -1057,6 +1172,9 @@ class LegacyModelMixin(ModelMixin):
        # To prevent depedency import problem.
        from .model_loading_utils import _fetch_remapped_cls_from_config
 
+        # Create a copy of the kwargs so that we don't mess with the keyword arguments in the downstream calls.
+        kwargs_copy = kwargs.copy()
+
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", None)
@@ -1094,4 +1212,4 @@ class LegacyModelMixin(ModelMixin):
        # resolve remapping
        remapped_class = _fetch_remapped_cls_from_config(config, cls)
 
-        return remapped_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        return remapped_class.from_pretrained(pretrained_model_name_or_path, **kwargs_copy)