diffusers 0.33.1__py3-none-any.whl → 0.35.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +145 -1
- diffusers/callbacks.py +35 -0
- diffusers/commands/__init__.py +1 -1
- diffusers/commands/custom_blocks.py +134 -0
- diffusers/commands/diffusers_cli.py +3 -1
- diffusers/commands/env.py +1 -1
- diffusers/commands/fp16_safetensors.py +2 -2
- diffusers/configuration_utils.py +11 -2
- diffusers/dependency_versions_check.py +1 -1
- diffusers/dependency_versions_table.py +3 -3
- diffusers/experimental/rl/value_guided_sampling.py +1 -1
- diffusers/guiders/__init__.py +41 -0
- diffusers/guiders/adaptive_projected_guidance.py +188 -0
- diffusers/guiders/auto_guidance.py +190 -0
- diffusers/guiders/classifier_free_guidance.py +141 -0
- diffusers/guiders/classifier_free_zero_star_guidance.py +152 -0
- diffusers/guiders/frequency_decoupled_guidance.py +327 -0
- diffusers/guiders/guider_utils.py +309 -0
- diffusers/guiders/perturbed_attention_guidance.py +271 -0
- diffusers/guiders/skip_layer_guidance.py +262 -0
- diffusers/guiders/smoothed_energy_guidance.py +251 -0
- diffusers/guiders/tangential_classifier_free_guidance.py +143 -0
- diffusers/hooks/__init__.py +17 -0
- diffusers/hooks/_common.py +56 -0
- diffusers/hooks/_helpers.py +293 -0
- diffusers/hooks/faster_cache.py +9 -8
- diffusers/hooks/first_block_cache.py +259 -0
- diffusers/hooks/group_offloading.py +332 -227
- diffusers/hooks/hooks.py +58 -3
- diffusers/hooks/layer_skip.py +263 -0
- diffusers/hooks/layerwise_casting.py +5 -10
- diffusers/hooks/pyramid_attention_broadcast.py +15 -12
- diffusers/hooks/smoothed_energy_guidance_utils.py +167 -0
- diffusers/hooks/utils.py +43 -0
- diffusers/image_processor.py +7 -2
- diffusers/loaders/__init__.py +10 -0
- diffusers/loaders/ip_adapter.py +260 -18
- diffusers/loaders/lora_base.py +261 -127
- diffusers/loaders/lora_conversion_utils.py +657 -35
- diffusers/loaders/lora_pipeline.py +2778 -1246
- diffusers/loaders/peft.py +78 -112
- diffusers/loaders/single_file.py +2 -2
- diffusers/loaders/single_file_model.py +64 -15
- diffusers/loaders/single_file_utils.py +395 -7
- diffusers/loaders/textual_inversion.py +3 -2
- diffusers/loaders/transformer_flux.py +10 -11
- diffusers/loaders/transformer_sd3.py +8 -3
- diffusers/loaders/unet.py +24 -21
- diffusers/loaders/unet_loader_utils.py +6 -3
- diffusers/loaders/utils.py +1 -1
- diffusers/models/__init__.py +23 -1
- diffusers/models/activations.py +5 -5
- diffusers/models/adapter.py +2 -3
- diffusers/models/attention.py +488 -7
- diffusers/models/attention_dispatch.py +1218 -0
- diffusers/models/attention_flax.py +10 -10
- diffusers/models/attention_processor.py +113 -667
- diffusers/models/auto_model.py +49 -12
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +4 -4
- diffusers/models/autoencoders/autoencoder_dc.py +17 -4
- diffusers/models/autoencoders/autoencoder_kl.py +5 -5
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +6 -6
- diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1110 -0
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +2 -2
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +3 -3
- diffusers/models/autoencoders/autoencoder_kl_qwenimage.py +1070 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +4 -4
- diffusers/models/autoencoders/autoencoder_kl_wan.py +626 -62
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -1
- diffusers/models/autoencoders/autoencoder_tiny.py +3 -3
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vae.py +13 -2
- diffusers/models/autoencoders/vq_model.py +2 -2
- diffusers/models/cache_utils.py +32 -10
- diffusers/models/controlnet.py +1 -1
- diffusers/models/controlnet_flux.py +1 -1
- diffusers/models/controlnet_sd3.py +1 -1
- diffusers/models/controlnet_sparsectrl.py +1 -1
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -3
- diffusers/models/controlnets/controlnet_flax.py +1 -1
- diffusers/models/controlnets/controlnet_flux.py +21 -20
- diffusers/models/controlnets/controlnet_hunyuan.py +2 -2
- diffusers/models/controlnets/controlnet_sana.py +290 -0
- diffusers/models/controlnets/controlnet_sd3.py +1 -1
- diffusers/models/controlnets/controlnet_sparsectrl.py +2 -2
- diffusers/models/controlnets/controlnet_union.py +5 -5
- diffusers/models/controlnets/controlnet_xs.py +7 -7
- diffusers/models/controlnets/multicontrolnet.py +4 -5
- diffusers/models/controlnets/multicontrolnet_union.py +5 -6
- diffusers/models/downsampling.py +2 -2
- diffusers/models/embeddings.py +36 -46
- diffusers/models/embeddings_flax.py +2 -2
- diffusers/models/lora.py +3 -3
- diffusers/models/model_loading_utils.py +233 -1
- diffusers/models/modeling_flax_utils.py +1 -2
- diffusers/models/modeling_utils.py +203 -108
- diffusers/models/normalization.py +4 -4
- diffusers/models/resnet.py +2 -2
- diffusers/models/resnet_flax.py +1 -1
- diffusers/models/transformers/__init__.py +7 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +70 -24
- diffusers/models/transformers/cogvideox_transformer_3d.py +1 -1
- diffusers/models/transformers/consisid_transformer_3d.py +1 -1
- diffusers/models/transformers/dit_transformer_2d.py +2 -2
- diffusers/models/transformers/dual_transformer_2d.py +1 -1
- diffusers/models/transformers/hunyuan_transformer_2d.py +2 -2
- diffusers/models/transformers/latte_transformer_3d.py +4 -5
- diffusers/models/transformers/lumina_nextdit2d.py +2 -2
- diffusers/models/transformers/pixart_transformer_2d.py +3 -3
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/sana_transformer.py +8 -3
- diffusers/models/transformers/stable_audio_transformer.py +5 -9
- diffusers/models/transformers/t5_film_transformer.py +3 -3
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +1 -1
- diffusers/models/transformers/transformer_chroma.py +641 -0
- diffusers/models/transformers/transformer_cogview3plus.py +5 -10
- diffusers/models/transformers/transformer_cogview4.py +353 -27
- diffusers/models/transformers/transformer_cosmos.py +586 -0
- diffusers/models/transformers/transformer_flux.py +376 -138
- diffusers/models/transformers/transformer_hidream_image.py +942 -0
- diffusers/models/transformers/transformer_hunyuan_video.py +12 -8
- diffusers/models/transformers/transformer_hunyuan_video_framepack.py +416 -0
- diffusers/models/transformers/transformer_ltx.py +105 -24
- diffusers/models/transformers/transformer_lumina2.py +1 -1
- diffusers/models/transformers/transformer_mochi.py +1 -1
- diffusers/models/transformers/transformer_omnigen.py +2 -2
- diffusers/models/transformers/transformer_qwenimage.py +645 -0
- diffusers/models/transformers/transformer_sd3.py +7 -7
- diffusers/models/transformers/transformer_skyreels_v2.py +607 -0
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/transformers/transformer_wan.py +316 -87
- diffusers/models/transformers/transformer_wan_vace.py +387 -0
- diffusers/models/unets/unet_1d.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +1 -1
- diffusers/models/unets/unet_2d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_blocks_flax.py +8 -7
- diffusers/models/unets/unet_2d_condition.py +4 -3
- diffusers/models/unets/unet_2d_condition_flax.py +2 -2
- diffusers/models/unets/unet_3d_blocks.py +1 -1
- diffusers/models/unets/unet_3d_condition.py +3 -3
- diffusers/models/unets/unet_i2vgen_xl.py +3 -3
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +2 -2
- diffusers/models/unets/unet_stable_cascade.py +1 -1
- diffusers/models/upsampling.py +2 -2
- diffusers/models/vae_flax.py +2 -2
- diffusers/models/vq_model.py +1 -1
- diffusers/modular_pipelines/__init__.py +83 -0
- diffusers/modular_pipelines/components_manager.py +1068 -0
- diffusers/modular_pipelines/flux/__init__.py +66 -0
- diffusers/modular_pipelines/flux/before_denoise.py +689 -0
- diffusers/modular_pipelines/flux/decoders.py +109 -0
- diffusers/modular_pipelines/flux/denoise.py +227 -0
- diffusers/modular_pipelines/flux/encoders.py +412 -0
- diffusers/modular_pipelines/flux/modular_blocks.py +181 -0
- diffusers/modular_pipelines/flux/modular_pipeline.py +59 -0
- diffusers/modular_pipelines/modular_pipeline.py +2446 -0
- diffusers/modular_pipelines/modular_pipeline_utils.py +672 -0
- diffusers/modular_pipelines/node_utils.py +665 -0
- diffusers/modular_pipelines/stable_diffusion_xl/__init__.py +77 -0
- diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py +1874 -0
- diffusers/modular_pipelines/stable_diffusion_xl/decoders.py +208 -0
- diffusers/modular_pipelines/stable_diffusion_xl/denoise.py +771 -0
- diffusers/modular_pipelines/stable_diffusion_xl/encoders.py +887 -0
- diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py +380 -0
- diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py +365 -0
- diffusers/modular_pipelines/wan/__init__.py +66 -0
- diffusers/modular_pipelines/wan/before_denoise.py +365 -0
- diffusers/modular_pipelines/wan/decoders.py +105 -0
- diffusers/modular_pipelines/wan/denoise.py +261 -0
- diffusers/modular_pipelines/wan/encoders.py +242 -0
- diffusers/modular_pipelines/wan/modular_blocks.py +144 -0
- diffusers/modular_pipelines/wan/modular_pipeline.py +90 -0
- diffusers/pipelines/__init__.py +68 -6
- diffusers/pipelines/allegro/pipeline_allegro.py +11 -11
- diffusers/pipelines/amused/pipeline_amused.py +7 -6
- diffusers/pipelines/amused/pipeline_amused_img2img.py +6 -5
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +6 -5
- diffusers/pipelines/animatediff/pipeline_animatediff.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +16 -15
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +6 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +5 -5
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +5 -5
- diffusers/pipelines/audioldm/pipeline_audioldm.py +8 -7
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +22 -13
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +48 -11
- diffusers/pipelines/auto_pipeline.py +23 -20
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +11 -10
- diffusers/pipelines/chroma/__init__.py +49 -0
- diffusers/pipelines/chroma/pipeline_chroma.py +949 -0
- diffusers/pipelines/chroma/pipeline_chroma_img2img.py +1034 -0
- diffusers/pipelines/chroma/pipeline_output.py +21 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +17 -16
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +17 -16
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +18 -17
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +17 -16
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +9 -9
- diffusers/pipelines/cogview4/pipeline_cogview4.py +23 -22
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +7 -7
- diffusers/pipelines/consisid/consisid_utils.py +2 -2
- diffusers/pipelines/consisid/pipeline_consisid.py +8 -8
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +11 -10
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +7 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +14 -14
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -13
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +226 -107
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +12 -8
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +207 -105
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +8 -8
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +7 -7
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -10
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +9 -7
- diffusers/pipelines/cosmos/__init__.py +54 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +673 -0
- diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +792 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +664 -0
- diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +826 -0
- diffusers/pipelines/cosmos/pipeline_output.py +40 -0
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +5 -4
- diffusers/pipelines/ddim/pipeline_ddim.py +4 -4
- diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +10 -10
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +10 -10
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +8 -8
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -5
- diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
- diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +3 -3
- diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
- diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +2 -2
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +4 -3
- diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
- diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +8 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +9 -9
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -8
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -5
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +18 -18
- diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +2 -2
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +6 -6
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +5 -5
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +5 -5
- diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +4 -2
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +4 -4
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +7 -6
- diffusers/pipelines/flux/__init__.py +4 -0
- diffusers/pipelines/flux/modeling_flux.py +1 -1
- diffusers/pipelines/flux/pipeline_flux.py +37 -36
- diffusers/pipelines/flux/pipeline_flux_control.py +9 -9
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +31 -23
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +3 -2
- diffusers/pipelines/flux/pipeline_flux_fill.py +7 -7
- diffusers/pipelines/flux/pipeline_flux_img2img.py +40 -7
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +12 -7
- diffusers/pipelines/flux/pipeline_flux_kontext.py +1134 -0
- diffusers/pipelines/flux/pipeline_flux_kontext_inpaint.py +1460 -0
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +2 -2
- diffusers/pipelines/flux/pipeline_output.py +6 -4
- diffusers/pipelines/free_init_utils.py +2 -2
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hidream_image/__init__.py +47 -0
- diffusers/pipelines/hidream_image/pipeline_hidream_image.py +1026 -0
- diffusers/pipelines/hidream_image/pipeline_output.py +35 -0
- diffusers/pipelines/hunyuan_video/__init__.py +2 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +8 -8
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +26 -25
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py +1114 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +71 -15
- diffusers/pipelines/hunyuan_video/pipeline_output.py +19 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +8 -8
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +10 -8
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +6 -6
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +34 -34
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +19 -26
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +7 -7
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +11 -11
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +35 -35
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +6 -6
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +17 -39
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +17 -45
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +7 -7
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +10 -10
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +10 -10
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +7 -7
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +17 -38
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -10
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +12 -12
- diffusers/pipelines/kolors/text_encoder.py +3 -3
- diffusers/pipelines/kolors/tokenizer.py +1 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +2 -2
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +3 -3
- diffusers/pipelines/latte/pipeline_latte.py +12 -12
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +13 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +17 -16
- diffusers/pipelines/ltx/__init__.py +4 -0
- diffusers/pipelines/ltx/modeling_latent_upsampler.py +188 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +64 -18
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +117 -38
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +63 -18
- diffusers/pipelines/ltx/pipeline_ltx_latent_upsample.py +277 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +13 -13
- diffusers/pipelines/lumina2/pipeline_lumina2.py +10 -10
- diffusers/pipelines/marigold/marigold_image_processing.py +2 -2
- diffusers/pipelines/mochi/pipeline_mochi.py +15 -14
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -13
- diffusers/pipelines/omnigen/pipeline_omnigen.py +13 -11
- diffusers/pipelines/omnigen/processor_omnigen.py +8 -3
- diffusers/pipelines/onnx_utils.py +15 -2
- diffusers/pipelines/pag/pag_utils.py +2 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -8
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +10 -6
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +14 -14
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_kolors.py +10 -10
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +11 -11
- diffusers/pipelines/pag/pipeline_pag_sana.py +18 -12
- diffusers/pipelines/pag/pipeline_pag_sd.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +7 -7
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +5 -5
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +8 -8
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +16 -15
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +18 -17
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +12 -12
- diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +8 -7
- diffusers/pipelines/pia/pipeline_pia.py +8 -6
- diffusers/pipelines/pipeline_flax_utils.py +5 -6
- diffusers/pipelines/pipeline_loading_utils.py +113 -15
- diffusers/pipelines/pipeline_utils.py +127 -48
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +14 -12
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +31 -11
- diffusers/pipelines/qwenimage/__init__.py +55 -0
- diffusers/pipelines/qwenimage/pipeline_output.py +21 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage.py +726 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +882 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py +829 -0
- diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py +1015 -0
- diffusers/pipelines/sana/__init__.py +4 -0
- diffusers/pipelines/sana/pipeline_sana.py +23 -21
- diffusers/pipelines/sana/pipeline_sana_controlnet.py +1106 -0
- diffusers/pipelines/sana/pipeline_sana_sprint.py +23 -19
- diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py +981 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +7 -6
- diffusers/pipelines/shap_e/camera.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
- diffusers/pipelines/shap_e/renderer.py +3 -3
- diffusers/pipelines/skyreels_v2/__init__.py +59 -0
- diffusers/pipelines/skyreels_v2/pipeline_output.py +20 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py +610 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing.py +978 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_i2v.py +1059 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_v2v.py +1063 -0
- diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py +745 -0
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +1 -1
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +5 -5
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +8 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +13 -13
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +9 -9
- diffusers/pipelines/stable_diffusion/__init__.py +0 -7
- diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +11 -4
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +12 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +11 -11
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +10 -10
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +10 -9
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +4 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -5
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +7 -7
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +5 -5
- diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
- diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +13 -12
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -7
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +12 -8
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +15 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +11 -9
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -9
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +18 -12
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +11 -8
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +11 -8
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -12
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +8 -6
- diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +15 -11
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -15
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -17
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +12 -12
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -15
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +3 -3
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +12 -12
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -17
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +12 -7
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +15 -13
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +24 -21
- diffusers/pipelines/unclip/pipeline_unclip.py +4 -3
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +4 -3
- diffusers/pipelines/unclip/text_proj.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_text_decoder.py +2 -2
- diffusers/pipelines/unidiffuser/modeling_uvit.py +1 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +8 -7
- diffusers/pipelines/visualcloze/__init__.py +52 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_combined.py +444 -0
- diffusers/pipelines/visualcloze/pipeline_visualcloze_generation.py +952 -0
- diffusers/pipelines/visualcloze/visualcloze_utils.py +251 -0
- diffusers/pipelines/wan/__init__.py +2 -0
- diffusers/pipelines/wan/pipeline_wan.py +91 -30
- diffusers/pipelines/wan/pipeline_wan_i2v.py +145 -45
- diffusers/pipelines/wan/pipeline_wan_vace.py +975 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +14 -16
- diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +1 -1
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +16 -15
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +6 -6
- diffusers/quantizers/__init__.py +3 -1
- diffusers/quantizers/base.py +17 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -0
- diffusers/quantizers/bitsandbytes/utils.py +10 -7
- diffusers/quantizers/gguf/gguf_quantizer.py +13 -4
- diffusers/quantizers/gguf/utils.py +108 -16
- diffusers/quantizers/pipe_quant_config.py +202 -0
- diffusers/quantizers/quantization_config.py +18 -16
- diffusers/quantizers/quanto/quanto_quantizer.py +4 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +31 -1
- diffusers/schedulers/__init__.py +3 -1
- diffusers/schedulers/deprecated/scheduling_karras_ve.py +4 -3
- diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -1
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +10 -5
- diffusers/schedulers/scheduling_ddim.py +8 -8
- diffusers/schedulers/scheduling_ddim_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_ddim_flax.py +6 -6
- diffusers/schedulers/scheduling_ddim_inverse.py +6 -6
- diffusers/schedulers/scheduling_ddim_parallel.py +22 -22
- diffusers/schedulers/scheduling_ddpm.py +9 -9
- diffusers/schedulers/scheduling_ddpm_flax.py +7 -7
- diffusers/schedulers/scheduling_ddpm_parallel.py +18 -18
- diffusers/schedulers/scheduling_ddpm_wuerstchen.py +2 -2
- diffusers/schedulers/scheduling_deis_multistep.py +16 -9
- diffusers/schedulers/scheduling_dpm_cogvideox.py +5 -5
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +18 -12
- diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +22 -20
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +11 -11
- diffusers/schedulers/scheduling_dpmsolver_sde.py +2 -2
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +19 -13
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +13 -8
- diffusers/schedulers/scheduling_edm_euler.py +20 -11
- diffusers/schedulers/scheduling_euler_ancestral_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete.py +3 -3
- diffusers/schedulers/scheduling_euler_discrete_flax.py +3 -3
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +20 -5
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_flow_match_lcm.py +561 -0
- diffusers/schedulers/scheduling_heun_discrete.py +2 -2
- diffusers/schedulers/scheduling_ipndm.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +2 -2
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +2 -2
- diffusers/schedulers/scheduling_karras_ve_flax.py +5 -5
- diffusers/schedulers/scheduling_lcm.py +3 -3
- diffusers/schedulers/scheduling_lms_discrete.py +2 -2
- diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
- diffusers/schedulers/scheduling_pndm.py +4 -4
- diffusers/schedulers/scheduling_pndm_flax.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +9 -9
- diffusers/schedulers/scheduling_sasolver.py +15 -15
- diffusers/schedulers/scheduling_scm.py +1 -2
- diffusers/schedulers/scheduling_sde_ve.py +1 -1
- diffusers/schedulers/scheduling_sde_ve_flax.py +2 -2
- diffusers/schedulers/scheduling_tcd.py +3 -3
- diffusers/schedulers/scheduling_unclip.py +5 -5
- diffusers/schedulers/scheduling_unipc_multistep.py +21 -12
- diffusers/schedulers/scheduling_utils.py +3 -3
- diffusers/schedulers/scheduling_utils_flax.py +2 -2
- diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
- diffusers/training_utils.py +91 -5
- diffusers/utils/__init__.py +15 -0
- diffusers/utils/accelerate_utils.py +1 -1
- diffusers/utils/constants.py +4 -0
- diffusers/utils/doc_utils.py +1 -1
- diffusers/utils/dummy_pt_objects.py +432 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +480 -0
- diffusers/utils/dynamic_modules_utils.py +85 -8
- diffusers/utils/export_utils.py +1 -1
- diffusers/utils/hub_utils.py +33 -17
- diffusers/utils/import_utils.py +151 -18
- diffusers/utils/logging.py +1 -1
- diffusers/utils/outputs.py +2 -1
- diffusers/utils/peft_utils.py +96 -10
- diffusers/utils/state_dict_utils.py +20 -3
- diffusers/utils/testing_utils.py +195 -17
- diffusers/utils/torch_utils.py +43 -5
- diffusers/video_processor.py +2 -2
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/METADATA +72 -57
- diffusers-0.35.0.dist-info/RECORD +703 -0
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/WHEEL +1 -1
- diffusers-0.33.1.dist-info/RECORD +0 -608
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/LICENSE +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.33.1.dist-info → diffusers-0.35.0.dist-info}/top_level.txt +0 -0
````diff
--- a/diffusers/hooks/group_offloading.py
+++ b/diffusers/hooks/group_offloading.py
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,12 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import hashlib
+import os
 from contextlib import contextmanager, nullcontext
-from
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, List, Optional, Set, Tuple, Union
 
+import safetensors.torch
 import torch
 
 from ..utils import get_logger, is_accelerate_available
+from ._common import _GO_LC_SUPPORTED_PYTORCH_LAYERS
 from .hooks import HookRegistry, ModelHook
 
 
@@ -33,17 +39,28 @@ logger = get_logger(__name__) # pylint: disable=invalid-name
 _GROUP_OFFLOADING = "group_offloading"
 _LAYER_EXECUTION_TRACKER = "layer_execution_tracker"
 _LAZY_PREFETCH_GROUP_OFFLOADING = "lazy_prefetch_group_offloading"
-
-_SUPPORTED_PYTORCH_LAYERS = (
-    torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d,
-    torch.nn.ConvTranspose1d, torch.nn.ConvTranspose2d, torch.nn.ConvTranspose3d,
-    torch.nn.Linear,
-    # TODO(aryan): look into torch.nn.LayerNorm, torch.nn.GroupNorm later, seems to be causing some issues with CogVideoX
-    # because of double invocation of the same norm layer in CogVideoXLayerNorm
-)
+_GROUP_ID_LAZY_LEAF = "lazy_leafs"
 # fmt: on
 
 
+class GroupOffloadingType(str, Enum):
+    BLOCK_LEVEL = "block_level"
+    LEAF_LEVEL = "leaf_level"
+
+
+@dataclass
+class GroupOffloadingConfig:
+    onload_device: torch.device
+    offload_device: torch.device
+    offload_type: GroupOffloadingType
+    non_blocking: bool
+    record_stream: bool
+    low_cpu_mem_usage: bool
+    num_blocks_per_group: Optional[int] = None
+    offload_to_disk_path: Optional[str] = None
+    stream: Optional[Union[torch.cuda.Stream, torch.Stream]] = None
+
+
 class ModuleGroup:
     def __init__(
         self,
@@ -55,10 +72,12 @@ class ModuleGroup:
         parameters: Optional[List[torch.nn.Parameter]] = None,
         buffers: Optional[List[torch.Tensor]] = None,
         non_blocking: bool = False,
-        stream:
+        stream: Union[torch.cuda.Stream, torch.Stream, None] = None,
         record_stream: Optional[bool] = False,
-        low_cpu_mem_usage=False,
+        low_cpu_mem_usage: bool = False,
         onload_self: bool = True,
+        offload_to_disk_path: Optional[str] = None,
+        group_id: Optional[int] = None,
     ) -> None:
         self.modules = modules
         self.offload_device = offload_device
@@ -72,10 +91,35 @@ class ModuleGroup:
         self.record_stream = record_stream
         self.onload_self = onload_self
         self.low_cpu_mem_usage = low_cpu_mem_usage
-        self.cpu_param_dict = self._init_cpu_param_dict()
 
-
-
+        self.offload_to_disk_path = offload_to_disk_path
+        self._is_offloaded_to_disk = False
+
+        if self.offload_to_disk_path is not None:
+            # Instead of `group_id or str(id(self))` we do this because `group_id` can be "" as well.
+            self.group_id = group_id if group_id is not None else str(id(self))
+            short_hash = _compute_group_hash(self.group_id)
+            self.safetensors_file_path = os.path.join(self.offload_to_disk_path, f"group_{short_hash}.safetensors")
+
+            all_tensors = []
+            for module in self.modules:
+                all_tensors.extend(list(module.parameters()))
+                all_tensors.extend(list(module.buffers()))
+            all_tensors.extend(self.parameters)
+            all_tensors.extend(self.buffers)
+            all_tensors = list(dict.fromkeys(all_tensors))  # Remove duplicates
+
+            self.tensor_to_key = {tensor: f"tensor_{i}" for i, tensor in enumerate(all_tensors)}
+            self.key_to_tensor = {v: k for k, v in self.tensor_to_key.items()}
+            self.cpu_param_dict = {}
+        else:
+            self.cpu_param_dict = self._init_cpu_param_dict()
+
+        self._torch_accelerator_module = (
+            getattr(torch, torch.accelerator.current_accelerator().type)
+            if hasattr(torch, "accelerator")
+            else torch.cuda
+        )
 
     def _init_cpu_param_dict(self):
         cpu_param_dict = {}
@@ -100,71 +144,100 @@ class ModuleGroup:
 
     @contextmanager
     def _pinned_memory_tensors(self):
-        pinned_dict = {}
         try:
-
-                if not tensor.is_pinned()
-
-
-                    pinned_dict[param] = tensor
-
+            pinned_dict = {
+                param: tensor.pin_memory() if not tensor.is_pinned() else tensor
+                for param, tensor in self.cpu_param_dict.items()
+            }
             yield pinned_dict
-
         finally:
             pinned_dict = None
 
-    def
-
-
-
+    def _transfer_tensor_to_device(self, tensor, source_tensor):
+        tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
+        if self.record_stream:
+            tensor.data.record_stream(self._torch_accelerator_module.current_stream())
+
+    def _process_tensors_from_modules(self, pinned_memory=None):
+        for group_module in self.modules:
+            for param in group_module.parameters():
+                source = pinned_memory[param] if pinned_memory else param.data
+                self._transfer_tensor_to_device(param, source)
+            for buffer in group_module.buffers():
+                source = pinned_memory[buffer] if pinned_memory else buffer.data
+                self._transfer_tensor_to_device(buffer, source)
+
+        for param in self.parameters:
+            source = pinned_memory[param] if pinned_memory else param.data
+            self._transfer_tensor_to_device(param, source)
+
+        for buffer in self.buffers:
+            source = pinned_memory[buffer] if pinned_memory else buffer.data
+            self._transfer_tensor_to_device(buffer, source)
 
+    def _onload_from_disk(self):
         if self.stream is not None:
             # Wait for previous Host->Device transfer to complete
             self.stream.synchronize()
 
+        context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream)
+        current_stream = self._torch_accelerator_module.current_stream() if self.record_stream else None
+
         with context:
-            if
-
-
-                    for param in group_module.parameters():
-                        param.data = pinned_memory[param].to(self.onload_device, non_blocking=self.non_blocking)
-                        if self.record_stream:
-                            param.data.record_stream(current_stream)
-                    for buffer in group_module.buffers():
-                        buffer.data = pinned_memory[buffer].to(self.onload_device, non_blocking=self.non_blocking)
-                        if self.record_stream:
-                            buffer.data.record_stream(current_stream)
-
-                for param in self.parameters:
-                    param.data = pinned_memory[param].to(self.onload_device, non_blocking=self.non_blocking)
-                    if self.record_stream:
-                        param.data.record_stream(current_stream)
-
-                for buffer in self.buffers:
-                    buffer.data = pinned_memory[buffer].to(self.onload_device, non_blocking=self.non_blocking)
-                    if self.record_stream:
-                        buffer.data.record_stream(current_stream)
+            # Load to CPU (if using streams) or directly to target device, pin, and async copy to device
+            device = str(self.onload_device) if self.stream is None else "cpu"
+            loaded_tensors = safetensors.torch.load_file(self.safetensors_file_path, device=device)
 
+            if self.stream is not None:
+                for key, tensor_obj in self.key_to_tensor.items():
+                    pinned_tensor = loaded_tensors[key].pin_memory()
+                    tensor_obj.data = pinned_tensor.to(self.onload_device, non_blocking=self.non_blocking)
+                    if self.record_stream:
+                        tensor_obj.data.record_stream(current_stream)
             else:
-
-
-
-
-
-
-                for param in self.parameters:
-                    param.data = param.data.to(self.onload_device, non_blocking=self.non_blocking)
+                onload_device = (
+                    self.onload_device.type if isinstance(self.onload_device, torch.device) else self.onload_device
+                )
+                loaded_tensors = safetensors.torch.load_file(self.safetensors_file_path, device=onload_device)
+                for key, tensor_obj in self.key_to_tensor.items():
+                    tensor_obj.data = loaded_tensors[key]
 
-
-
-
-
+    def _onload_from_memory(self):
+        if self.stream is not None:
+            # Wait for previous Host->Device transfer to complete
+            self.stream.synchronize()
 
-
-
+        context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream)
+        with context:
+            if self.stream is not None:
+                with self._pinned_memory_tensors() as pinned_memory:
+                    self._process_tensors_from_modules(pinned_memory)
+            else:
+                self._process_tensors_from_modules(None)
+
+    def _offload_to_disk(self):
+        # TODO: we can potentially optimize this code path by checking if the _all_ the desired
+        # safetensor files exist on the disk and if so, skip this step entirely, reducing IO
+        # overhead. Currently, we just check if the given `safetensors_file_path` exists and if not
+        # we perform a write.
+        # Check if the file has been saved in this session or if it already exists on disk.
+        if not self._is_offloaded_to_disk and not os.path.exists(self.safetensors_file_path):
+            os.makedirs(os.path.dirname(self.safetensors_file_path), exist_ok=True)
+            tensors_to_save = {key: tensor.data.to(self.offload_device) for tensor, key in self.tensor_to_key.items()}
+            safetensors.torch.save_file(tensors_to_save, self.safetensors_file_path)
+
+        # The group is now considered offloaded to disk for the rest of the session.
+        self._is_offloaded_to_disk = True
+
+        # We do this to free up the RAM which is still holding the up tensor data.
+        for tensor_obj in self.tensor_to_key.keys():
+            tensor_obj.data = torch.empty_like(tensor_obj.data, device=self.offload_device)
+
+    def _offload_to_memory(self):
         if self.stream is not None:
             if not self.record_stream:
-
+                self._torch_accelerator_module.current_stream().synchronize()
+
             for group_module in self.modules:
                 for param in group_module.parameters():
                     param.data = self.cpu_param_dict[param]
@@ -172,14 +245,29 @@ class ModuleGroup:
                 param.data = self.cpu_param_dict[param]
             for buffer in self.buffers:
                 buffer.data = self.cpu_param_dict[buffer]
-
         else:
             for group_module in self.modules:
-                group_module.to(self.offload_device, non_blocking=
+                group_module.to(self.offload_device, non_blocking=False)
             for param in self.parameters:
-                param.data = param.data.to(self.offload_device, non_blocking=
+                param.data = param.data.to(self.offload_device, non_blocking=False)
            for buffer in self.buffers:
-                buffer.data = buffer.data.to(self.offload_device, non_blocking=
+                buffer.data = buffer.data.to(self.offload_device, non_blocking=False)
+
+    @torch.compiler.disable()
+    def onload_(self):
+        r"""Onloads the group of parameters to the onload_device."""
+        if self.offload_to_disk_path is not None:
+            self._onload_from_disk()
+        else:
+            self._onload_from_memory()
+
+    @torch.compiler.disable()
+    def offload_(self):
+        r"""Offloads the group of parameters to the offload_device."""
+        if self.offload_to_disk_path:
+            self._offload_to_disk()
+        else:
+            self._offload_to_memory()
 
 
 class GroupOffloadingHook(ModelHook):
@@ -192,13 +280,10 @@ class GroupOffloadingHook(ModelHook):
 
     _is_stateful = False
 
-    def __init__(
-        self,
-        group: ModuleGroup,
-        next_group: Optional[ModuleGroup] = None,
-    ) -> None:
+    def __init__(self, group: ModuleGroup, *, config: GroupOffloadingConfig) -> None:
         self.group = group
-        self.next_group =
+        self.next_group: Optional[ModuleGroup] = None
+        self.config = config
 
     def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
         if self.group.offload_leader == module:
@@ -217,9 +302,23 @@ class GroupOffloadingHook(ModelHook):
         if self.group.onload_leader == module:
             if self.group.onload_self:
                 self.group.onload_()
-
+
+            should_onload_next_group = self.next_group is not None and not self.next_group.onload_self
+            if should_onload_next_group:
                 self.next_group.onload_()
 
+            should_synchronize = (
+                not self.group.onload_self and self.group.stream is not None and not should_onload_next_group
+            )
+            if should_synchronize:
+                # If this group didn't onload itself, it means it was asynchronously onloaded by the
+                # previous group. We need to synchronize the side stream to ensure parameters
+                # are completely loaded to proceed with forward pass. Without this, uninitialized
+                # weights will be used in the computation, leading to incorrect results
+                # Also, we should only do this synchronization if we don't already do it from the sync call in
+                # self.next_group.onload_, hence the `not should_onload_next_group` check.
+                self.group.stream.synchronize()
+
         args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
         kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
         return args, kwargs
````
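The synchronization added to `GroupOffloadingHook.pre_forward` above implements a double-buffering pattern: while group N computes, group N+1's weights are copied host-to-device on a side stream, and that stream is synchronized before the prefetched weights are consumed. Below is a rough, self-contained illustration of the same pattern in plain PyTorch; it is not diffusers code, it assumes a CUDA device, and the shapes and group count are arbitrary.

```python
import torch

# Toy stand-ins for the per-group weights that ModuleGroup manages.
side_stream = torch.cuda.Stream()
weights_cpu = [torch.randn(1024, 1024, pin_memory=True) for _ in range(4)]
weights_gpu = [None] * len(weights_cpu)

def prefetch(i: int) -> None:
    # Mirrors onload_(): asynchronous H2D copy on the side stream from pinned memory.
    with torch.cuda.stream(side_stream):
        weights_gpu[i] = weights_cpu[i].to("cuda", non_blocking=True)

x = torch.randn(8, 1024, device="cuda")
prefetch(0)
for i in range(len(weights_cpu)):
    # Mirrors self.group.stream.synchronize(): ensure group i is fully onloaded
    # before compute; otherwise uninitialized weights could be read.
    side_stream.synchronize()
    if i + 1 < len(weights_cpu):
        prefetch(i + 1)  # mirrors self.next_group.onload_()
    x = x @ weights_gpu[i]  # compute for group i overlaps the copy for group i + 1
```

The `record_stream` branch in the diff is an alternative way to express this cross-stream dependency to the CUDA caching allocator, at the documented cost of slightly higher memory usage.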
@@ -232,7 +331,7 @@ class GroupOffloadingHook(ModelHook):
|
|
232
331
|
|
233
332
|
class LazyPrefetchGroupOffloadingHook(ModelHook):
|
234
333
|
r"""
|
235
|
-
A hook, used in
|
334
|
+
A hook, used in conjunction with GroupOffloadingHook, that applies lazy prefetching to groups of torch.nn.Module.
|
236
335
|
This hook is used to determine the order in which the layers are executed during the forward pass. Once the layer
|
237
336
|
invocation order is known, assignments of the next_group attribute for prefetching can be made, which allows
|
238
337
|
prefetching groups in the correct order.
|
@@ -247,7 +346,8 @@ class LazyPrefetchGroupOffloadingHook(ModelHook):
|
|
247
346
|
def initialize_hook(self, module):
|
248
347
|
def make_execution_order_update_callback(current_name, current_submodule):
|
249
348
|
def callback():
|
250
|
-
|
349
|
+
if not torch.compiler.is_compiling():
|
350
|
+
logger.debug(f"Adding {current_name} to the execution order")
|
251
351
|
self.execution_order.append((current_name, current_submodule))
|
252
352
|
|
253
353
|
return callback
|
@@ -284,12 +384,13 @@ class LazyPrefetchGroupOffloadingHook(ModelHook):
|
|
284
384
|
# if the missing layers end up being executed in the future.
|
285
385
|
if execution_order_module_names != self._layer_execution_tracker_module_names:
|
286
386
|
unexecuted_layers = list(self._layer_execution_tracker_module_names - execution_order_module_names)
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
387
|
+
if not torch.compiler.is_compiling():
|
388
|
+
logger.warning(
|
389
|
+
"It seems like some layers were not executed during the forward pass. This may lead to problems when "
|
390
|
+
"applying lazy prefetching with automatic tracing and lead to device-mismatch related errors. Please "
|
391
|
+
"make sure that all layers are executed during the forward pass. The following layers were not executed:\n"
|
392
|
+
f"{unexecuted_layers=}"
|
393
|
+
)
|
293
394
|
|
294
395
|
# Remove the layer execution tracker hooks from the submodules
|
295
396
|
base_module_registry = module._diffusers_hook
|
@@ -317,7 +418,8 @@ class LazyPrefetchGroupOffloadingHook(ModelHook):
|
|
317
418
|
for i in range(num_executed - 1):
|
318
419
|
name1, _ = self.execution_order[i]
|
319
420
|
name2, _ = self.execution_order[i + 1]
|
320
|
-
|
421
|
+
if not torch.compiler.is_compiling():
|
422
|
+
logger.debug(f"Applying lazy prefetch group offloading from {name1} to {name2}")
|
321
423
|
group_offloading_hooks[i].next_group = group_offloading_hooks[i + 1].group
|
322
424
|
group_offloading_hooks[i].next_group.onload_self = False
|
323
425
|
|
@@ -342,14 +444,15 @@ class LayerExecutionTrackerHook(ModelHook):
|
|
342
444
|
|
343
445
|
def apply_group_offloading(
|
344
446
|
module: torch.nn.Module,
|
345
|
-
onload_device: torch.device,
|
346
|
-
offload_device: torch.device = torch.device("cpu"),
|
347
|
-
offload_type: str = "block_level",
|
447
|
+
onload_device: Union[str, torch.device],
|
448
|
+
offload_device: Union[str, torch.device] = torch.device("cpu"),
|
449
|
+
offload_type: Union[str, GroupOffloadingType] = "block_level",
|
348
450
|
num_blocks_per_group: Optional[int] = None,
|
349
451
|
non_blocking: bool = False,
|
350
452
|
use_stream: bool = False,
|
351
453
|
record_stream: bool = False,
|
352
454
|
low_cpu_mem_usage: bool = False,
|
455
|
+
offload_to_disk_path: Optional[str] = None,
|
353
456
|
) -> None:
|
354
457
|
r"""
|
355
458
|
Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, and
|
@@ -385,9 +488,12 @@ def apply_group_offloading(
|
|
385
488
|
The device to which the group of modules are onloaded.
|
386
489
|
offload_device (`torch.device`, defaults to `torch.device("cpu")`):
|
387
490
|
The device to which the group of modules are offloaded. This should typically be the CPU. Default is CPU.
|
388
|
-
offload_type (`str`, defaults to "block_level"):
|
491
|
+
offload_type (`str` or `GroupOffloadingType`, defaults to "block_level"):
|
389
492
|
The type of offloading to be applied. Can be one of "block_level" or "leaf_level". Default is
|
390
493
|
"block_level".
|
494
|
+
offload_to_disk_path (`str`, *optional*, defaults to `None`):
|
495
|
+
The path to the directory where parameters will be offloaded. Setting this option can be useful in limited
|
496
|
+
RAM environment settings where a reasonable speed-memory trade-off is desired.
|
391
497
|
num_blocks_per_group (`int`, *optional*):
|
392
498
|
The number of blocks per group when using offload_type="block_level". This is required when using
|
393
499
|
offload_type="block_level".
|
@@ -425,80 +531,61 @@ def apply_group_offloading(
|
|
425
531
|
```
|
426
532
|
"""
|
427
533
|
|
534
|
+
onload_device = torch.device(onload_device) if isinstance(onload_device, str) else onload_device
|
535
|
+
offload_device = torch.device(offload_device) if isinstance(offload_device, str) else offload_device
|
536
|
+
offload_type = GroupOffloadingType(offload_type)
|
537
|
+
|
428
538
|
stream = None
|
429
539
|
if use_stream:
|
430
540
|
if torch.cuda.is_available():
|
431
541
|
stream = torch.cuda.Stream()
|
542
|
+
elif hasattr(torch, "xpu") and torch.xpu.is_available():
|
543
|
+
stream = torch.Stream()
|
432
544
|
else:
|
433
|
-
raise ValueError("Using streams for data transfer requires a CUDA device.")
|
545
|
+
raise ValueError("Using streams for data transfer requires a CUDA device, or an Intel XPU device.")
|
546
|
+
|
547
|
+
if not use_stream and record_stream:
|
548
|
+
raise ValueError("`record_stream` cannot be True when `use_stream=False`.")
|
549
|
+
if offload_type == GroupOffloadingType.BLOCK_LEVEL and num_blocks_per_group is None:
|
550
|
+
raise ValueError("`num_blocks_per_group` must be provided when using `offload_type='block_level'.")
|
434
551
|
|
435
552
|
_raise_error_if_accelerate_model_or_sequential_hook_present(module)
|
436
553
|
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
non_blocking=non_blocking,
|
457
|
-
stream=stream,
|
458
|
-
record_stream=record_stream,
|
459
|
-
low_cpu_mem_usage=low_cpu_mem_usage,
|
460
|
-
)
|
554
|
+
config = GroupOffloadingConfig(
|
555
|
+
onload_device=onload_device,
|
556
|
+
offload_device=offload_device,
|
557
|
+
offload_type=offload_type,
|
558
|
+
num_blocks_per_group=num_blocks_per_group,
|
559
|
+
non_blocking=non_blocking,
|
560
|
+
stream=stream,
|
561
|
+
record_stream=record_stream,
|
562
|
+
low_cpu_mem_usage=low_cpu_mem_usage,
|
563
|
+
offload_to_disk_path=offload_to_disk_path,
|
564
|
+
)
|
565
|
+
_apply_group_offloading(module, config)
|
566
|
+
|
567
|
+
|
568
|
+
def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
|
569
|
+
if config.offload_type == GroupOffloadingType.BLOCK_LEVEL:
|
570
|
+
_apply_group_offloading_block_level(module, config)
|
571
|
+
elif config.offload_type == GroupOffloadingType.LEAF_LEVEL:
|
572
|
+
_apply_group_offloading_leaf_level(module, config)
|
461
573
|
else:
|
462
|
-
|
574
|
+
assert False
|
463
575
|
|
464
576
|
|
-def _apply_group_offloading_block_level(
-    module: torch.nn.Module,
-    num_blocks_per_group: int,
-    offload_device: torch.device,
-    onload_device: torch.device,
-    non_blocking: bool,
-    stream: Optional[torch.cuda.Stream] = None,
-    record_stream: Optional[bool] = False,
-    low_cpu_mem_usage: bool = False,
-) -> None:
+def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
     r"""
     This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks. In comparison to
     the "leaf_level" offloading, which is more fine-grained, this offloading is done at the top-level blocks.
-
-    Args:
-        module (`torch.nn.Module`):
-            The module to which group offloading is applied.
-        offload_device (`torch.device`):
-            The device to which the group of modules are offloaded. This should typically be the CPU.
-        onload_device (`torch.device`):
-            The device to which the group of modules are onloaded.
-        non_blocking (`bool`):
-            If True, offloading and onloading is done asynchronously. This can be useful for overlapping computation
-            and data transfer.
-        stream (`torch.cuda.Stream`, *optional*):
-            If provided, offloading and onloading is done asynchronously using the provided stream. This can be useful
-            for overlapping computation and data transfer.
-        record_stream (`bool`, defaults to `False`): When enabled with `use_stream`, it marks the current tensor
-            as having been used by this stream. It is faster at the expense of slightly more memory usage. Refer to the
-            [PyTorch official docs](https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html) more
-            details.
-        low_cpu_mem_usage (`bool`, defaults to `False`):
-            If True, the CPU memory usage is minimized by pinning tensors on-the-fly instead of pre-pinning them. This
-            option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be useful when
-            the CPU memory is a bottleneck but may counteract the benefits of using streams.
     """
 
+    if config.stream is not None and config.num_blocks_per_group != 1:
+        logger.warning(
+            f"Using streams is only supported for num_blocks_per_group=1. Got {config.num_blocks_per_group=}. Setting it to 1."
+        )
+        config.num_blocks_per_group = 1
+
     # Create module groups for ModuleList and Sequential blocks
     modules_with_group_offloading = set()
     unmatched_modules = []
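Block-level offloading walks `ModuleList`/`Sequential` children in strides of `num_blocks_per_group` (forced to 1 when streams are active, per the warning above). A toy illustration of the partitioning, with hypothetical sizes:

```python
blocks = list(range(5))  # stand-in for a ModuleList with 5 transformer blocks
num_blocks_per_group = 2
groups = [blocks[i : i + num_blocks_per_group] for i in range(0, len(blocks), num_blocks_per_group)]
assert groups == [[0, 1], [2, 3], [4]]  # the trailing group may be smaller
```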
@@ -509,19 +596,22 @@ def _apply_group_offloading_block_level(
             modules_with_group_offloading.add(name)
             continue
 
-        for i in range(0, len(submodule), num_blocks_per_group):
-            current_modules = submodule[i : i + num_blocks_per_group]
+        for i in range(0, len(submodule), config.num_blocks_per_group):
+            current_modules = submodule[i : i + config.num_blocks_per_group]
+            group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
             group = ModuleGroup(
                 modules=current_modules,
-                offload_device=offload_device,
-                onload_device=onload_device,
+                offload_device=config.offload_device,
+                onload_device=config.onload_device,
+                offload_to_disk_path=config.offload_to_disk_path,
                 offload_leader=current_modules[-1],
                 onload_leader=current_modules[0],
-                non_blocking=non_blocking,
-                stream=stream,
-                record_stream=record_stream,
-                low_cpu_mem_usage=low_cpu_mem_usage,
-                onload_self=
+                non_blocking=config.non_blocking,
+                stream=config.stream,
+                record_stream=config.record_stream,
+                low_cpu_mem_usage=config.low_cpu_mem_usage,
+                onload_self=True,
+                group_id=group_id,
             )
             matched_module_groups.append(group)
             for j in range(i, i + len(current_modules)):
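Each group now also carries a deterministic `group_id` of the form `{name}_{first}_{last}`, which later feeds the on-disk naming (see `_compute_group_hash` further down). Applied to the toy 5-block list from the previous sketch:

```python
name, step, blocks = "transformer_blocks", 2, list(range(5))
group_ids = [
    f"{name}_{i}_{i + len(blocks[i : i + step]) - 1}"
    for i in range(0, len(blocks), step)
]
assert group_ids == ["transformer_blocks_0_1", "transformer_blocks_2_3", "transformer_blocks_4_4"]
```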
@@ -529,12 +619,8 @@ def _apply_group_offloading_block_level(
 
     # Apply group offloading hooks to the module groups
     for i, group in enumerate(matched_module_groups):
-        next_group = (
-            matched_module_groups[i + 1] if i + 1 < len(matched_module_groups) and stream is not None else None
-        )
-
         for group_module in group.modules:
-            _apply_group_offloading_hook(group_module, group,
+            _apply_group_offloading_hook(group_module, group, config=config)
 
     # Parameters and Buffers of the top-level module need to be offloaded/onloaded separately
     # when the forward pass of this module is called. This is because the top-level module is not
@@ -549,8 +635,9 @@ def _apply_group_offloading_block_level(
     unmatched_modules = [unmatched_module for _, unmatched_module in unmatched_modules]
     unmatched_group = ModuleGroup(
         modules=unmatched_modules,
-        offload_device=offload_device,
-        onload_device=onload_device,
+        offload_device=config.offload_device,
+        onload_device=config.onload_device,
+        offload_to_disk_path=config.offload_to_disk_path,
         offload_leader=module,
         onload_leader=module,
         parameters=parameters,
@@ -559,67 +646,41 @@ def _apply_group_offloading_block_level(
         stream=None,
         record_stream=False,
         onload_self=True,
+        group_id=f"{module.__class__.__name__}_unmatched_group",
     )
-
-
+    if config.stream is None:
+        _apply_group_offloading_hook(module, unmatched_group, config=config)
+    else:
+        _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
 
 
-def _apply_group_offloading_leaf_level(
-    module: torch.nn.Module,
-    offload_device: torch.device,
-    onload_device: torch.device,
-    non_blocking: bool,
-    stream: Optional[torch.cuda.Stream] = None,
-    record_stream: Optional[bool] = False,
-    low_cpu_mem_usage: bool = False,
-) -> None:
+def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
     r"""
     This function applies offloading to groups of leaf modules in a torch.nn.Module. This method has minimal memory
     requirements. However, it can be slower compared to other offloading methods due to the excessive number of device
     synchronizations. When using devices that support streams to overlap data transfer and computation, this method can
     reduce memory usage without any performance degradation.
-
-    Args:
-        module (`torch.nn.Module`):
-            The module to which group offloading is applied.
-        offload_device (`torch.device`):
-            The device to which the group of modules are offloaded. This should typically be the CPU.
-        onload_device (`torch.device`):
-            The device to which the group of modules are onloaded.
-        non_blocking (`bool`):
-            If True, offloading and onloading is done asynchronously. This can be useful for overlapping computation
-            and data transfer.
-        stream (`torch.cuda.Stream`, *optional*):
-            If provided, offloading and onloading is done asynchronously using the provided stream. This can be useful
-            for overlapping computation and data transfer.
-        record_stream (`bool`, defaults to `False`): When enabled with `use_stream`, it marks the current tensor
-            as having been used by this stream. It is faster at the expense of slightly more memory usage. Refer to the
-            [PyTorch official docs](https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html) more
-            details.
-        low_cpu_mem_usage (`bool`, defaults to `False`):
-            If True, the CPU memory usage is minimized by pinning tensors on-the-fly instead of pre-pinning them. This
-            option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be useful when
-            the CPU memory is a bottleneck but may counteract the benefits of using streams.
     """
-
     # Create module groups for leaf modules and apply group offloading hooks
     modules_with_group_offloading = set()
     for name, submodule in module.named_modules():
-        if not isinstance(submodule,
+        if not isinstance(submodule, _GO_LC_SUPPORTED_PYTORCH_LAYERS):
             continue
         group = ModuleGroup(
             modules=[submodule],
-            offload_device=offload_device,
-            onload_device=onload_device,
+            offload_device=config.offload_device,
+            onload_device=config.onload_device,
+            offload_to_disk_path=config.offload_to_disk_path,
             offload_leader=submodule,
             onload_leader=submodule,
-            non_blocking=non_blocking,
-            stream=stream,
-            record_stream=record_stream,
-            low_cpu_mem_usage=low_cpu_mem_usage,
+            non_blocking=config.non_blocking,
+            stream=config.stream,
+            record_stream=config.record_stream,
+            low_cpu_mem_usage=config.low_cpu_mem_usage,
             onload_self=True,
+            group_id=name,
         )
-        _apply_group_offloading_hook(submodule, group,
+        _apply_group_offloading_hook(submodule, group, config=config)
         modules_with_group_offloading.add(name)
 
     # Parameters and Buffers at all non-leaf levels need to be offloaded/onloaded separately when the forward pass
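For completeness, a hedged sketch of enabling this leaf-level path through the public entry point; the checkpoint name and devices are placeholders, and with `use_stream=True` the transfers overlap computation as the docstring above describes:

```python
import torch
from diffusers import AutoModel
from diffusers.hooks import apply_group_offloading

model = AutoModel.from_pretrained("some/checkpoint", subfolder="transformer", torch_dtype=torch.bfloat16)
apply_group_offloading(
    model,
    onload_device=torch.device("cuda"),
    offload_device=torch.device("cpu"),
    offload_type="leaf_level",  # smallest memory footprint
    use_stream=True,            # overlap H2D/D2H copies with compute
    record_stream=True,         # valid only together with use_stream=True
)
```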
@@ -650,31 +711,33 @@ def _apply_group_offloading_leaf_level(
         parameters = parent_to_parameters.get(name, [])
         buffers = parent_to_buffers.get(name, [])
         parent_module = module_dict[name]
-        assert getattr(parent_module, "_diffusers_hook", None) is None
         group = ModuleGroup(
             modules=[],
-            offload_device=offload_device,
-            onload_device=onload_device,
+            offload_device=config.offload_device,
+            onload_device=config.onload_device,
             offload_leader=parent_module,
             onload_leader=parent_module,
+            offload_to_disk_path=config.offload_to_disk_path,
             parameters=parameters,
             buffers=buffers,
-            non_blocking=non_blocking,
-            stream=stream,
-            record_stream=record_stream,
-            low_cpu_mem_usage=low_cpu_mem_usage,
+            non_blocking=config.non_blocking,
+            stream=config.stream,
+            record_stream=config.record_stream,
+            low_cpu_mem_usage=config.low_cpu_mem_usage,
             onload_self=True,
+            group_id=name,
         )
-        _apply_group_offloading_hook(parent_module, group,
+        _apply_group_offloading_hook(parent_module, group, config=config)
 
-    if stream is not None:
+    if config.stream is not None:
         # When using streams, we need to know the layer execution order for applying prefetching (to overlap data transfer
         # and computation). Since we don't know the order beforehand, we apply a lazy prefetching hook that will find the
         # execution order and apply prefetching in the correct order.
         unmatched_group = ModuleGroup(
             modules=[],
-            offload_device=offload_device,
-            onload_device=onload_device,
+            offload_device=config.offload_device,
+            onload_device=config.onload_device,
+            offload_to_disk_path=config.offload_to_disk_path,
             offload_leader=module,
             onload_leader=module,
             parameters=None,
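The comment above is the heart of the streamed path: the execution order is unknown until the first forward pass, so a lazy hook records it and only then wires up prefetching. A toy version of the order-recording idea (ours, not the library's `LazyPrefetchGroupOffloadingHook`):

```python
import torch

execution_order = []

def track(name):
    def hook(module, args, output):
        execution_order.append(name)  # record which layer ran, in order
        return output
    return hook

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2))
for name, layer in model.named_children():
    layer.register_forward_hook(track(name))

model(torch.randn(1, 4))
assert execution_order == ["0", "1", "2"]  # now group i+1 can be prefetched while group i runs
```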
@@ -682,37 +745,40 @@ def _apply_group_offloading_leaf_level(
             non_blocking=False,
             stream=None,
             record_stream=False,
-            low_cpu_mem_usage=low_cpu_mem_usage,
+            low_cpu_mem_usage=config.low_cpu_mem_usage,
             onload_self=True,
+            group_id=_GROUP_ID_LAZY_LEAF,
         )
-        _apply_lazy_group_offloading_hook(module, unmatched_group,
+        _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
 
 
 def _apply_group_offloading_hook(
     module: torch.nn.Module,
     group: ModuleGroup,
-
+    *,
+    config: GroupOffloadingConfig,
 ) -> None:
     registry = HookRegistry.check_if_exists_or_initialize(module)
 
     # We may have already registered a group offloading hook if the module had a torch.nn.Parameter whose parent
     # is the current module. In such cases, we don't want to overwrite the existing group offloading hook.
     if registry.get_hook(_GROUP_OFFLOADING) is None:
-        hook = GroupOffloadingHook(group,
+        hook = GroupOffloadingHook(group, config=config)
         registry.register_hook(hook, _GROUP_OFFLOADING)
 
 
 def _apply_lazy_group_offloading_hook(
     module: torch.nn.Module,
     group: ModuleGroup,
-
+    *,
+    config: GroupOffloadingConfig,
 ) -> None:
     registry = HookRegistry.check_if_exists_or_initialize(module)
 
     # We may have already registered a group offloading hook if the module had a torch.nn.Parameter whose parent
     # is the current module. In such cases, we don't want to overwrite the existing group offloading hook.
     if registry.get_hook(_GROUP_OFFLOADING) is None:
-        hook = GroupOffloadingHook(group,
+        hook = GroupOffloadingHook(group, config=config)
         registry.register_hook(hook, _GROUP_OFFLOADING)
 
     lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook()
@@ -779,15 +845,54 @@ def _raise_error_if_accelerate_model_or_sequential_hook_present(module: torch.nn
     )
 
 
-def
+def _get_top_level_group_offload_hook(module: torch.nn.Module) -> Optional[GroupOffloadingHook]:
     for submodule in module.modules():
-        if hasattr(submodule, "_diffusers_hook")
-
-
+        if hasattr(submodule, "_diffusers_hook"):
+            group_offloading_hook = submodule._diffusers_hook.get_hook(_GROUP_OFFLOADING)
+            if group_offloading_hook is not None:
+                return group_offloading_hook
+    return None
+
+
+def _is_group_offload_enabled(module: torch.nn.Module) -> bool:
+    top_level_group_offload_hook = _get_top_level_group_offload_hook(module)
+    return top_level_group_offload_hook is not None
 
 
 def _get_group_onload_device(module: torch.nn.Module) -> torch.device:
-
-
-
+    top_level_group_offload_hook = _get_top_level_group_offload_hook(module)
+    if top_level_group_offload_hook is not None:
+        return top_level_group_offload_hook.config.onload_device
     raise ValueError("Group offloading is not enabled for the provided module.")
+
+
+def _compute_group_hash(group_id):
+    hashed_id = hashlib.sha256(group_id.encode("utf-8")).hexdigest()
+    # first 16 characters for a reasonably short but unique name
+    return hashed_id[:16]
+
+
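`_compute_group_hash` turns a `group_id` into a short, stable tag, presumably to keep file names compact when `offload_to_disk_path` is set (the exact file-naming scheme is outside this hunk). What it produces:

```python
import hashlib

group_id = "transformer_blocks_0_1"
tag = hashlib.sha256(group_id.encode("utf-8")).hexdigest()[:16]
print(tag)  # 16 hex characters, deterministic for a given group_id
```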
+def _maybe_remove_and_reapply_group_offloading(module: torch.nn.Module) -> None:
+    r"""
+    Removes the group offloading hook from the module and re-applies it. This is useful when the module has been
+    modified in-place and the group offloading hook's references to tensors need to be updated. The in-place
+    modification can happen in a number of ways, for example, fusing QKV or unloading/loading LoRAs on-the-fly.
+
+    In this implementation, we assume that group offloading has only been applied at the top-level module, and
+    therefore all submodules have the same onload and offload devices. If this assumption is not true, say in the
+    case where the user has applied group offloading at multiple levels, this function will not work as expected.
+
+    There is some performance penalty associated with doing this when non-default streams are used, because we need to
+    retrace the execution order of the layers with `LazyPrefetchGroupOffloadingHook`.
+    """
+    top_level_group_offload_hook = _get_top_level_group_offload_hook(module)
+
+    if top_level_group_offload_hook is None:
+        return
+
+    registry = HookRegistry.check_if_exists_or_initialize(module)
+    registry.remove_hook(_GROUP_OFFLOADING, recurse=True)
+    registry.remove_hook(_LAYER_EXECUTION_TRACKER, recurse=True)
+    registry.remove_hook(_LAZY_PREFETCH_GROUP_OFFLOADING, recurse=True)
+
+    _apply_group_offloading(module, top_level_group_offload_hook.config)