cache-dit 0.3.2__py3-none-any.whl → 1.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cache_dit/__init__.py +37 -19
- cache_dit/_version.py +2 -2
- cache_dit/caching/__init__.py +36 -0
- cache_dit/{cache_factory → caching}/block_adapters/__init__.py +149 -18
- cache_dit/{cache_factory → caching}/block_adapters/block_adapters.py +91 -7
- cache_dit/caching/block_adapters/block_registers.py +118 -0
- cache_dit/caching/cache_adapters/__init__.py +1 -0
- cache_dit/{cache_factory → caching}/cache_adapters/cache_adapter.py +262 -123
- cache_dit/caching/cache_blocks/__init__.py +226 -0
- cache_dit/caching/cache_blocks/offload_utils.py +115 -0
- cache_dit/caching/cache_blocks/pattern_0_1_2.py +26 -0
- cache_dit/caching/cache_blocks/pattern_3_4_5.py +543 -0
- cache_dit/caching/cache_blocks/pattern_base.py +748 -0
- cache_dit/caching/cache_blocks/pattern_utils.py +86 -0
- cache_dit/caching/cache_contexts/__init__.py +28 -0
- cache_dit/caching/cache_contexts/cache_config.py +120 -0
- cache_dit/{cache_factory → caching}/cache_contexts/cache_context.py +29 -90
- cache_dit/{cache_factory → caching}/cache_contexts/cache_manager.py +138 -10
- cache_dit/{cache_factory → caching}/cache_contexts/calibrators/__init__.py +25 -3
- cache_dit/{cache_factory → caching}/cache_contexts/calibrators/foca.py +1 -1
- cache_dit/{cache_factory → caching}/cache_contexts/calibrators/taylorseer.py +81 -9
- cache_dit/caching/cache_contexts/context_manager.py +36 -0
- cache_dit/caching/cache_contexts/prune_config.py +63 -0
- cache_dit/caching/cache_contexts/prune_context.py +155 -0
- cache_dit/caching/cache_contexts/prune_manager.py +167 -0
- cache_dit/caching/cache_interface.py +358 -0
- cache_dit/{cache_factory → caching}/cache_types.py +19 -2
- cache_dit/{cache_factory → caching}/forward_pattern.py +14 -14
- cache_dit/{cache_factory → caching}/params_modifier.py +10 -10
- cache_dit/caching/patch_functors/__init__.py +15 -0
- cache_dit/{cache_factory → caching}/patch_functors/functor_chroma.py +1 -1
- cache_dit/{cache_factory → caching}/patch_functors/functor_dit.py +1 -1
- cache_dit/{cache_factory → caching}/patch_functors/functor_flux.py +1 -1
- cache_dit/{cache_factory → caching}/patch_functors/functor_hidream.py +2 -4
- cache_dit/{cache_factory → caching}/patch_functors/functor_hunyuan_dit.py +1 -1
- cache_dit/caching/patch_functors/functor_qwen_image_controlnet.py +263 -0
- cache_dit/caching/utils.py +68 -0
- cache_dit/metrics/__init__.py +11 -0
- cache_dit/metrics/metrics.py +3 -0
- cache_dit/parallelism/__init__.py +3 -0
- cache_dit/parallelism/backends/native_diffusers/__init__.py +6 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/__init__.py +164 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/attention/__init__.py +4 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/attention/_attention_dispatch.py +304 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_chroma.py +95 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_cogvideox.py +202 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_cogview.py +299 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_cosisid.py +123 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_dit.py +94 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_flux.py +88 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_hunyuan.py +729 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_ltxvideo.py +264 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_nunchaku.py +407 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_pixart.py +285 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_qwen_image.py +104 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_registers.py +84 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_wan.py +101 -0
- cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_planners.py +117 -0
- cache_dit/parallelism/backends/native_diffusers/parallel_difffusers.py +49 -0
- cache_dit/parallelism/backends/native_diffusers/utils.py +11 -0
- cache_dit/parallelism/backends/native_pytorch/__init__.py +6 -0
- cache_dit/parallelism/backends/native_pytorch/parallel_torch.py +62 -0
- cache_dit/parallelism/backends/native_pytorch/tensor_parallelism/__init__.py +48 -0
- cache_dit/parallelism/backends/native_pytorch/tensor_parallelism/tp_plan_flux.py +171 -0
- cache_dit/parallelism/backends/native_pytorch/tensor_parallelism/tp_plan_kandinsky5.py +79 -0
- cache_dit/parallelism/backends/native_pytorch/tensor_parallelism/tp_plan_qwen_image.py +78 -0
- cache_dit/parallelism/backends/native_pytorch/tensor_parallelism/tp_plan_registers.py +65 -0
- cache_dit/parallelism/backends/native_pytorch/tensor_parallelism/tp_plan_wan.py +153 -0
- cache_dit/parallelism/backends/native_pytorch/tensor_parallelism/tp_planners.py +14 -0
- cache_dit/parallelism/parallel_backend.py +26 -0
- cache_dit/parallelism/parallel_config.py +88 -0
- cache_dit/parallelism/parallel_interface.py +77 -0
- cache_dit/quantize/__init__.py +7 -0
- cache_dit/quantize/backends/__init__.py +1 -0
- cache_dit/quantize/backends/bitsandbytes/__init__.py +0 -0
- cache_dit/quantize/backends/torchao/__init__.py +1 -0
- cache_dit/quantize/{quantize_ao.py → backends/torchao/quantize_ao.py} +44 -30
- cache_dit/quantize/quantize_backend.py +0 -0
- cache_dit/quantize/quantize_config.py +0 -0
- cache_dit/quantize/quantize_interface.py +3 -16
- cache_dit/summary.py +593 -0
- cache_dit/utils.py +46 -290
- cache_dit-1.0.14.dist-info/METADATA +301 -0
- cache_dit-1.0.14.dist-info/RECORD +102 -0
- cache_dit-1.0.14.dist-info/licenses/LICENSE +203 -0
- cache_dit/cache_factory/__init__.py +0 -28
- cache_dit/cache_factory/block_adapters/block_registers.py +0 -90
- cache_dit/cache_factory/cache_adapters/__init__.py +0 -1
- cache_dit/cache_factory/cache_blocks/__init__.py +0 -72
- cache_dit/cache_factory/cache_blocks/pattern_0_1_2.py +0 -16
- cache_dit/cache_factory/cache_blocks/pattern_3_4_5.py +0 -238
- cache_dit/cache_factory/cache_blocks/pattern_base.py +0 -404
- cache_dit/cache_factory/cache_blocks/utils.py +0 -41
- cache_dit/cache_factory/cache_contexts/__init__.py +0 -14
- cache_dit/cache_factory/cache_interface.py +0 -217
- cache_dit/cache_factory/patch_functors/__init__.py +0 -12
- cache_dit/cache_factory/utils.py +0 -57
- cache_dit-0.3.2.dist-info/METADATA +0 -753
- cache_dit-0.3.2.dist-info/RECORD +0 -56
- cache_dit-0.3.2.dist-info/licenses/LICENSE +0 -53
- /cache_dit/{cache_factory → caching}/.gitignore +0 -0
- /cache_dit/{cache_factory → caching}/cache_contexts/calibrators/base.py +0 -0
- /cache_dit/{cache_factory → caching}/patch_functors/functor_base.py +0 -0
- /cache_dit/{custom_ops → kernels}/__init__.py +0 -0
- /cache_dit/{custom_ops → kernels}/triton_taylorseer.py +0 -0
- {cache_dit-0.3.2.dist-info → cache_dit-1.0.14.dist-info}/WHEEL +0 -0
- {cache_dit-0.3.2.dist-info → cache_dit-1.0.14.dist-info}/entry_points.txt +0 -0
- {cache_dit-0.3.2.dist-info → cache_dit-1.0.14.dist-info}/top_level.txt +0 -0
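The listing above shows the main structural changes in 1.0.14: the `cache_dit.cache_factory` package is renamed to `cache_dit.caching`, `cache_dit.custom_ops` becomes `cache_dit.kernels`, and new `cache_dit.parallelism` (context/tensor parallelism backends) and `cache_dit.quantize.backends` trees are added. A minimal sketch of the import-path migration, using only module paths that appear in the listing (an illustrative sketch, not an exhaustive migration guide):

# cache-dit 0.3.2 layout (old module paths, per the listing above)
from cache_dit.cache_factory import cache_types
import cache_dit.custom_ops.triton_taylorseer

# cache-dit 1.0.14 layout (renamed module paths, per the listing above)
from cache_dit.caching import cache_types
import cache_dit.kernels.triton_taylorseer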
cache_dit/parallelism/backends/native_diffusers/context_parallelism/attention/_attention_dispatch.py
ADDED
@@ -0,0 +1,304 @@
import os
import torch
from typing import Optional

try:
    from diffusers.models.attention_dispatch import (
        _AttentionBackendRegistry,
        AttentionBackendName,
        _check_device,
        _check_shape,
        TemplatedRingAttention,
        TemplatedUlyssesAttention,
    )
    from diffusers.models._modeling_parallel import ParallelConfig
except ImportError:
    raise ImportError(
        "Context parallelism requires the 'diffusers>=0.36.dev0'."
        "Please install latest version of diffusers from source: \n"
        "pip3 install git+https://github.com/huggingface/diffusers.git"
    )
from cache_dit.logger import init_logger

logger = init_logger(__name__)


__all__ = [
    "_native_attention",
]

# Enable custom native attention backend with context parallelism
# by default. Users can set the environment variable to 0 to disable
# this behavior. Default to enabled for better compatibility.
_CACHE_DIT_ENABLE_CUSTOM_CP_NATIVE_ATTN_DISPATCH = bool(
    int(os.getenv("CACHE_DIT_ENABLE_CUSTOM_CP_NATIVE_ATTN_DISPATCH", "1"))
)


def _is_native_attn_supported_context_parallel() -> bool:
    try:
        return (
            AttentionBackendName.NATIVE
            in _AttentionBackendRegistry._supports_context_parallel
            and _AttentionBackendRegistry._supports_context_parallel[
                AttentionBackendName.NATIVE
            ]
        )
    except Exception:
        assert isinstance(
            _AttentionBackendRegistry._supports_context_parallel, set
        )
        return (
            AttentionBackendName.NATIVE.value
            in _AttentionBackendRegistry._supports_context_parallel
        )


if _CACHE_DIT_ENABLE_CUSTOM_CP_NATIVE_ATTN_DISPATCH:
    logger.warning(
        "Re-registering NATIVE attention backend to enable context parallelism. "
        "This is a temporary workaround and should be removed after the native "
        "attention backend supports context parallelism natively. Please check: "
        "https://github.com/huggingface/diffusers/pull/12563 for more details. "
        "Or, you can disable this behavior by setting the environment variable "
        "`CACHE_DIT_ENABLE_CUSTOM_CP_NATIVE_ATTN_DISPATCH=0`."
    )
    _AttentionBackendRegistry._backends.pop(AttentionBackendName.NATIVE)
    _AttentionBackendRegistry._constraints.pop(AttentionBackendName.NATIVE)
    _AttentionBackendRegistry._supported_arg_names.pop(
        AttentionBackendName.NATIVE
    )
    if _is_native_attn_supported_context_parallel():
        if isinstance(
            _AttentionBackendRegistry._supports_context_parallel, dict
        ):
            _AttentionBackendRegistry._supports_context_parallel.pop(
                AttentionBackendName.NATIVE
            )
        else:
            _AttentionBackendRegistry._supports_context_parallel.remove(
                AttentionBackendName.NATIVE.value
            )

    # Re-define templated context parallel attention to support attn mask
    def _templated_context_parallel_attention_v2(
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        dropout_p: float = 0.0,
        is_causal: bool = False,
        scale: Optional[float] = None,
        enable_gqa: bool = False,
        return_lse: bool = False,
        *,
        forward_op,
        backward_op,
        _parallel_config: Optional["ParallelConfig"] = None,
    ):
        if attn_mask is not None:
            # NOTE(DefTruth): Check if forward_op is native attention forward op
            forward_op_name = forward_op.__name__
            if not forward_op_name == "_native_attention_forward_op":
                raise ValueError(
                    "Templated context parallel attention with attn_mask "
                    "is only supported for native attention backend, "
                    f"but got forward_op: {forward_op_name}."
                )
        if is_causal:
            raise ValueError(
                "Causal attention is not yet supported for templated attention."
            )
        if enable_gqa:
            raise ValueError(
                "GQA is not yet supported for templated attention."
            )

        # TODO: add support for unified attention with ring/ulysses degree both being > 1
        if _parallel_config.context_parallel_config.ring_degree > 1:
            return TemplatedRingAttention.apply(
                query,
                key,
                value,
                attn_mask,
                dropout_p,
                is_causal,
                scale,
                enable_gqa,
                return_lse,
                forward_op,
                backward_op,
                _parallel_config,
            )
        elif _parallel_config.context_parallel_config.ulysses_degree > 1:
            return TemplatedUlyssesAttention.apply(
                query,
                key,
                value,
                attn_mask,
                dropout_p,
                is_causal,
                scale,
                enable_gqa,
                return_lse,
                forward_op,
                backward_op,
                _parallel_config,
            )
        else:
            raise ValueError(
                "Reaching this branch of code is unexpected. Please report a bug."
            )

    # NOTE: Remove NATIVE attention backend constraints and re-register it.
    # Here is a temporary workaround to enable context parallelism with
    # native attention backend. We should remove this workaround after
    # the native attention backend supports context parallelism natively.
    # Adapted from: https://github.com/huggingface/diffusers/pull/12563

    def _native_attention_forward_op(
        ctx: torch.autograd.function.FunctionCtx,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        dropout_p: float = 0.0,
        is_causal: bool = False,
        scale: Optional[float] = None,
        enable_gqa: bool = False,
        return_lse: bool = False,
        _save_ctx: bool = True,
        _parallel_config: Optional["ParallelConfig"] = None,
    ):
        # Native attention does not return_lse
        if return_lse:
            raise ValueError(
                "Native attention does not support return_lse=True"
            )

        # used for backward pass
        if _save_ctx:
            ctx.save_for_backward(query, key, value)
            ctx.attn_mask = attn_mask
            ctx.dropout_p = dropout_p
            ctx.is_causal = is_causal
            ctx.scale = scale
            ctx.enable_gqa = enable_gqa

        query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value))
        out = torch.nn.functional.scaled_dot_product_attention(
            query=query,
            key=key,
            value=value,
            attn_mask=attn_mask,
            dropout_p=dropout_p,
            is_causal=is_causal,
            scale=scale,
            enable_gqa=enable_gqa,
        )
        out = out.permute(0, 2, 1, 3)

        return out

    def _native_attention_backward_op(
        ctx: torch.autograd.function.FunctionCtx,
        grad_out: torch.Tensor,
        *args,
        **kwargs,
    ):
        query, key, value = ctx.saved_tensors

        query.requires_grad_(True)
        key.requires_grad_(True)
        value.requires_grad_(True)

        query_t, key_t, value_t = (
            x.permute(0, 2, 1, 3) for x in (query, key, value)
        )
        out = torch.nn.functional.scaled_dot_product_attention(
            query=query_t,
            key=key_t,
            value=value_t,
            attn_mask=ctx.attn_mask,
            dropout_p=ctx.dropout_p,
            is_causal=ctx.is_causal,
            scale=ctx.scale,
            enable_gqa=ctx.enable_gqa,
        )
        out = out.permute(0, 2, 1, 3)

        grad_out_t = grad_out.permute(0, 2, 1, 3)
        grad_query_t, grad_key_t, grad_value_t = torch.autograd.grad(
            outputs=out,
            inputs=[query_t, key_t, value_t],
            grad_outputs=grad_out_t,
            retain_graph=False,
        )

        grad_query = grad_query_t.permute(0, 2, 1, 3)
        grad_key = grad_key_t.permute(0, 2, 1, 3)
        grad_value = grad_value_t.permute(0, 2, 1, 3)

        return grad_query, grad_key, grad_value

    @_AttentionBackendRegistry.register(
        AttentionBackendName.NATIVE,
        constraints=[_check_device, _check_shape],
        supports_context_parallel=True,
    )
    def _native_attention(
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        dropout_p: float = 0.0,
        is_causal: bool = False,
        scale: Optional[float] = None,
        enable_gqa: bool = False,
        return_lse: bool = False,
        _parallel_config: Optional["ParallelConfig"] = None,
    ) -> torch.Tensor:
        if return_lse:
            raise ValueError(
                "Native attention backend does not support setting `return_lse=True`."
            )
        if _parallel_config is None:
            query, key, value = (
                x.permute(0, 2, 1, 3) for x in (query, key, value)
            )
            out = torch.nn.functional.scaled_dot_product_attention(
                query=query,
                key=key,
                value=value,
                attn_mask=attn_mask,
                dropout_p=dropout_p,
                is_causal=is_causal,
                scale=scale,
                enable_gqa=enable_gqa,
            )
            out = out.permute(0, 2, 1, 3)
        else:
            out = _templated_context_parallel_attention_v2(
                query,
                key,
                value,
                attn_mask,
                dropout_p,
                is_causal,
                scale,
                enable_gqa,
                return_lse,
                forward_op=_native_attention_forward_op,
                backward_op=_native_attention_backward_op,
                _parallel_config=_parallel_config,
            )
        return out

else:
    from diffusers.models.attention_dispatch import (
        _native_attention,
    )  # noqa: F401

    logger.info(
        "Native attention backend already supports context parallelism."
    )
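The `_native_attention` backend re-registered above receives `query`, `key` and `value` in (batch, seq_len, heads, head_dim) layout from the dispatcher, while `torch.nn.functional.scaled_dot_product_attention` expects (batch, heads, seq_len, head_dim); that is why both the forward op and the non-parallel path wrap the SDPA call in a `permute(0, 2, 1, 3)` round-trip. A minimal standalone sketch of that layout convention (toy shapes, no context parallelism involved):

import torch
import torch.nn.functional as F

# Layout handed to the backend: (batch, seq_len, heads, head_dim)
q = torch.randn(2, 16, 8, 64)
k = torch.randn(2, 16, 8, 64)
v = torch.randn(2, 16, 8, 64)

# SDPA wants (batch, heads, seq_len, head_dim), so transpose in ...
out = F.scaled_dot_product_attention(
    q.permute(0, 2, 1, 3),
    k.permute(0, 2, 1, 3),
    v.permute(0, 2, 1, 3),
)
# ... and transpose back to the dispatcher layout afterwards
out = out.permute(0, 2, 1, 3)
assert out.shape == (2, 16, 8, 64)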
cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_chroma.py
ADDED
@@ -0,0 +1,95 @@
import torch
from typing import Optional
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.transformers.transformer_chroma import (
    ChromaTransformer2DModel,
)

try:
    from diffusers.models._modeling_parallel import (
        ContextParallelInput,
        ContextParallelOutput,
        ContextParallelModelPlan,
    )
except ImportError:
    raise ImportError(
        "Context parallelism requires the 'diffusers>=0.36.dev0'."
        "Please install latest version of diffusers from source: \n"
        "pip3 install git+https://github.com/huggingface/diffusers.git"
    )
from .cp_plan_registers import (
    ContextParallelismPlanner,
    ContextParallelismPlannerRegister,
)

from cache_dit.logger import init_logger

logger = init_logger(__name__)


@ContextParallelismPlannerRegister.register("Chroma")
class ChromaContextParallelismPlanner(ContextParallelismPlanner):
    def apply(
        self,
        transformer: Optional[torch.nn.Module | ModelMixin] = None,
        **kwargs,
    ) -> ContextParallelModelPlan:

        # NOTE: Diffusers native CP plan still not supported
        # for Chroma now.
        self._cp_planner_preferred_native_diffusers = False

        if (
            transformer is not None
            and self._cp_planner_preferred_native_diffusers
        ):
            assert isinstance(
                transformer, ChromaTransformer2DModel
            ), "Transformer must be an instance of ChromaTransformer2DModel"
            if hasattr(transformer, "_cp_plan"):
                if transformer._cp_plan is not None:
                    return transformer._cp_plan

        # Otherwise, use the custom CP plan defined here, this maybe
        # a little different from the native diffusers implementation
        # for some models.
        _cp_plan = {
            # Here is a Transformer level CP plan for Chroma, which will
            # only apply the only 1 split hook (pre_forward) on the forward
            # of Transformer, and gather the output after Transformer forward.
            # Pattern of transformer forward, split_output=False:
            #   un-split input -> splited input (inside transformer)
            # Pattern of the transformer_blocks, single_transformer_blocks:
            #   splited input (previous splited output) -> to_qkv/...
            #   -> all2all
            #   -> attn (local head, full seqlen)
            #   -> all2all
            #   -> splited output
            # The `hidden_states` and `encoder_hidden_states` will still keep
            # itself splited after block forward (namely, automatic split by
            # the all2all comm op after attn) for the all blocks.
            # img_ids and txt_ids will only be splited once at the very beginning,
            # and keep splited through the whole transformer forward. The all2all
            # comm op only happens on the `out` tensor after local attn not on
            # img_ids and txt_ids.
            "": {
                "hidden_states": ContextParallelInput(
                    split_dim=1, expected_dims=3, split_output=False
                ),
                "encoder_hidden_states": ContextParallelInput(
                    split_dim=1, expected_dims=3, split_output=False
                ),
                "img_ids": ContextParallelInput(
                    split_dim=0, expected_dims=2, split_output=False
                ),
                "txt_ids": ContextParallelInput(
                    split_dim=0, expected_dims=2, split_output=False
                ),
            },
            # Then, the final proj_out will gather the splited output.
            #   splited input (previous splited output)
            #   -> all gather
            #   -> un-split output
            "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3),
        }
        return _cp_plan
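The comments in the Chroma plan above describe sequence-dimension sharding: tensors shaped (batch, seq_len, dim) are split along `split_dim=1` across context-parallel ranks at the transformer entry, and `proj_out` gathers along `gather_dim=1` at the exit. A toy single-process sketch of that split/gather bookkeeping (plain `torch.chunk`/`torch.cat` standing in for the real distributed communication):

import torch

world_size = 2  # number of context-parallel ranks (illustrative)
hidden_states = torch.randn(1, 8, 64)  # (batch, seq_len, dim)

# ContextParallelInput(split_dim=1): each rank keeps one sequence shard
shards = torch.chunk(hidden_states, world_size, dim=1)
assert shards[0].shape == (1, 4, 64)

# ... each rank runs the transformer blocks on its own shard; the all2all
# inside attention still lets every local head see the full sequence ...

# ContextParallelOutput(gather_dim=1): concatenate the shards back at proj_out
gathered = torch.cat(shards, dim=1)
assert gathered.shape == hidden_states.shape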
cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_cogvideox.py
ADDED
@@ -0,0 +1,202 @@
import torch
import functools
from typing import Optional, Tuple
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.transformers.cogvideox_transformer_3d import (
    CogVideoXAttnProcessor2_0,
    CogVideoXTransformer3DModel,
)
from diffusers.models.attention_processor import Attention
from diffusers.models.attention_dispatch import dispatch_attention_fn

try:
    from diffusers.models._modeling_parallel import (
        ContextParallelInput,
        ContextParallelOutput,
        ContextParallelModelPlan,
    )
except ImportError:
    raise ImportError(
        "Context parallelism requires the 'diffusers>=0.36.dev0'."
        "Please install latest version of diffusers from source: \n"
        "pip3 install git+https://github.com/huggingface/diffusers.git"
    )
from .cp_plan_registers import (
    ContextParallelismPlanner,
    ContextParallelismPlannerRegister,
)

from cache_dit.logger import init_logger

logger = init_logger(__name__)


@ContextParallelismPlannerRegister.register("CogVideoX")
class CogVideoXContextParallelismPlanner(ContextParallelismPlanner):
    def apply(
        self,
        transformer: Optional[torch.nn.Module | ModelMixin] = None,
        **kwargs,
    ) -> ContextParallelModelPlan:

        # NOTE: Diffusers native CP plan still not supported
        # for CogVideoX now.
        self._cp_planner_preferred_native_diffusers = False

        if (
            transformer is not None
            and self._cp_planner_preferred_native_diffusers
        ):
            assert isinstance(
                transformer, CogVideoXTransformer3DModel
            ), "Transformer must be an instance of CogVideoXTransformer3DModel"
            if hasattr(transformer, "_cp_plan"):
                if transformer._cp_plan is not None:
                    return transformer._cp_plan

        CogVideoXAttnProcessor2_0.__call__ = (
            __patch_CogVideoXAttnProcessor2_0__call__
        )
        # Also need to patch the parallel config and attention backend
        if not hasattr(CogVideoXAttnProcessor2_0, "_parallel_config"):
            CogVideoXAttnProcessor2_0._parallel_config = None
        if not hasattr(CogVideoXAttnProcessor2_0, "_attention_backend"):
            CogVideoXAttnProcessor2_0._attention_backend = None

        # Otherwise, use the custom CP plan defined here, this maybe
        # a little different from the native diffusers implementation
        # for some models.
        _cp_plan = {
            # Pattern of transformer_blocks.0, split_output=False:
            #   un-split input -> split -> to_qkv/...
            #   -> all2all
            #   -> attn (local head, full seqlen)
            #   -> all2all
            #   -> splited output
            # Pattern of the rest transformer_blocks, split_output=False:
            #   splited input (previous splited output) -> to_qkv/...
            #   -> all2all
            #   -> attn (local head, full seqlen)
            #   -> all2all
            #   -> splited output
            # The `encoder_hidden_states` will be changed after each block forward,
            # so we need to split it at the first block, and keep it splited (namely,
            # automatically split by the all2all op after attn) for the rest blocks.
            # The `out` tensor of local attn will be splited into `hidden_states` and
            # `encoder_hidden_states` after each block forward, thus both of them
            # will be automatically splited by all2all comm op after local attn.
            "transformer_blocks.0": {
                "hidden_states": ContextParallelInput(
                    split_dim=1, expected_dims=3, split_output=False
                ),
                "encoder_hidden_states": ContextParallelInput(
                    split_dim=1, expected_dims=3, split_output=False
                ),
            },
            # Pattern of the image_rotary_emb, split at every block, because the it
            # is not automatically splited by all2all comm op and keep un-splited
            # while the block forward finished:
            #   un-split input -> split output
            #   -> after block forward
            #   -> un-split input
            #   un-split input -> split output
            #   ...
            "transformer_blocks.*": {
                "image_rotary_emb": [
                    ContextParallelInput(
                        split_dim=0, expected_dims=2, split_output=False
                    ),
                    ContextParallelInput(
                        split_dim=0, expected_dims=2, split_output=False
                    ),
                ],
            },
            # transformer forward while using CP, since it is not splited here.
            # Then, the final proj_out will gather the splited output.
            #   splited input (previous splited output)
            #   -> all gather
            #   -> un-split output
            "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3),
        }
        return _cp_plan


@functools.wraps(CogVideoXAttnProcessor2_0.__call__)
def __patch_CogVideoXAttnProcessor2_0__call__(
    self: CogVideoXAttnProcessor2_0,
    attn: Attention,
    hidden_states: torch.Tensor,
    encoder_hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    image_rotary_emb: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    text_seq_length = encoder_hidden_states.size(1)

    hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

    batch_size, sequence_length, _ = hidden_states.shape

    # NOTE(DefTruth): attention mask is always None in CogVideoX
    if attention_mask is not None:
        attention_mask = attn.prepare_attention_mask(
            attention_mask, sequence_length, batch_size
        )
        attention_mask = attention_mask.view(
            batch_size, attn.heads, -1, attention_mask.shape[-1]
        )

    query = attn.to_q(hidden_states)
    key = attn.to_k(hidden_states)
    value = attn.to_v(hidden_states)

    inner_dim = key.shape[-1]
    head_dim = inner_dim // attn.heads

    # NOTE(DefTruth): no transpose
    query = query.view(batch_size, -1, attn.heads, head_dim)
    key = key.view(batch_size, -1, attn.heads, head_dim)
    value = value.view(batch_size, -1, attn.heads, head_dim)

    if attn.norm_q is not None:
        query = attn.norm_q(query)
    if attn.norm_k is not None:
        key = attn.norm_k(key)

    # Apply RoPE if needed
    if image_rotary_emb is not None:
        from diffusers.models.embeddings import apply_rotary_emb

        query[:, text_seq_length:] = apply_rotary_emb(
            query[:, text_seq_length:],
            image_rotary_emb,
            sequence_dim=1,
        )
        if not attn.is_cross_attention:
            key[:, text_seq_length:] = apply_rotary_emb(
                key[:, text_seq_length:],
                image_rotary_emb,
                sequence_dim=1,
            )

    # NOTE(DefTruth): Apply dispatch_attention_fn instead of sdpa directly
    hidden_states = dispatch_attention_fn(
        query,
        key,
        value,
        attn_mask=attention_mask,
        dropout_p=0.0,
        is_causal=False,
        backend=getattr(self, "_attention_backend", None),
        parallel_config=getattr(self, "_parallel_config", None),
    )
    hidden_states = hidden_states.reshape(batch_size, -1, attn.heads * head_dim)

    # linear proj
    hidden_states = attn.to_out[0](hidden_states)
    # dropout
    hidden_states = attn.to_out[1](hidden_states)

    encoder_hidden_states, hidden_states = hidden_states.split(
        [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
    )
    return hidden_states, encoder_hidden_states
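In the patched CogVideoX attention processor above, text and image tokens are concatenated along the sequence dimension before attention and separated again afterwards, which is what keeps both `hidden_states` and `encoder_hidden_states` sharded consistently under context parallelism. A tiny sketch of that concatenate/split bookkeeping (toy shapes, attention itself omitted):

import torch

text = torch.randn(1, 7, 32)    # (batch, text_seq_length, dim)
image = torch.randn(1, 20, 32)  # (batch, image_seq_length, dim)

# Joint sequence: [text tokens | image tokens]
joint = torch.cat([text, image], dim=1)

# ... attention runs over the joint sequence ...

text_seq_length = text.size(1)
text_out, image_out = joint.split(
    [text_seq_length, joint.size(1) - text_seq_length], dim=1
)
assert text_out.shape == text.shape and image_out.shape == image.shape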