PyPI - diffsynth-engine - Versions diffs - 0.3.6.dev8__tar.gz → 0.3.6.dev10__tar.gz - Mend

diffsynth-engine 0.3.6.dev8__tar.gz → 0.3.6.dev10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (166) hide show

{diffsynth_engine-0.3.6.dev8 → diffsynth_engine-0.3.6.dev10}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.3.6.dev8
+Version: 0.3.6.dev10
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent

{diffsynth_engine-0.3.6.dev8 → diffsynth_engine-0.3.6.dev10}/diffsynth_engine/__init__.py RENAMED Viewed

@@ -1,12 +1,14 @@
+from .configs import (
+    SDPipelineConfig,
+    SDXLPipelineConfig,
+    FluxPipelineConfig,
+    WanPipelineConfig,
+)
 from .pipelines import (
     FluxImagePipeline,
     SDXLImagePipeline,
     SDImagePipeline,
     WanVideoPipeline,
-    FluxModelConfig,
-    SDXLModelConfig,
-    SDModelConfig,
-    WanModelConfig,
     ControlNetParams,
 )
 from .models.flux import FluxControlNet, FluxIPAdapter, FluxRedux
@@ -23,6 +25,10 @@ from .tools import (
 )
 __all__ = [
+    "SDPipelineConfig",
+    "SDXLPipelineConfig",
+    "FluxPipelineConfig",
+    "WanPipelineConfig",
     "FluxImagePipeline",
     "FluxControlNet",
     "FluxIPAdapter",
@@ -32,10 +38,6 @@ __all__ = [
     "SDXLImagePipeline",
     "SDImagePipeline",
     "WanVideoPipeline",
-    "FluxModelConfig",
-    "SDXLModelConfig",
-    "SDModelConfig",
-    "WanModelConfig",
     "FluxInpaintingTool",
     "FluxOutpaintingTool",
     "FluxIPAdapterRefTool",

diffsynth_engine-0.3.6.dev10/diffsynth_engine/configs/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+from .pipeline import (
+    BaseConfig,
+    AttentionConfig,
+    OptimizationConfig,
+    ParallelConfig,
+    SDPipelineConfig,
+    SDXLPipelineConfig,
+    FluxPipelineConfig,
+    WanPipelineConfig,
+)
+from .controlnet import ControlType
+__all__ = [
+    "BaseConfig",
+    "AttentionConfig",
+    "OptimizationConfig",
+    "ParallelConfig",
+    "SDPipelineConfig",
+    "SDXLPipelineConfig",
+    "FluxPipelineConfig",
+    "WanPipelineConfig",
+    "ControlType",
+]

diffsynth_engine-0.3.6.dev10/diffsynth_engine/configs/controlnet.py ADDED Viewed

@@ -0,0 +1,17 @@
+from enum import Enum
+# FLUX ControlType
+class ControlType(Enum):
+    """Control variants for FLUX; each member's value is its own name as a string."""
+    normal = "normal"
+    bfl_control = "bfl_control"
+    bfl_fill = "bfl_fill"
+    bfl_kontext = "bfl_kontext"
+    def get_in_channel(self):
+        """Return the channel count tied to this control type: 64, 128 or 384."""
+        # NOTE(review): presumably this value feeds the `in_channel` argument of FluxDiT -- confirm against the caller.
+        if self in [ControlType.normal, ControlType.bfl_kontext]:
+            return 64
+        elif self == ControlType.bfl_control:
+            return 128
+        elif self == ControlType.bfl_fill:
+            return 384
+        # All four current members are handled above; a newly added member would fall through and return None.

diffsynth_engine-0.3.6.dev10/diffsynth_engine/configs/pipeline.py ADDED Viewed

@@ -0,0 +1,206 @@
+import os
+import torch
+from dataclasses import dataclass, field
+from typing import List, Tuple, Optional
+from diffsynth_engine.configs.controlnet import ControlType
+@dataclass
+class BaseConfig:
+    """Fields shared by all four pipeline configs in this module (SD, SDXL, Flux, Wan)."""
+    # The first two fields have no default here; each pipeline subclass re-declares model_dtype with one.
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    model_dtype: torch.dtype
+    # init_parallel_config forces this to True when parallelism > 1 and use_cfg_parallel is set.
+    batch_cfg: bool = False
+    # VAE tiling; size and stride accept either a single int or a 2-tuple.
+    vae_tiled: bool = False
+    vae_tile_size: int | Tuple[int, int] = 256
+    vae_tile_stride: int | Tuple[int, int] = 256
+    device: str = "cuda"
+    # NOTE(review): the accepted offload_mode strings are not defined in this file -- confirm with the pipeline code.
+    offload_mode: Optional[str] = None
+@dataclass
+class AttentionConfig:
+    """Attention settings: `dit_attn_impl` plus the `sparge_*` tuning values."""
+    dit_attn_impl: str = "auto"
+    # Sparge Attention
+    # attention() in models/basic/attention.py looks these up in its **kwargs under the same names.
+    # NOTE(review): sparge_attn() there defaults to simthreshd1=0.6 and cdfthreshd=0.98, the reverse of the
+    # cdfthreshd/simthreshd1 defaults below -- confirm which pairing is intended.
+    sparge_smooth_k: bool = True
+    sparge_cdfthreshd: float = 0.6
+    sparge_simthreshd1: float = 0.98
+    sparge_pvthreshd: float = 50.0
+@dataclass
+class OptimizationConfig:
+    """Optimization switches; both boolean flags default to False."""
+    use_fp8_linear: bool = False
+    use_fbcache: bool = False
+    # NOTE(review): presumably consumed only when use_fbcache is True and handed to FluxDiTFBCache,
+    # whose relative_l1_threshold has the same 0.05 default -- confirm against the pipeline code.
+    fbcache_relative_l1_threshold: float = 0.05
+@dataclass
+class ParallelConfig:
+    """Parallel-execution settings.
+    Degrees left as None are filled in by init_parallel_config, which FluxPipelineConfig and
+    WanPipelineConfig call from __post_init__; this class defines no __post_init__ of its own,
+    so a bare ParallelConfig instance is neither validated nor normalized.
+    """
+    # init_parallel_config asserts this is 1, 2, 4 or 8.
+    parallelism: int = 1
+    # Setting use_cfg_parallel=True together with an explicit cfg_degree raises ValueError in init_parallel_config.
+    use_cfg_parallel: bool = False
+    cfg_degree: Optional[int] = None
+    # The two sequence-parallel degrees must be given together or both left as None.
+    sp_ulysses_degree: Optional[int] = None
+    sp_ring_degree: Optional[int] = None
+    # A non-None tp_degree may not be combined with the sequence-parallel degrees or with use_fsdp.
+    tp_degree: Optional[int] = None
+    use_fsdp: bool = False
+@dataclass
+class SDPipelineConfig(BaseConfig):
+    """SD pipeline config: BaseConfig plus optional CLIP/VAE paths and per-component dtypes."""
+    # Re-declared from BaseConfig; model_dtype gains a default below, so model_path is the only required field.
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    clip_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    model_dtype: torch.dtype = torch.float16
+    clip_dtype: torch.dtype = torch.float16
+    vae_dtype: torch.dtype = torch.float32
+    @classmethod
+    def basic_config(
+        cls,
+        model_path: str | os.PathLike | List[str | os.PathLike],
+        device: str = "cuda",
+        offload_mode: Optional[str] = None,
+    ) -> "SDPipelineConfig":
+        """Build a config from model_path, device and offload_mode; every other field keeps its default."""
+        return cls(
+            model_path=model_path,
+            device=device,
+            offload_mode=offload_mode,
+        )
+@dataclass
+class SDXLPipelineConfig(BaseConfig):
+    """SDXL pipeline config: BaseConfig plus optional CLIP-L/CLIP-G/VAE paths and per-component dtypes."""
+    # Re-declared from BaseConfig; model_dtype gains a default below, so model_path is the only required field.
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    clip_l_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    clip_g_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    model_dtype: torch.dtype = torch.float16
+    clip_l_dtype: torch.dtype = torch.float16
+    clip_g_dtype: torch.dtype = torch.float16
+    vae_dtype: torch.dtype = torch.float32
+    @classmethod
+    def basic_config(
+        cls,
+        model_path: str | os.PathLike | List[str | os.PathLike],
+        device: str = "cuda",
+        offload_mode: Optional[str] = None,
+    ) -> "SDXLPipelineConfig":
+        """Build a config from model_path, device and offload_mode; every other field keeps its default."""
+        return cls(
+            model_path=model_path,
+            device=device,
+            offload_mode=offload_mode,
+        )
+@dataclass
+# BaseConfig is listed last: dataclass fields are collected in reverse MRO, so its fields
+# (including the required model_path) come first in the generated __init__.
+class FluxPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, BaseConfig):
+    """Flux pipeline config combining the base, parallel, optimization and attention settings.
+    Adds optional CLIP/T5/VAE paths, per-component dtypes, load_text_encoder and control_type.
+    Parallel settings are validated and filled in by init_parallel_config on construction.
+    """
+    # Re-declared from BaseConfig; model_dtype gains a default below, so model_path is the only required field.
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    clip_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    t5_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    model_dtype: torch.dtype = torch.bfloat16
+    clip_dtype: torch.dtype = torch.bfloat16
+    t5_dtype: torch.dtype = torch.bfloat16
+    vae_dtype: torch.dtype = torch.bfloat16
+    load_text_encoder: bool = True
+    control_type: ControlType = ControlType.normal
+    @classmethod
+    def basic_config(
+        cls,
+        model_path: str | os.PathLike | List[str | os.PathLike],
+        device: str = "cuda",
+        parallelism: int = 1,
+        offload_mode: Optional[str] = None,
+    ) -> "FluxPipelineConfig":
+        """Build a config from the given arguments with use_fsdp=True; other fields keep their defaults."""
+        return cls(
+            model_path=model_path,
+            device=device,
+            parallelism=parallelism,
+            use_fsdp=True,
+            offload_mode=offload_mode,
+        )
+    def __post_init__(self):
+        # Runs after the generated __init__: validates the parallel settings and fills the None degrees in place.
+        init_parallel_config(self)
+@dataclass
+# BaseConfig is listed last: dataclass fields are collected in reverse MRO, so its fields
+# (including the required model_path) come first in the generated __init__.
+class WanPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, BaseConfig):
+    """Wan pipeline config combining the base, parallel, optimization and attention settings.
+    Adds optional T5/VAE/image-encoder paths, per-component dtypes and `shift`, and overrides the
+    BaseConfig VAE tiling defaults. Parallel settings are validated and filled in by
+    init_parallel_config on construction.
+    """
+    # Re-declared from BaseConfig; model_dtype gains a default below, so model_path is the only required field.
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    t5_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    image_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    model_dtype: torch.dtype = torch.bfloat16
+    t5_dtype: torch.dtype = torch.bfloat16
+    vae_dtype: torch.dtype = torch.bfloat16
+    image_encoder_dtype: torch.dtype = torch.bfloat16
+    # init=False: not a constructor argument; starts as None and is assigned after construction.
+    shift: Optional[float] = field(default=None, init=False)  # RecifitedFlowScheduler shift factor, set by model type
+    # override BaseConfig
+    vae_tiled: bool = True
+    vae_tile_size: Tuple[int, int] = (34, 34)
+    vae_tile_stride: Tuple[int, int] = (18, 16)
+    @classmethod
+    def basic_config(
+        cls,
+        model_path: str | os.PathLike | List[str | os.PathLike],
+        image_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None,
+        device: str = "cuda",
+        parallelism: int = 1,
+        offload_mode: Optional[str] = None,
+    ) -> "WanPipelineConfig":
+        """Build a config with use_cfg_parallel=True and use_fsdp=True; other fields keep their defaults."""
+        # NOTE(review): with the default parallelism=1, use_cfg_parallel=True makes init_parallel_config set
+        # cfg_degree=2 and sp_ulysses_degree = 1 // 2 = 0 -- confirm downstream code ignores the degrees
+        # when parallelism is 1.
+        return cls(
+            model_path=model_path,
+            image_encoder_path=image_encoder_path,
+            device=device,
+            parallelism=parallelism,
+            use_cfg_parallel=True,
+            use_fsdp=True,
+            offload_mode=offload_mode,
+        )
+    def __post_init__(self):
+        # Runs after the generated __init__: validates the parallel settings and fills the None degrees in place.
+        init_parallel_config(self)
+def init_parallel_config(config: FluxPipelineConfig | WanPipelineConfig):
+    """Validate the parallel settings of `config` and fill in unset degrees, mutating it in place.
+    Called from the __post_init__ of FluxPipelineConfig and WanPipelineConfig. On success
+    cfg_degree, sp_ulysses_degree, sp_ring_degree and tp_degree are all non-None, and batch_cfg
+    may have been forced to True. Returns None.
+    Raises ValueError if use_cfg_parallel and cfg_degree are both given, or if only one of
+    sp_ulysses_degree / sp_ring_degree is given. Raises AssertionError for the remaining checks.
+    """
+    # NOTE(review): the checks written with `assert` are skipped when Python runs with -O.
+    assert config.parallelism in (1, 2, 4, 8), "parallelism must be 1, 2, 4 or 8"
+    # CFG parallelism across more than one device forces batch_cfg on; otherwise batch_cfg is left as set.
+    config.batch_cfg = True if config.parallelism > 1 and config.use_cfg_parallel else config.batch_cfg
+    if config.use_cfg_parallel is True and config.cfg_degree is not None:
+        raise ValueError("use_cfg_parallel and cfg_degree should not be specified together")
+    # An unset cfg_degree becomes 2 with use_cfg_parallel, else 1; an explicit value is kept.
+    config.cfg_degree = (2 if config.use_cfg_parallel else 1) if config.cfg_degree is None else config.cfg_degree
+    if config.tp_degree is not None:
+        # Tensor parallel: excludes sequence parallel and FSDP; both sequence-parallel degrees become 1.
+        assert config.sp_ulysses_degree is None and config.sp_ring_degree is None, (
+            "not allowed to enable sequence parallel and tensor parallel together; "
+            "either set sp_ulysses_degree=None, sp_ring_degree=None or set tp_degree=None during pipeline initialization"
+        )
+        assert config.use_fsdp is False, (
+            "not allowed to enable fully sharded data parallel and tensor parallel together; "
+            "either set use_fsdp=False or set tp_degree=None during pipeline initialization"
+        )
+        assert config.parallelism == config.cfg_degree * config.tp_degree, (
+            f"parallelism ({config.parallelism}) must be equal to cfg_degree ({config.cfg_degree}) * tp_degree ({config.tp_degree})"
+        )
+        config.sp_ulysses_degree = 1
+        config.sp_ring_degree = 1
+    elif config.sp_ulysses_degree is None and config.sp_ring_degree is None:
+        # use ulysses if not specified
+        # NOTE(review): no check that parallelism is a multiple of cfg_degree here; parallelism=1 with
+        # cfg_degree=2 yields sp_ulysses_degree=0.
+        config.sp_ulysses_degree = config.parallelism // config.cfg_degree
+        config.sp_ring_degree = 1
+        config.tp_degree = 1
+    elif config.sp_ulysses_degree is not None and config.sp_ring_degree is not None:
+        # Both sequence-parallel degrees given explicitly: their product with cfg_degree must equal parallelism.
+        assert config.parallelism == config.cfg_degree * config.sp_ulysses_degree * config.sp_ring_degree, (
+            f"parallelism ({config.parallelism}) must be equal to cfg_degree ({config.cfg_degree}) * "
+            f"sp_ulysses_degree ({config.sp_ulysses_degree}) * sp_ring_degree ({config.sp_ring_degree})"
+        )
+        config.tp_degree = 1
+    else:
+        # Exactly one of the two sequence-parallel degrees was provided.
+        raise ValueError("sp_ulysses_degree and sp_ring_degree must be specified together")

{diffsynth_engine-0.3.6.dev8 → diffsynth_engine-0.3.6.dev10}/diffsynth_engine/models/basic/attention.py RENAMED Viewed

@@ -61,12 +61,33 @@ if SAGE_ATTN_AVAILABLE:
 if SPARGE_ATTN_AVAILABLE:
     from spas_sage_attn import spas_sage2_attn_meansim_cuda
+    from spas_sage_attn.autotune import SparseAttentionMeansim
-    def sparge_attn(q, k, v, attn_mask=None, scale=None):
+    def sparge_attn(
+        q,
+        k,
+        v,
+        attn_mask=None,
+        scale=None,
+        smooth_k=True,
+        simthreshd1=0.6,
+        cdfthreshd=0.98,
+        pvthreshd=50,
+    ):
         q = q.transpose(1, 2)
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)
-        out = spas_sage2_attn_meansim_cuda(q, k, v, attn_mask=attn_mask, scale=scale)
+        out = spas_sage2_attn_meansim_cuda(
+            q,
+            k,
+            v,
+            attn_mask=attn_mask,
+            scale=scale,
+            smooth_k=smooth_k,
+            simthreshd1=simthreshd1,
+            cdfthreshd=cdfthreshd,
+            pvthreshd=pvthreshd,
+        )
         return out.transpose(1, 2)
@@ -91,6 +112,7 @@ def attention(
     attn_impl: Optional[str] = None,
     attn_mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
+    **kwargs,
 ):
     """
     q: [B, Lq, Nq, C1]
@@ -133,7 +155,17 @@ def attention(
         elif attn_impl == "sage_attn":
             return sage_attn(q, k, v, attn_mask=attn_mask, scale=scale)
         elif attn_impl == "sparge_attn":
-            return sparge_attn(q, k, v, attn_mask=attn_mask, scale=scale)
+            return sparge_attn(
+                q,
+                k,
+                v,
+                attn_mask=attn_mask,
+                scale=scale,
+                smooth_k=kwargs.get("sparge_smooth_k", True),
+                simthreshd1=kwargs.get("sparge_simthreshd1", 0.6),
+                cdfthreshd=kwargs.get("sparge_cdfthreshd", 0.98),
+                pvthreshd=kwargs.get("sparge_pvthreshd", 50),
+            )
         else:
             raise ValueError(f"Invalid attention implementation: {attn_impl}")
@@ -189,6 +221,7 @@ def long_context_attention(
     attn_impl: Optional[str] = None,
     attn_mask: Optional[torch.Tensor] = None,
     scale: Optional[float] = None,
+    **kwargs,
 ):
     """
     q: [B, Lq, Nq, C1]
@@ -226,7 +259,13 @@ def long_context_attention(
         elif attn_impl == "sage_attn":
             attn_func = LongContextAttention(attn_type=AttnType.SAGE_FP8)
         elif attn_impl == "sparge_attn":
-            attn_func = LongContextAttention(attn_type=AttnType.SPARSE_SAGE)
+            attn_processor = SparseAttentionMeansim()
+            # default args from spas_sage2_attn_meansim_cuda
+            attn_processor.smooth_k = torch.tensor(kwargs.get("sparge_smooth_k", True))
+            attn_processor.simthreshd1 = torch.tensor(kwargs.get("sparge_simthreshd1", 0.6))
+            attn_processor.cdfthreshd = torch.tensor(kwargs.get("sparge_cdfthreshd", 0.98))
+            attn_processor.pvthreshd = torch.tensor(kwargs.get("sparge_pvthreshd", 50))
+            attn_func = LongContextAttention(attn_type=AttnType.SPARSE_SAGE, attn_processor=attn_processor)
         else:
             raise ValueError(f"Invalid long context attention implementation: {attn_impl}")
     return attn_func(q, k, v, softmax_scale=scale)

{diffsynth_engine-0.3.6.dev8 → diffsynth_engine-0.3.6.dev10}/diffsynth_engine/models/flux/flux_controlnet.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-from typing import Optional, Dict
+from typing import Any, Dict, Optional
 from einops import rearrange
 from diffsynth_engine.models.base import PreTrainedModel, StateDictConverter
 from diffsynth_engine.models.flux.flux_dit import (
@@ -87,7 +87,7 @@ class FluxControlNet(PreTrainedModel):
     def __init__(
         self,
         condition_channels: int = 64,
-        attn_impl: Optional[str] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -104,7 +104,10 @@ class FluxControlNet(PreTrainedModel):
         self.x_embedder = nn.Linear(64, 3072, device=device, dtype=dtype)
         self.controlnet_x_embedder = nn.Linear(condition_channels, 3072)
         self.blocks = nn.ModuleList(
-            [FluxDoubleTransformerBlock(3072, 24, attn_impl=attn_impl, device=device, dtype=dtype) for _ in range(6)]
+            [
+                FluxDoubleTransformerBlock(3072, 24, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
+                for _ in range(6)
+            ]
         )
         # controlnet projection
         self.blocks_proj = nn.ModuleList(
@@ -154,7 +157,7 @@ class FluxControlNet(PreTrainedModel):
         state_dict: Dict[str, torch.Tensor],
         device: str,
         dtype: torch.dtype,
-        attn_impl: Optional[str] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
         if "controlnet_x_embedder.weight" in state_dict:
             condition_channels = state_dict["controlnet_x_embedder.weight"].shape[1]
@@ -163,7 +166,7 @@ class FluxControlNet(PreTrainedModel):
         with no_init_weights():
             model = torch.nn.utils.skip_init(
-                cls, condition_channels=condition_channels, attn_impl=attn_impl, device=device, dtype=dtype
+                cls, condition_channels=condition_channels, attn_kwargs=attn_kwargs, device=device, dtype=dtype
             )
         model.load_state_dict(state_dict)
         model.to(device=device, dtype=dtype, non_blocking=True)

{diffsynth_engine-0.3.6.dev8 → diffsynth_engine-0.3.6.dev10}/diffsynth_engine/models/flux/flux_dit.py RENAMED Viewed

@@ -2,7 +2,7 @@ import json
 import torch
 import torch.nn as nn
 import numpy as np
-from typing import Dict, Optional
+from typing import Any, Dict, Optional
 from einops import rearrange
 from diffsynth_engine.models.basic.transformer_helper import (
@@ -177,7 +177,7 @@ class FluxDoubleAttention(nn.Module):
         dim_b,
         num_heads,
         head_dim,
-        attn_impl: Optional[str] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -195,7 +195,7 @@ class FluxDoubleAttention(nn.Module):
         self.a_to_out = nn.Linear(dim_a, dim_a, device=device, dtype=dtype)
         self.b_to_out = nn.Linear(dim_b, dim_b, device=device, dtype=dtype)
-        self.attn_impl = attn_impl
+        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
     def attention_callback(self, attn_out_a, attn_out_b, x_a, x_b, q_a, q_b, k_a, k_b, v_a, v_b, rope_emb, image_emb):
         return attn_out_a, attn_out_b
@@ -207,7 +207,7 @@ class FluxDoubleAttention(nn.Module):
         k = torch.cat([self.norm_k_b(k_b), self.norm_k_a(k_a)], dim=1)
         v = torch.cat([v_b, v_a], dim=1)
         q, k = apply_rope(q, k, rope_emb)
-        attn_out = attention_ops.attention(q, k, v, attn_impl=self.attn_impl)
+        attn_out = attention_ops.attention(q, k, v, **self.attn_kwargs)
         attn_out = rearrange(attn_out, "b s h d -> b s (h d)").to(q.dtype)
         text_out, image_out = attn_out[:, : text.shape[1]], attn_out[:, text.shape[1] :]
         image_out, text_out = self.attention_callback(
@@ -232,13 +232,13 @@ class FluxDoubleTransformerBlock(nn.Module):
         self,
         dim,
         num_heads,
-        attn_impl: Optional[str] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
         super().__init__()
         self.attn = FluxDoubleAttention(
-            dim, dim, num_heads, dim // num_heads, attn_impl=attn_impl, device=device, dtype=dtype
+            dim, dim, num_heads, dim // num_heads, attn_kwargs=attn_kwargs, device=device, dtype=dtype
         )
         # Image
         self.norm_msa_a = AdaLayerNormZero(dim, device=device, dtype=dtype)
@@ -278,7 +278,7 @@ class FluxSingleAttention(nn.Module):
         self,
         dim,
         num_heads,
-        attn_impl: Optional[str] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -287,7 +287,7 @@ class FluxSingleAttention(nn.Module):
         self.to_qkv = nn.Linear(dim, dim * 3, device=device, dtype=dtype)
         self.norm_q_a = RMSNorm(dim // num_heads, eps=1e-6, device=device, dtype=dtype)
         self.norm_k_a = RMSNorm(dim // num_heads, eps=1e-6, device=device, dtype=dtype)
-        self.attn_impl = attn_impl
+        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
     def attention_callback(self, attn_out, x, q, k, v, rope_emb, image_emb):
         return attn_out
@@ -295,7 +295,7 @@ class FluxSingleAttention(nn.Module):
     def forward(self, x, rope_emb, image_emb):
         q, k, v = rearrange(self.to_qkv(x), "b s (h d) -> b s h d", h=(3 * self.num_heads)).chunk(3, dim=2)
         q, k = apply_rope(self.norm_q_a(q), self.norm_k_a(k), rope_emb)
-        attn_out = attention_ops.attention(q, k, v, attn_impl=self.attn_impl)
+        attn_out = attention_ops.attention(q, k, v, **self.attn_kwargs)
         attn_out = rearrange(attn_out, "b s h d -> b s (h d)").to(q.dtype)
         return self.attention_callback(attn_out=attn_out, x=x, q=q, k=k, v=v, rope_emb=rope_emb, image_emb=image_emb)
@@ -305,14 +305,14 @@ class FluxSingleTransformerBlock(nn.Module):
         self,
         dim,
         num_heads,
-        attn_impl: Optional[str] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
         super().__init__()
         self.dim = dim
         self.norm = AdaLayerNormZero(dim, device=device, dtype=dtype)
-        self.attn = FluxSingleAttention(dim, num_heads, attn_impl=attn_impl, device=device, dtype=dtype)
+        self.attn = FluxSingleAttention(dim, num_heads, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
         self.mlp = nn.Sequential(
             nn.Linear(dim, dim * 4),
             nn.GELU(approximate="tanh"),
@@ -333,7 +333,7 @@ class FluxDiT(PreTrainedModel):
     def __init__(
         self,
         in_channel: int = 64,
-        attn_impl: Optional[str] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -351,10 +351,16 @@ class FluxDiT(PreTrainedModel):
         self.x_embedder = nn.Linear(in_channel, 3072, device=device, dtype=dtype)
         self.blocks = nn.ModuleList(
-            [FluxDoubleTransformerBlock(3072, 24, attn_impl=attn_impl, device=device, dtype=dtype) for _ in range(19)]
+            [
+                FluxDoubleTransformerBlock(3072, 24, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
+                for _ in range(19)
+            ]
         )
         self.single_blocks = nn.ModuleList(
-            [FluxSingleTransformerBlock(3072, 24, attn_impl=attn_impl, device=device, dtype=dtype) for _ in range(38)]
+            [
+                FluxSingleTransformerBlock(3072, 24, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
+                for _ in range(38)
+            ]
         )
         self.final_norm_out = AdaLayerNorm(3072, device=device, dtype=dtype)
         self.final_proj_out = nn.Linear(3072, 64, device=device, dtype=dtype)
@@ -495,7 +501,7 @@ class FluxDiT(PreTrainedModel):
         device: str,
         dtype: torch.dtype,
         in_channel: int = 64,
-        attn_impl: Optional[str] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
         with no_init_weights():
             model = torch.nn.utils.skip_init(
@@ -503,7 +509,7 @@ class FluxDiT(PreTrainedModel):
                 device=device,
                 dtype=dtype,
                 in_channel=in_channel,
-                attn_impl=attn_impl,
+                attn_kwargs=attn_kwargs,
             )
             model = model.requires_grad_(False)  # for loading gguf
         model.load_state_dict(state_dict, assign=True)

{diffsynth_engine-0.3.6.dev8 → diffsynth_engine-0.3.6.dev10}/diffsynth_engine/models/flux/flux_dit_fbcache.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import torch
 import numpy as np
-from typing import Dict, Optional
+from typing import Any, Dict, Optional
 from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.gguf import gguf_inference
@@ -21,12 +21,12 @@ class FluxDiTFBCache(FluxDiT):
     def __init__(
         self,
         in_channel: int = 64,
-        attn_impl: Optional[str] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
         relative_l1_threshold: float = 0.05,
     ):
-        super().__init__(in_channel=in_channel, attn_impl=attn_impl, device=device, dtype=dtype)
+        super().__init__(in_channel=in_channel, attn_kwargs=attn_kwargs, device=device, dtype=dtype)
         self.relative_l1_threshold = relative_l1_threshold
         self.step_count = 0
         self.num_inference_steps = 0
@@ -187,7 +187,7 @@ class FluxDiTFBCache(FluxDiT):
         device: str,
         dtype: torch.dtype,
         in_channel: int = 64,
-        attn_impl: Optional[str] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
         fb_cache_relative_l1_threshold: float = 0.05,
     ):
         with no_init_weights():
@@ -196,7 +196,7 @@ class FluxDiTFBCache(FluxDiT):
                 device=device,
                 dtype=dtype,
                 in_channel=in_channel,
-                attn_impl=attn_impl,
+                attn_kwargs=attn_kwargs,
                 fb_cache_relative_l1_threshold=fb_cache_relative_l1_threshold,
             )
             model = model.requires_grad_(False)  # for loading gguf

{diffsynth_engine-0.3.6.dev8 → diffsynth_engine-0.3.6.dev10}/diffsynth_engine/models/flux/flux_ipadapter.py RENAMED Viewed

@@ -2,7 +2,7 @@ import torch
 from einops import rearrange
 from torch import nn
 from PIL import Image
-from typing import Dict, List
+from typing import Any, Dict, List, Optional
 from functools import partial
 from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.models.text_encoder.siglip import SiglipImageEncoder
@@ -19,7 +19,7 @@ class FluxIPAdapterAttention(nn.Module):
         dim: int = 3072,
         head_num: int = 24,
         scale: float = 1.0,
-        attn_impl="auto",
+        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -29,12 +29,12 @@ class FluxIPAdapterAttention(nn.Module):
         self.to_v_ip = nn.Linear(image_emb_dim, dim, device=device, dtype=dtype, bias=False)
         self.head_num = head_num
         self.scale = scale
-        self.attn_impl = attn_impl
+        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
     def forward(self, query: torch.Tensor, image_emb: torch.Tensor):
         key = rearrange(self.norm_k(self.to_k_ip(image_emb)), "b s (h d) -> b s h d", h=self.head_num)
         value = rearrange(self.to_v_ip(image_emb), "b s (h d) -> b s h d", h=self.head_num)
-        attn_out = attention(query, key, value)
+        attn_out = attention(query, key, value, **self.attn_kwargs)
         return self.scale * rearrange(attn_out, "b s h d -> b s (h d)")
     @classmethod
@@ -142,7 +142,7 @@ class FluxIPAdapter(PreTrainedModel):
                 single_attention_callback, self=dit.single_blocks[i].attn
             )
-    def image_encode(self, image: Image.Image) -> torch.Tensor:
+    def encode_image(self, image: Image.Image) -> torch.Tensor:
         image_emb = self.image_encoder(image)
         return self.image_proj(image_emb)

{diffsynth_engine-0.3.6.dev8 → diffsynth_engine-0.3.6.dev10}/diffsynth_engine/models/sd/sd_controlnet.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-from typing import Dict, Optional
+from typing import Dict
 from diffsynth_engine.models.base import PreTrainedModel, StateDictConverter
 from diffsynth_engine.models.basic.timestep import TimestepEmbeddings
@@ -570,7 +570,6 @@ class SDControlNet(PreTrainedModel):
     def __init__(
         self,
-        attn_impl: Optional[str] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -666,10 +665,9 @@ class SDControlNet(PreTrainedModel):
         state_dict: Dict[str, torch.Tensor],
         device: str,
         dtype: torch.dtype,
-        attn_impl: Optional[str] = None,
     ):
         with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, attn_impl=attn_impl, device=device, dtype=dtype)
+            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
         model.load_state_dict(state_dict)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model

diffsynth-engine 0.3.6.dev8__tar.gz → 0.3.6.dev10__tar.gz

diffsynth-engine 0.3.6.dev8__tar.gz → 0.3.6.dev10__tar.gz