diffsynth-engine 0.7.1.dev1__tar.gz → 0.7.1.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/PKG-INFO +1 -1
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +4 -1
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/z_image/qwen3.py +4 -1
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/z_image/z_image_dit.py +2 -1
- diffsynth_engine-0.7.1.dev3/diffsynth_engine/tools/qwen_image_upscaler_tool.py +340 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/image.py +84 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine.egg-info/PKG-INFO +1 -1
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine.egg-info/SOURCES.txt +1 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/.gitattributes +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/.gitignore +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/.pre-commit-config.yaml +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/LICENSE +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/MANIFEST.in +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/README.md +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/assets/dingtalk.png +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/assets/showcase.jpeg +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/assets/tongyi.svg +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/flow_match/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_beta.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_ddim.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/beta.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/ddim.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/exponential.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/karras.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/linear.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/sgm_uniform.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/flow_match/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/stable_diffusion/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/stable_diffusion/brownian_tree.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/stable_diffusion/ddpm.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/stable_diffusion/deis.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m_sde.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_3m_sde.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/stable_diffusion/epsilon.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/stable_diffusion/euler.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/sampler/stable_diffusion/euler_ancestral.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/components/vae.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/flux/flux_dit.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/flux/flux_text_encoder.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/flux/flux_vae.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/flux2/qwen3_8B_config.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/qwen_image/qwen2_5_vl_config.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/qwen_image/qwen2_5_vl_vision_config.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/qwen_image/qwen_image_vae.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/qwen_image/qwen_image_vae_keymap.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/sd/sd_text_encoder.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/sd/sd_unet.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/sd3/sd3_dit.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/sd3/sd3_text_encoder.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/sdxl/sdxl_text_encoder.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/sdxl/sdxl_unet.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/dit/wan2.1_flf2v_14b.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/dit/wan2.1_i2v_14b.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/dit/wan2.1_t2v_1.3b.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/dit/wan2.1_t2v_14b.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/dit/wan2.2_i2v_a14b.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/dit/wan2.2_s2v_14b.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/dit/wan2.2_t2v_a14b.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/dit/wan2.2_ti2v_5b.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/vae/wan2.1_vae.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/vae/wan2.2_vae.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/wan/vae/wan_vae_keymap.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/models/z_image/qwen3_config.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/flux/tokenizer_1/vocab.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/spiece.model +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/qwen_image/qwen2_vl_image_processor.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/added_tokens.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/merges.txt +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/special_tokens_map.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/tokenizer.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/tokenizer_config.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/qwen_image/tokenizer/vocab.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/merges.txt +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/special_tokens_map.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/tokenizer_config.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer/vocab.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/merges.txt +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/special_tokens_map.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/tokenizer_config.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/vocab.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/special_tokens_map.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/spiece.model +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/z_image/tokenizer/merges.txt +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/z_image/tokenizer/tokenizer_config.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/conf/tokenizers/z_image/tokenizer/vocab.json +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/configs/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/configs/controlnet.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/configs/pipeline.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/kernels/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/base.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/basic/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/basic/attention.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/basic/lora.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/basic/lora_nunchaku.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/basic/relative_position_emb.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/basic/timestep.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/basic/transformer_helper.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/basic/unet_helper.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/basic/video_sparse_attention.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/flux/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/flux/flux_controlnet.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/flux/flux_dit.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/flux/flux_dit_fbcache.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/flux/flux_ipadapter.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/flux/flux_redux.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/flux/flux_text_encoder.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/flux/flux_vae.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/flux2/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/flux2/flux2_dit.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/flux2/flux2_vae.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/hunyuan3d/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/hunyuan3d/dino_image_encoder.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/hunyuan3d/hunyuan3d_dit.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/hunyuan3d/hunyuan3d_vae.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/hunyuan3d/moe.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/hunyuan3d/surface_extractor.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/hunyuan3d/volume_decoder.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/qwen_image/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/qwen_image/qwen2_5_vl.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/qwen_image/qwen_image_dit.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/qwen_image/qwen_image_dit_nunchaku.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/qwen_image/qwen_image_vae.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sd/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sd/sd_controlnet.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sd/sd_text_encoder.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sd/sd_unet.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sd/sd_vae.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sd3/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sd3/sd3_dit.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sd3/sd3_text_encoder.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sd3/sd3_vae.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sdxl/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sdxl/sdxl_controlnet.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sdxl/sdxl_text_encoder.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sdxl/sdxl_unet.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/sdxl/sdxl_vae.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/text_encoder/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/text_encoder/clip.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/text_encoder/siglip.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/text_encoder/t5.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/vae/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/vae/vae.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/wan/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/wan/wan_audio_encoder.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/wan/wan_dit.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/wan/wan_image_encoder.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/wan/wan_s2v_dit.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/wan/wan_text_encoder.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/wan/wan_vae.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/z_image/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/z_image/siglip.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/z_image/z_image_dit_omni_base.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/base.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/flux2_klein_image.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/flux_image.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/hunyuan3d_shape.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/qwen_image.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/sd_image.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/sdxl_image.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/utils.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/wan_dmd.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/wan_s2v.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/wan_video.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/z_image.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/pipelines/z_image_omni_base.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/processor/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/processor/canny_processor.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/processor/depth_processor.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tokenizers/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tokenizers/base.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tokenizers/clip.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tokenizers/qwen2.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tokenizers/qwen2_vl_image_processor.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tokenizers/qwen2_vl_processor.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tokenizers/t5.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tokenizers/wan.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tools/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tools/flux_inpainting_tool.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tools/flux_outpainting_tool.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tools/flux_reference_tool.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/tools/flux_replace_tool.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/cache.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/constants.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/download.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/env.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/flag.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/fp8_linear.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/gguf.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/loader.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/lock.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/logging.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/memory/__init__.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/memory/linear_regression.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/memory/memory_predcit_model.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/offload.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/onnx.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/parallel.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/platform.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/process_group.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/prompt.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/video.py +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine.egg-info/dependency_links.txt +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine.egg-info/requires.txt +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine.egg-info/top_level.txt +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/docs/tutorial.md +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/docs/tutorial_zh.md +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/pyproject.toml +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/setup.cfg +0 -0
- {diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/setup.py +0 -0
{diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py
RENAMED
@@ -19,7 +19,10 @@ class BaseScheduler:
     def update_config(self, config_dict):
         for config_name, new_value in config_dict.items():
             if hasattr(self, config_name):
-                setattr(self, config_name, new_value)
+                actual_value = new_value
+                if isinstance(actual_value, str) and actual_value.lower() == "none":
+                    actual_value = None
+                setattr(self, config_name, actual_value)

     def restore_config(self):
         for config_name, config_value in self._stored_config.items():
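The practical effect of this change: scheduler config values that arrive as the literal string "none" or "None" (for example from a JSON config or a form field) are now coerced to a real Python None before being stored. A minimal standalone sketch of the new behavior (DummyScheduler and its shift attribute are stand-ins, not the library's BaseScheduler):

class DummyScheduler:
    def __init__(self):
        self.shift = 3.0  # hypothetical config attribute

    def update_config(self, config_dict):
        for config_name, new_value in config_dict.items():
            if hasattr(self, config_name):
                actual_value = new_value
                # the string "none"/"None" now means "unset this value"
                if isinstance(actual_value, str) and actual_value.lower() == "none":
                    actual_value = None
                setattr(self, config_name, actual_value)

s = DummyScheduler()
s.update_config({"shift": "None"})
assert s.shift is None  # previously the string "None" itself would have been stored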
{diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/z_image/qwen3.py
RENAMED
@@ -59,8 +59,11 @@ class Qwen3Model(PreTrainedModel):
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
-        model = cls(config=config, device="meta", dtype=dtype)
+        with torch.device("meta"):
+            model = cls(config=config, device="meta", dtype=dtype)
         model.requires_grad_(False)
+
+        model.rotary_emb = Qwen3RotaryEmbedding(config=config, device=device)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model

{diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/models/z_image/z_image_dit.py
RENAMED
@@ -584,7 +584,8 @@ class ZImageDiT(PreTrainedModel):
         dtype: torch.dtype,
         **kwargs,
     ):
-        model = cls(device="meta", dtype=dtype, **kwargs)
+        with torch.device("meta"):
+            model = cls(device="meta", dtype=dtype, **kwargs)
         model = model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
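Both hunks apply the same pattern: the model is constructed under torch.device("meta") so parameter creation allocates no real memory, and the checkpoint tensors are then attached with load_state_dict(..., assign=True) before moving to the target device. The Qwen3 change additionally rebuilds rotary_emb on the real device, presumably because its buffers are computed at construction time rather than loaded from the state dict. A minimal sketch of the pattern with a plain nn.Linear (not the library models):

import torch
import torch.nn as nn

state_dict = {"weight": torch.randn(4, 8), "bias": torch.zeros(4)}

# Parameters created under the meta device have shape and dtype but no storage.
with torch.device("meta"):
    layer = nn.Linear(8, 4)

# assign=True swaps the meta parameters for the loaded tensors instead of
# copying into storage that does not exist.
layer.load_state_dict(state_dict, assign=True)
layer.to(device="cpu")

print(layer(torch.randn(2, 8)).shape)  # torch.Size([2, 4])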
diffsynth_engine-0.7.1.dev3/diffsynth_engine/tools/qwen_image_upscaler_tool.py
@@ -0,0 +1,340 @@
+import torch
+import torch.nn as nn
+import math
+import numpy as np
+from typing import Literal, Optional, Dict
+from copy import deepcopy
+from PIL import Image
+from einops import rearrange, repeat
+from contextlib import contextmanager
+
+from diffsynth_engine.configs import QwenImagePipelineConfig
+from diffsynth_engine.pipelines.qwen_image import QwenImagePipeline
+from diffsynth_engine.models.qwen_image import QwenImageVAE
+from diffsynth_engine.models.basic.lora import LoRALinear
+from diffsynth_engine.models.qwen_image.qwen_image_dit import QwenImageTransformerBlock
+from diffsynth_engine.utils import logging
+from diffsynth_engine.utils.loader import load_file
+from diffsynth_engine.utils.download import fetch_model
+from diffsynth_engine.utils.image import adain_color_fix, wavelet_color_fix
+
+logger = logging.get_logger(__name__)
+
+
+@contextmanager
+def odtsr_forward():
+    """
+    Context manager for ODTSR forward pass optimization.
+
+    Replaces two methods:
+    1. LoRALinear.forward - to support batch CFG with dual outputs
+    2. QwenImageTransformerBlock._modulate - optimized version without repeat_interleave
+    """
+    original_lora_forward = LoRALinear.forward
+    original_modulate = QwenImageTransformerBlock._modulate
+
+    def lora_batch_cfg_forward(self, x):
+        y = nn.Linear.forward(self, x)
+        if len(self._lora_dict) < 1:
+            return y
+        if x.ndim == 2:
+            y2 = y.clone()
+            for name, lora in self._lora_dict.items():
+                y2 += lora(x)
+            return torch.stack([y, y2], dim=1)
+        else:
+            L2 = x.shape[1]
+            L = L2 // 2
+            x2 = x[:, L:, :]
+            for name, lora in self._lora_dict.items():
+                y[:, L:] += lora(x2)
+            return y
+
+    def optimized_modulate(self, x, mod_params, index=None):
+        if mod_params.ndim == 2:
+            shift, scale, gate = mod_params.chunk(3, dim=-1)
+            return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1)
+        else:
+            B, L2, C = x.shape
+            L = L2 // 2
+            shift, scale, gate = mod_params.chunk(3, dim=-1)  # Each: [B, 2, dim]
+
+            result = torch.empty_like(x)
+            gate_result = torch.empty(B, L2, gate.shape[-1], dtype=x.dtype, device=x.device)
+
+            result[:, :L] = x[:, :L] * (1 + scale[:, 0:1]) + shift[:, 0:1]
+            gate_result[:, :L] = gate[:, 0:1].expand(-1, L, -1)
+
+            result[:, L:] = x[:, L:] * (1 + scale[:, 1:2]) + shift[:, 1:2]
+            gate_result[:, L:] = gate[:, 1:2].expand(-1, L, -1)
+
+            return result, gate_result
+
+    LoRALinear.forward = lora_batch_cfg_forward
+    QwenImageTransformerBlock._modulate = optimized_modulate
+
+    try:
+        yield
+    finally:
+        LoRALinear.forward = original_lora_forward
+        QwenImageTransformerBlock._modulate = original_modulate
+
+
+class QwenImageUpscalerTool:
+    """
+    Tool for ODTSR (One-step Diffusion Transformer Super Resolution) image upscaling.
+    https://huggingface.co/double8fun/ODTSR
+    """
+
+    def __init__(
+        self,
+        pipeline: QwenImagePipeline,
+        odtsr_weight_path: Optional[str] = None,
+    ):
+        self.pipe = pipeline
+        self.device = self.pipe.device
+        self.dtype = self.pipe.dtype
+
+        # to avoid "small grid" artifacts in generated images
+        self._convert_dit_part_linear_weight()
+
+        if not odtsr_weight_path:
+            odtsr_weight_path = fetch_model("muse/ODTSR", revision="master", path="weight.safetensors")
+        odtsr_state_dict = load_file(odtsr_weight_path)
+        lora_state_dict = self._convert_odtsr_lora(odtsr_state_dict)
+        lora_state_dict_list = [(lora_state_dict, 1.0, odtsr_weight_path)]
+        self.pipe._load_lora_state_dicts(lora_state_dict_list, fused=False, save_original_weight=False)
+
+        self.new_vae = deepcopy(self.pipe.vae)
+        self._load_vae_encoder_weights(odtsr_state_dict)
+
+        sigmas = torch.linspace(1.0, 0.0, 1000 + 1)[:-1]
+        mu = 0.8
+        shift_terminal = 0.02
+        sigmas = math.exp(mu) / (math.exp(mu) + (1 / sigmas - 1))
+        one_minus_sigmas = 1 - sigmas
+        scale_factor = one_minus_sigmas[-1] / (1 - shift_terminal)
+        self.sigmas = 1 - (one_minus_sigmas / scale_factor)
+        self.sigmas = self.sigmas.to(device=self.device)
+        self.timesteps = self.sigmas * self.pipe.noise_scheduler.num_train_timesteps
+        self.timesteps = self.timesteps.to(device=self.device)
+        self.start_timestep = 750
+        self.fixed_timestep = self.timesteps[self.start_timestep].to(device=self.device)
+        self.one_step_sigma = self.sigmas[self.start_timestep].to(device=self.device)
+
+        self.prompt = "High Contrast, hyper detailed photo, 2k UHD"
+        self.prompt_emb, self.prompt_emb_mask = self.pipe.encode_prompt(self.prompt, 1, 4096)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        qwen_model_path: str,
+        odtsr_weight_path: Optional[str] = None,
+        device: str = "cuda",
+        dtype: torch.dtype = torch.bfloat16,
+    ):
+        config = QwenImagePipelineConfig(
+            model_path=qwen_model_path,
+            model_dtype=dtype,
+            device=device,
+            load_encoder=True,
+        )
+        pipe = QwenImagePipeline.from_pretrained(config)
+        return cls(pipe, odtsr_weight_path)
+
+    def _convert_dit_part_linear_weight(self):
+        """
+        Perform dtype conversion on weights of specific Linear layers in the DIT model.
+
+        This is an important trick: for Linear layers NOT in the patterns list, convert their weights
+        to float8_e4m3fn first, then convert back to the original dtype (typically bfloat16). This operation
+        matches the weight processing method used during training to avoid "small grid" artifacts in generated images.
+
+        Layers in the patterns list (such as LoRA-related layers) are skipped and their original weights remain unchanged.
+        """
+        patterns = [
+            "img_in",
+            "img_mod.1",
+            "attn.to_q",
+            "attn.to_k",
+            "attn.to_v",
+            "to_out",
+            "img_mlp.net.0.proj",
+            "img_mlp.net.2",
+        ]
+
+        def _convert_weight(parent: nn.Module, name_prefix: str = ""):
+            for name, module in list(parent.named_children()):
+                full_name = f"{name_prefix}{name}"
+                if isinstance(module, torch.nn.Linear):
+                    if not any(p in full_name for p in patterns):
+                        origin_dtype = module.weight.data.dtype
+                        module.weight.data = module.weight.data.to(torch.float8_e4m3fn)
+                        module.weight.data = module.weight.data.to(origin_dtype)
+                        if module.bias is not None:
+                            module.bias.data = module.bias.data.to(torch.float8_e4m3fn)
+                            module.bias.data = module.bias.data.to(origin_dtype)
+                else:
+                    _convert_weight(module, name_prefix=full_name + ".")
+
+        _convert_weight(self.pipe.dit)
+
+    def _convert_odtsr_lora(self, odtsr_state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        state_dict = {}
+        for key, param in odtsr_state_dict.items():
+            if "lora_A2" in key:
+                lora_b_key = key.replace("lora_A2", "lora_B2")
+                lora_b_param = odtsr_state_dict[lora_b_key]
+
+                lora_a_key = key.replace("lora_A2", "lora_A").replace("pipe.dit.", "")
+                lora_b_key = lora_b_key.replace("lora_B2", "lora_B").replace("pipe.dit.", "")
+                state_dict[lora_a_key] = param
+                state_dict[lora_b_key] = lora_b_param
+
+        return state_dict
+
+    def _load_vae_encoder_weights(self, state_dict: Dict[str, torch.Tensor]):
+        try:
+            vae_state_dict = {}
+            for k, v in state_dict.items():
+                if 'pipe.new_vae.' in k:
+                    new_key = k.replace('pipe.new_vae.', '')
+                    vae_state_dict[new_key] = v
+            if vae_state_dict:
+                self.new_vae.load_state_dict(vae_state_dict, strict=False)
+                logger.info(f"Loaded {len(vae_state_dict)} trained VAE encoder parameters")
+            else:
+                logger.warning(f"No 'pipe.new_vae.' weights found, using original VAE")
+        except Exception as e:
+            logger.error(f"Failed to load VAE encoder weights: {e}")
+            raise e
+
+
+    def add_noise(self, sample: torch.Tensor, noise: torch.Tensor, timestep: torch.Tensor) -> torch.Tensor:
+        timestep_id = torch.argmin((self.timesteps - timestep).abs())
+        sigma = self.sigmas[timestep_id]
+        sample = (1 - sigma) * sample + sigma * noise
+        return sample
+
+    def preprocess_image(self, image: Image.Image) -> torch.Tensor:
+        image = torch.Tensor(np.array(image, dtype=np.float32))
+        image = image.to(dtype=self.dtype, device=self.device)
+        image = image * (2 / 255) - 1
+        image = repeat(image, f"H W C -> B C H W", **({"B": 1}))
+        return image
+
+    def _prepare_condition_latents(self, image: Image.Image, vae: QwenImageVAE, vae_tiled: bool) -> torch.Tensor:
+        image_tensor = self.preprocess_image(image).to(dtype=self.pipe.config.vae_dtype)
+        image_tensor = image_tensor.unsqueeze(2)
+
+        latents = vae.encode(
+            image_tensor,
+            device=self.device,
+            tiled=vae_tiled,
+            tile_size=self.pipe.vae_tile_size,
+            tile_stride=self.pipe.vae_tile_stride,
+        )
+        latents = latents.squeeze(2).to(device=self.device, dtype=self.dtype)
+        return latents
+
+    def _single_step_denoise(
+        self,
+        latents: torch.Tensor,
+        image_latents: torch.Tensor,
+        noise: torch.Tensor,
+        prompt_emb: torch.Tensor,
+        prompt_emb_mask: torch.Tensor,
+        fidelity: float,
+    ) -> torch.Tensor:
+        fidelity_timestep_id = int(self.start_timestep + fidelity * (1000 - self.start_timestep) + 0.5)
+        if fidelity_timestep_id != 1000:
+            fidelity_timestep = self.timesteps[fidelity_timestep_id].to(device=self.device)
+            image_latents = self.add_noise(image_latents, noise, fidelity_timestep)
+
+        latents = self.add_noise(latents, noise, self.fixed_timestep)
+
+        with odtsr_forward():
+            noise_pred = self.pipe.predict_noise_with_cfg(
+                latents=latents,
+                image_latents=[image_latents],
+                timestep=self.fixed_timestep.unsqueeze(0),
+                prompt_emb=prompt_emb,
+                prompt_emb_mask=prompt_emb_mask,
+                negative_prompt_emb=None,
+                negative_prompt_emb_mask=None,
+                context_latents=None,
+                entity_prompt_embs=None,
+                entity_prompt_emb_masks=None,
+                negative_entity_prompt_embs=None,
+                negative_entity_prompt_emb_masks=None,
+                entity_masks=None,
+                cfg_scale=1.0,
+                batch_cfg=self.pipe.config.batch_cfg,
+            )
+
+        denoised = latents + (0 - self.one_step_sigma) * noise_pred
+        return denoised
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Image.Image,
+        scale: int = 2,
+        prompt: str = "High Contrast, hyper detailed photo, 2k UHD",
+        fidelity: float = 1.0,
+        align_method: Literal["none", "adain", "wavelet"] = "none",
+    ) -> Image.Image:
+        width, height = image.size
+        target_width, target_height = width * scale, height * scale
+        target_width_round = target_width // 16 * 16
+        target_height_round = target_height // 16 * 16
+        logger.info(f"Upscaling image from {width}x{height} to {target_width}x{target_height}")
+        vae_tiled = (target_width_round * target_height_round > 2048 * 2048)
+
+        resized_image = image.resize((target_width_round, target_height_round), Image.BICUBIC)
+
+        condition_latents = self._prepare_condition_latents(resized_image, self.pipe.vae, vae_tiled)
+        latents = self._prepare_condition_latents(resized_image, self.new_vae, vae_tiled)
+
+        noise = self.pipe.generate_noise(
+            (1, 16, target_height_round // 8, target_width_round // 8),
+            seed=42,
+            device=self.device,
+            dtype=self.dtype
+        )
+
+        prompt_emb, prompt_emb_mask = self.prompt_emb, self.prompt_emb_mask
+        if prompt != self.prompt:
+            prompt_emb, prompt_emb_mask = self.pipe.encode_prompt(prompt, 1, 4096)
+
+        denoised_latents = self._single_step_denoise(
+            latents=latents,
+            noise=noise,
+            image_latents=condition_latents,
+            prompt_emb=prompt_emb,
+            prompt_emb_mask=prompt_emb_mask,
+            fidelity=fidelity,
+        )
+
+        # Decode
+        denoised_latents = rearrange(denoised_latents, "B C H W -> B C 1 H W")
+        vae_output = rearrange(
+            self.pipe.vae.decode(
+                denoised_latents.to(self.pipe.vae.model.encoder.conv1.weight.dtype),
+                device=self.pipe.vae.model.encoder.conv1.weight.device,
+                tiled=vae_tiled,
+                tile_size=self.pipe.vae_tile_size,
+                tile_stride=self.pipe.vae_tile_stride,
+            )[0],
+            "C B H W -> B C H W",
+        )
+        result_image = self.pipe.vae_output_to_image(vae_output)
+        self.pipe.model_lifecycle_finish(["vae"])

+        if align_method == "adain":
+            result_image = adain_color_fix(target=result_image, source=resized_image)
+        elif align_method == "wavelet":
+            result_image = wavelet_color_fix(target=result_image, source=resized_image)
+
+        result_image = result_image.resize((target_width, target_height), Image.BICUBIC)
+        return result_image
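Taken together, the new tool wraps a QwenImagePipeline: it loads the ODTSR LoRA, swaps in a fine-tuned VAE encoder, and performs a single denoising step at a fixed timestep. A hedged usage sketch; the file paths below are placeholders, not values from this diff, and only the from_pretrained/__call__ entry points shown above are used:

from PIL import Image
from diffsynth_engine.tools.qwen_image_upscaler_tool import QwenImageUpscalerTool

# Placeholder checkpoint path; the ODTSR weight itself is fetched automatically
# when odtsr_weight_path is left as None.
upscaler = QwenImageUpscalerTool.from_pretrained(
    qwen_model_path="/path/to/qwen_image_checkpoint.safetensors",
    device="cuda",
)

image = Image.open("input.png").convert("RGB")  # placeholder input file
upscaled = upscaler(
    image,
    scale=2,                 # output is 2x the input resolution
    fidelity=1.0,            # 1.0 keeps the conditioning latents noise-free
    align_method="wavelet",  # optional color alignment against the bicubic-resized input
)
upscaled.save("output_2x.png")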
{diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine/utils/image.py
RENAMED
@@ -1,10 +1,13 @@
 import torch
 from torchvision import transforms
+from torchvision.transforms import ToTensor, ToPILImage
 import numpy as np
 import math
 from PIL import Image
 from enum import Enum
 from typing import List, Tuple, Optional
+from torch import Tensor
+from torch.nn import functional as F

 from diffsynth_engine.utils import logging

@@ -243,3 +246,84 @@ def _need_rescale_pil_conversion(image: np.ndarray) -> bool:
             f"got [{image.min()}, {image.max()}] which cannot be converted to uint8."
         )
     return do_rescale
+
+
+# --------------------------------------------------------------------------------
+# Color Alignment Functions
+# Based on Li Yi's implementation: https://github.com/pkuliyi2015/sd-webui-stablesr
+# --------------------------------------------------------------------------------
+def calc_mean_std(feat: Tensor, eps=1e-5):
+    size = feat.size()
+    assert len(size) == 4, 'The input feature should be 4D tensor.'
+    b, c = size[:2]
+    feat_var = feat.reshape(b, c, -1).var(dim=2) + eps
+    feat_std = feat_var.sqrt().reshape(b, c, 1, 1)
+    feat_mean = feat.reshape(b, c, -1).mean(dim=2).reshape(b, c, 1, 1)
+    return feat_mean, feat_std
+
+
+def adaptive_instance_normalization(content_feat: Tensor, style_feat: Tensor):
+    size = content_feat.size()
+    style_mean, style_std = calc_mean_std(style_feat)
+    content_mean, content_std = calc_mean_std(content_feat)
+    normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size)
+    return normalized_feat * style_std.expand(size) + style_mean.expand(size)
+
+
+def wavelet_blur(image: Tensor, radius: int):
+    kernel_vals = [
+        [0.0625, 0.125, 0.0625],
+        [0.125, 0.25, 0.125],
+        [0.0625, 0.125, 0.0625],
+    ]
+    kernel = torch.tensor(kernel_vals, dtype=image.dtype, device=image.device)
+    kernel = kernel[None, None]
+    kernel = kernel.repeat(3, 1, 1, 1)
+    image = F.pad(image, (radius, radius, radius, radius), mode='replicate')
+    output = F.conv2d(image, kernel, groups=3, dilation=radius)
+    return output
+
+
+def wavelet_decomposition(image: Tensor, levels=5):
+    high_freq = torch.zeros_like(image)
+    for i in range(levels):
+        radius = 2 ** i
+        low_freq = wavelet_blur(image, radius)
+        high_freq += (image - low_freq)
+        image = low_freq
+
+    return high_freq, low_freq
+
+
+def wavelet_reconstruction(content_feat: Tensor, style_feat: Tensor):
+    content_high_freq, content_low_freq = wavelet_decomposition(content_feat)
+    del content_low_freq
+    style_high_freq, style_low_freq = wavelet_decomposition(style_feat)
+    del style_high_freq
+    return content_high_freq + style_low_freq
+
+
+def adain_color_fix(target: Image.Image, source: Image.Image) -> Image.Image:
+    to_tensor = ToTensor()
+    target_tensor = to_tensor(target).unsqueeze(0)
+    source_tensor = to_tensor(source).unsqueeze(0)
+
+    result_tensor = adaptive_instance_normalization(target_tensor, source_tensor)
+
+    to_image = ToPILImage()
+    result_image = to_image(result_tensor.squeeze(0).clamp_(0.0, 1.0))
+
+    return result_image
+
+
+def wavelet_color_fix(target: Image.Image, source: Image.Image) -> Image.Image:
+    to_tensor = ToTensor()
+    target_tensor = to_tensor(target).unsqueeze(0)
+    source_tensor = to_tensor(source).unsqueeze(0)
+
+    result_tensor = wavelet_reconstruction(target_tensor, source_tensor)
+
+    to_image = ToPILImage()
+    result_image = to_image(result_tensor.squeeze(0).clamp_(0.0, 1.0))
+
+    return result_image
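The two new color-fix helpers transfer color statistics from a source image onto a target image: adain_color_fix matches per-channel mean and standard deviation, while wavelet_color_fix keeps the target's high-frequency detail and takes the low-frequency (color) component from the source. A short usage sketch with placeholder file names:

from PIL import Image
from diffsynth_engine.utils.image import adain_color_fix, wavelet_color_fix

target = Image.open("generated.png").convert("RGB")  # image whose colors drifted
source = Image.open("reference.png").convert("RGB")  # color reference (same size for the wavelet variant)

fixed = wavelet_color_fix(target=target, source=source)
# or: fixed = adain_color_fix(target=target, source=source)
fixed.save("color_fixed.png")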
{diffsynth_engine-0.7.1.dev1 → diffsynth_engine-0.7.1.dev3}/diffsynth_engine.egg-info/SOURCES.txt
RENAMED
@@ -202,6 +202,7 @@ diffsynth_engine/tools/flux_inpainting_tool.py
 diffsynth_engine/tools/flux_outpainting_tool.py
 diffsynth_engine/tools/flux_reference_tool.py
 diffsynth_engine/tools/flux_replace_tool.py
+diffsynth_engine/tools/qwen_image_upscaler_tool.py
 diffsynth_engine/utils/__init__.py
 diffsynth_engine/utils/cache.py
 diffsynth_engine/utils/constants.py