diffsynth-engine 0.5.1.dev4__py3-none-any.whl → 0.6.1.dev25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffsynth_engine/__init__.py +12 -0
- diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +19 -0
- diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +22 -6
- diffsynth_engine/conf/models/flux/flux_dit.json +20 -1
- diffsynth_engine/conf/models/flux/flux_vae.json +253 -5
- diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json +41 -0
- diffsynth_engine/configs/__init__.py +16 -1
- diffsynth_engine/configs/controlnet.py +13 -0
- diffsynth_engine/configs/pipeline.py +37 -11
- diffsynth_engine/models/base.py +1 -1
- diffsynth_engine/models/basic/attention.py +105 -43
- diffsynth_engine/models/basic/transformer_helper.py +36 -2
- diffsynth_engine/models/basic/video_sparse_attention.py +238 -0
- diffsynth_engine/models/flux/flux_controlnet.py +16 -30
- diffsynth_engine/models/flux/flux_dit.py +49 -62
- diffsynth_engine/models/flux/flux_dit_fbcache.py +26 -28
- diffsynth_engine/models/flux/flux_ipadapter.py +5 -5
- diffsynth_engine/models/flux/flux_text_encoder.py +1 -1
- diffsynth_engine/models/flux/flux_vae.py +20 -2
- diffsynth_engine/models/hunyuan3d/dino_image_encoder.py +4 -2
- diffsynth_engine/models/qwen_image/qwen2_5_vl.py +5 -0
- diffsynth_engine/models/qwen_image/qwen_image_dit.py +151 -58
- diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +14 -6
- diffsynth_engine/models/qwen_image/qwen_image_vae.py +1 -1
- diffsynth_engine/models/sd/sd_text_encoder.py +1 -1
- diffsynth_engine/models/sd/sd_unet.py +1 -1
- diffsynth_engine/models/sd3/sd3_dit.py +1 -1
- diffsynth_engine/models/sd3/sd3_text_encoder.py +1 -1
- diffsynth_engine/models/sdxl/sdxl_text_encoder.py +1 -1
- diffsynth_engine/models/sdxl/sdxl_unet.py +1 -1
- diffsynth_engine/models/vae/vae.py +1 -1
- diffsynth_engine/models/wan/wan_audio_encoder.py +6 -3
- diffsynth_engine/models/wan/wan_dit.py +65 -28
- diffsynth_engine/models/wan/wan_s2v_dit.py +1 -1
- diffsynth_engine/models/wan/wan_text_encoder.py +13 -13
- diffsynth_engine/models/wan/wan_vae.py +2 -2
- diffsynth_engine/pipelines/base.py +73 -7
- diffsynth_engine/pipelines/flux_image.py +139 -120
- diffsynth_engine/pipelines/hunyuan3d_shape.py +4 -0
- diffsynth_engine/pipelines/qwen_image.py +272 -87
- diffsynth_engine/pipelines/sdxl_image.py +1 -1
- diffsynth_engine/pipelines/utils.py +52 -0
- diffsynth_engine/pipelines/wan_s2v.py +25 -14
- diffsynth_engine/pipelines/wan_video.py +43 -19
- diffsynth_engine/tokenizers/base.py +6 -0
- diffsynth_engine/tokenizers/qwen2.py +12 -4
- diffsynth_engine/utils/constants.py +13 -12
- diffsynth_engine/utils/download.py +4 -2
- diffsynth_engine/utils/env.py +2 -0
- diffsynth_engine/utils/flag.py +6 -0
- diffsynth_engine/utils/loader.py +25 -6
- diffsynth_engine/utils/parallel.py +62 -29
- diffsynth_engine/utils/video.py +3 -1
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/METADATA +1 -1
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/RECORD +69 -67
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-flf2v-14b.json → wan2.1_flf2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-i2v-14b.json → wan2.1_i2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-1.3b.json → wan2.1_t2v_1.3b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-14b.json → wan2.1_t2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-i2v-a14b.json → wan2.2_i2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-s2v-14b.json → wan2.2_s2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-t2v-a14b.json → wan2.2_t2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-ti2v-5b.json → wan2.2_ti2v_5b.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.1-vae.json → wan2.1_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.2-vae.json → wan2.2_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan-vae-keymap.json → wan_vae_keymap.json} +0 -0
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/WHEEL +0 -0
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/licenses/LICENSE +0 -0
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/top_level.txt +0 -0
diffsynth_engine/pipelines/qwen_image.py

@@ -2,12 +2,17 @@ import json
 import torch
 import torch.distributed as dist
 import math
-from typing import Callable, List, Tuple, Optional, Union
+from typing import Callable, List, Dict, Tuple, Optional, Union
 from tqdm import tqdm
 from einops import rearrange
 from PIL import Image

-from diffsynth_engine.configs import
+from diffsynth_engine.configs import (
+    QwenImagePipelineConfig,
+    QwenImageStateDicts,
+    QwenImageControlNetParams,
+    QwenImageControlType,
+)
 from diffsynth_engine.models.basic.lora import LoRAContext
 from diffsynth_engine.models.qwen_image import (
     QwenImageDiT,

@@ -19,7 +24,7 @@ from diffsynth_engine.models.qwen_image import (
 from diffsynth_engine.models.qwen_image import QwenImageVAE
 from diffsynth_engine.tokenizers import Qwen2TokenizerFast, Qwen2VLProcessor
 from diffsynth_engine.pipelines import BasePipeline, LoRAStateDictConverter
-from diffsynth_engine.pipelines.utils import calculate_shift
+from diffsynth_engine.pipelines.utils import calculate_shift, pad_and_concat
 from diffsynth_engine.algorithm.noise_scheduler import RecifitedFlowScheduler
 from diffsynth_engine.algorithm.sampler import FlowMatchEulerSampler
 from diffsynth_engine.utils.constants import (
@@ -71,6 +76,39 @@ class QwenImageLoRAConverter(LoRAStateDictConverter):
             lora_args["alpha"] = alpha

             key = key.replace(f".{lora_a_suffix}", "")
+            key = key.replace("base_model.model.", "")
+
+            if key.startswith("transformer") and "attn.to_out.0" in key:
+                key = key.replace("attn.to_out.0", "attn.to_out")
+            dit_dict[key] = lora_args
+        return {"dit": dit_dict}
+
+    def _from_diffusers(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, torch.Tensor]]:
+        dit_dict = {}
+        for key, param in lora_state_dict.items():
+            origin_key = key
+            lora_a_suffix = None
+            if "lora_A.weight" in key:
+                lora_a_suffix = "lora_A.weight"
+                lora_b_suffix = "lora_B.weight"
+
+            if lora_a_suffix is None:
+                continue
+
+            lora_args = {}
+            lora_args["down"] = param
+            lora_args["up"] = lora_state_dict[origin_key.replace(lora_a_suffix, lora_b_suffix)]
+            lora_args["rank"] = lora_args["up"].shape[1]
+            alpha_key = origin_key.replace(lora_a_suffix, "alpha")
+
+            if alpha_key in lora_state_dict:
+                alpha = lora_state_dict[alpha_key]
+            else:
+                alpha = lora_args["rank"]
+            lora_args["alpha"] = alpha
+
+            key = key.replace(f".{lora_a_suffix}", "")
+            key = key.replace("diffusion_model.", "")

             if key.startswith("transformer") and "attn.to_out.0" in key:
                 key = key.replace("attn.to_out.0", "attn.to_out")

@@ -78,7 +116,11 @@ class QwenImageLoRAConverter(LoRAStateDictConverter):
         return {"dit": dit_dict}

     def convert(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, torch.Tensor]]:
-
+        key = list(lora_state_dict.keys())[0]
+        if key.startswith("diffusion_model."):
+            return self._from_diffusers(lora_state_dict)
+        else:
+            return self._from_diffsynth(lora_state_dict)


 class QwenImagePipeline(BasePipeline):
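The two converter hunks above add a diffusers-style parser alongside the existing one. As a rough illustrative sketch (the keys below are invented for the example; only the prefix check mirrors the added convert() dispatch):

# Illustrative sketch only, not part of the wheel: how convert() routes a LoRA
# state dict to one of the two parsers added/extended above.
import torch

diffusers_style = {
    "diffusion_model.transformer_blocks.0.attn.to_q.lora_A.weight": torch.zeros(16, 3072),
    "diffusion_model.transformer_blocks.0.attn.to_q.lora_B.weight": torch.zeros(3072, 16),
}
diffsynth_style = {
    "base_model.model.transformer_blocks.0.attn.to_q.lora_A.weight": torch.zeros(16, 3072),
    "base_model.model.transformer_blocks.0.attn.to_q.lora_B.weight": torch.zeros(3072, 16),
}

def pick_branch(lora_state_dict):
    # Mirrors the added convert(): peek at the first key and route accordingly.
    key = list(lora_state_dict.keys())[0]
    return "_from_diffusers" if key.startswith("diffusion_model.") else "_from_diffsynth"

print(pick_branch(diffusers_style))   # _from_diffusers (strips the "diffusion_model." prefix)
print(pick_branch(diffsynth_style))   # _from_diffsynth (strips "base_model.model.")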
@@ -101,11 +143,25 @@ class QwenImagePipeline(BasePipeline):
             dtype=config.model_dtype,
         )
         self.config = config
+        # qwen image
         self.prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
         self.prompt_template_encode_start_idx = 34
+        # qwen image edit
+        self.edit_system_prompt = "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."
+        self.edit_prompt_template_encode = (
+            "<|im_start|>system\n"
+            + self.edit_system_prompt
+            + "<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
+        )
+        # qwen image edit plus
+        self.edit_plus_prompt_template_encode = (
+            "<|im_start|>system\n"
+            + self.edit_system_prompt
+            + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+        )

-        self.edit_prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
         self.edit_prompt_template_encode_start_idx = 64
+
         # sampler
         self.noise_scheduler = RecifitedFlowScheduler(shift=3.0, use_dynamic_shifting=True)
         self.sampler = FlowMatchEulerSampler()
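The three templates above differ only in their user block, and the *_start_idx values (34 and 64) are the number of leading template tokens later sliced off the encoder output (the hidden_states[:, drop_idx:] lines in the encode hunks further down). An illustrative sketch of how the edit-plus template is filled, with the system prompt abbreviated:

# Illustrative only: the full system prompt string is shown in the hunk above.
edit_system_prompt = "Describe the key features of the input image (...)"
edit_plus_template = (
    "<|im_start|>system\n"
    + edit_system_prompt
    + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
)

# One "Picture N" placeholder per conditioning image, prepended to the user text,
# matching encode_prompt_with_image() in a later hunk.
img_prompt = "".join(
    "Picture {}: <|vision_start|><|image_pad|><|vision_end|>".format(i + 1) for i in range(2)
)
print(edit_plus_template.format(img_prompt + "replace the sky with a sunset"))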
@@ -138,6 +194,7 @@ class QwenImagePipeline(BasePipeline):
         logger.info(f"loading state dict from {config.vae_path} ...")
         vae_state_dict = cls.load_model_checkpoint(config.vae_path, device="cpu", dtype=config.vae_dtype)

+        encoder_state_dict = None
         if config.encoder_path is None:
             config.encoder_path = fetch_model(
                 "MusePublic/Qwen-image",

@@ -149,8 +206,11 @@ class QwenImagePipeline(BasePipeline):
                     "text_encoder/model-00004-of-00004.safetensors",
                 ],
             )
-
-
+        if config.load_encoder:
+            logger.info(f"loading state dict from {config.encoder_path} ...")
+            encoder_state_dict = cls.load_model_checkpoint(
+                config.encoder_path, device="cpu", dtype=config.encoder_dtype
+            )

         state_dicts = QwenImageStateDicts(
             model=model_state_dict,
@@ -177,50 +237,44 @@ class QwenImagePipeline(BasePipeline):
     @classmethod
     def _from_state_dict(cls, state_dicts: QwenImageStateDicts, config: QwenImagePipelineConfig) -> "QwenImagePipeline":
         init_device = "cpu" if config.offload_mode is not None else config.device
-        tokenizer =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        tokenizer, processor, encoder = None, None, None
+        if config.load_encoder:
+            tokenizer = Qwen2TokenizerFast.from_pretrained(QWEN_IMAGE_TOKENIZER_CONF_PATH)
+            processor = Qwen2VLProcessor.from_pretrained(
+                tokenizer_config_path=QWEN_IMAGE_TOKENIZER_CONF_PATH,
+                image_processor_config_path=QWEN_IMAGE_PROCESSOR_CONFIG_FILE,
+            )
+            with open(QWEN_IMAGE_VISION_CONFIG_FILE, "r", encoding="utf-8") as f:
+                vision_config = Qwen2_5_VLVisionConfig(**json.load(f))
+            with open(QWEN_IMAGE_CONFIG_FILE, "r", encoding="utf-8") as f:
+                text_config = Qwen2_5_VLConfig(**json.load(f))
+            encoder = Qwen2_5_VLForConditionalGeneration.from_state_dict(
+                state_dicts.encoder,
+                vision_config=vision_config,
+                config=text_config,
+                device=("cpu" if config.use_fsdp else init_device),
+                dtype=config.encoder_dtype,
+            )
+
+        with open(QWEN_IMAGE_VAE_CONFIG_FILE, "r", encoding="utf-8") as f:
             vae_config = json.load(f)
         vae = QwenImageVAE.from_state_dict(
             state_dicts.vae, config=vae_config, device=init_device, dtype=config.vae_dtype
         )

         with LoRAContext():
-            attn_kwargs = {
-                "attn_impl": config.dit_attn_impl,
-                "sparge_smooth_k": config.sparge_smooth_k,
-                "sparge_cdfthreshd": config.sparge_cdfthreshd,
-                "sparge_simthreshd1": config.sparge_simthreshd1,
-                "sparge_pvthreshd": config.sparge_pvthreshd,
-            }
             if config.use_fbcache:
                 dit = QwenImageDiTFBCache.from_state_dict(
                     state_dicts.model,
-                    device=init_device,
+                    device=("cpu" if config.use_fsdp else init_device),
                     dtype=config.model_dtype,
-                    attn_kwargs=attn_kwargs,
                     relative_l1_threshold=config.fbcache_relative_l1_threshold,
                 )
             else:
                 dit = QwenImageDiT.from_state_dict(
                     state_dicts.model,
-                    device=init_device,
+                    device=("cpu" if config.use_fsdp else init_device),
                     dtype=config.model_dtype,
-                    attn_kwargs=attn_kwargs,
                 )
             if config.use_fp8_linear:
                 enable_fp8_linear(dit)
@@ -254,8 +308,13 @@ class QwenImagePipeline(BasePipeline):
             pipe.compile()
         return pipe

+    def update_weights(self, state_dicts: QwenImageStateDicts) -> None:
+        self.update_component(self.dit, state_dicts.model, self.config.device, self.config.model_dtype)
+        self.update_component(self.encoder, state_dicts.encoder, self.config.device, self.config.encoder_dtype)
+        self.update_component(self.vae, state_dicts.vae, self.config.device, self.config.vae_dtype)
+
     def compile(self):
-        self.dit.compile_repeated_blocks(
+        self.dit.compile_repeated_blocks()

     def load_loras(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
         assert self.config.tp_degree is None or self.config.tp_degree == 1, (

@@ -270,6 +329,10 @@ class QwenImagePipeline(BasePipeline):

     def unload_loras(self):
         self.dit.unload_loras()
+        self.noise_scheduler.restore_config()
+
+    def apply_scheduler_config(self, scheduler_config: Dict):
+        self.noise_scheduler.update_config(scheduler_config)

     def prepare_latents(
         self,
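A hypothetical usage sketch of the new scheduler hooks: update_config()/restore_config() appear above, but the accepted keys are not shown in this diff, so the "shift" key is an assumption taken from the RecifitedFlowScheduler(shift=3.0, ...) call in __init__, and `pipe` is assumed to be an already constructed QwenImagePipeline.

# Hypothetical sketch, not confirmed by this diff beyond the method names shown above.
pipe.apply_scheduler_config({"shift": 7.0})                 # assumed key; override scheduler settings
pipe.load_loras([("path/to/style_lora.safetensors", 0.8)])  # (path, scale) pairs, per the signature above
pipe(prompt="a watercolor fox", seed=42)
pipe.unload_loras()  # also calls noise_scheduler.restore_config(), undoing the override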
@@ -307,32 +370,43 @@ class QwenImagePipeline(BasePipeline):
         input_ids, attention_mask = outputs["input_ids"].to(self.device), outputs["attention_mask"].to(self.device)
         outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
         hidden_states = outputs["hidden_states"]
-
-
-        seq_len =
+        prompt_emb = hidden_states[:, drop_idx:]
+        prompt_emb_mask = attention_mask[:, drop_idx:]
+        seq_len = prompt_emb.shape[1]

         # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
-
-
+        prompt_emb = prompt_emb.repeat(1, num_images_per_prompt, 1)
+        prompt_emb = prompt_emb.view(batch_size * num_images_per_prompt, seq_len, -1)

-
-
+        prompt_emb_mask = prompt_emb_mask.repeat(1, num_images_per_prompt, 1)
+        prompt_emb_mask = prompt_emb_mask.view(batch_size * num_images_per_prompt, seq_len)

-        return
+        return prompt_emb, prompt_emb_mask

     def encode_prompt_with_image(
         self,
         prompt: Union[str, List[str]],
-
+        vae_image: List[torch.Tensor],
+        condition_image: List[torch.Tensor],  # edit plus
         num_images_per_prompt: int = 1,
         max_sequence_length: int = 1024,
+        is_edit_plus: bool = True,
     ):
         prompt = [prompt] if isinstance(prompt, str) else prompt

         batch_size = len(prompt)
         template = self.edit_prompt_template_encode
         drop_idx = self.edit_prompt_template_encode_start_idx
-
+        if not is_edit_plus:
+            template = self.edit_prompt_template_encode
+            texts = [template.format(txt) for txt in prompt]
+            image = vae_image
+        else:
+            template = self.edit_plus_prompt_template_encode
+            img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>"
+            img_prompt = "".join([img_prompt_template.format(i + 1) for i in range(len(condition_image))])
+            texts = [template.format(img_prompt + e) for e in prompt]
+            image = condition_image

         model_inputs = self.processor(text=texts, images=image, max_length=max_sequence_length + drop_idx)
         input_ids, attention_mask, pixel_values, image_grid_thw = (

@@ -348,18 +422,18 @@ class QwenImagePipeline(BasePipeline):
             image_grid_thw=image_grid_thw,
         )
         hidden_states = outputs["hidden_states"]
-
-
-        seq_len =
+        prompt_emb = hidden_states[:, drop_idx:]
+        prompt_emb_mask = attention_mask[:, drop_idx:]
+        seq_len = prompt_emb.shape[1]

         # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
-
-
+        prompt_emb = prompt_emb.repeat(1, num_images_per_prompt, 1)
+        prompt_emb = prompt_emb.view(batch_size * num_images_per_prompt, seq_len, -1)

-
-
+        prompt_emb_mask = prompt_emb_mask.repeat(1, num_images_per_prompt, 1)
+        prompt_emb_mask = prompt_emb_mask.view(batch_size * num_images_per_prompt, seq_len)

-        return
+        return prompt_emb, prompt_emb_mask

     def predict_noise_with_cfg(
         self,
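Both encode paths now slice off the template preamble and duplicate the result per prompt with the repeat/view trick noted in the existing comment. A standalone shape check of that arithmetic (all sizes illustrative):

# Illustrative shape check; 34 corresponds to prompt_template_encode_start_idx above.
import torch

batch_size, num_images_per_prompt, drop_idx, hidden = 2, 3, 34, 3584
hidden_states = torch.randn(batch_size, 100, hidden)
attention_mask = torch.ones(batch_size, 100, dtype=torch.long)

prompt_emb = hidden_states[:, drop_idx:]            # (2, 66, 3584): template preamble dropped
prompt_emb_mask = attention_mask[:, drop_idx:]      # (2, 66)
seq_len = prompt_emb.shape[1]

prompt_emb = prompt_emb.repeat(1, num_images_per_prompt, 1)
prompt_emb = prompt_emb.view(batch_size * num_images_per_prompt, seq_len, -1)
prompt_emb_mask = prompt_emb_mask.repeat(1, num_images_per_prompt, 1)
prompt_emb_mask = prompt_emb_mask.view(batch_size * num_images_per_prompt, seq_len)

print(prompt_emb.shape, prompt_emb_mask.shape)      # torch.Size([6, 66, 3584]) torch.Size([6, 66])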
@@ -368,9 +442,17 @@ class QwenImagePipeline(BasePipeline):
         timestep: torch.Tensor,
         prompt_emb: torch.Tensor,
         negative_prompt_emb: torch.Tensor,
-
-
-
+        prompt_emb_mask: torch.Tensor,
+        negative_prompt_emb_mask: torch.Tensor,
+        # in_context
+        context_latents: torch.Tensor = None,
+        # eligen
+        entity_prompt_embs: Optional[List[torch.Tensor]] = None,
+        entity_prompt_emb_masks: Optional[List[torch.Tensor]] = None,
+        negative_entity_prompt_embs: Optional[List[torch.Tensor]] = None,
+        negative_entity_prompt_emb_masks: Optional[List[torch.Tensor]] = None,
+        entity_masks: Optional[List[torch.Tensor]] = None,
+        cfg_scale: float = 1.0,
         batch_cfg: bool = False,
     ):
         if cfg_scale <= 1.0 or negative_prompt_emb is None:

@@ -379,7 +461,11 @@ class QwenImagePipeline(BasePipeline):
                 image_latents,
                 timestep,
                 prompt_emb,
-
+                prompt_emb_mask,
+                context_latents=context_latents,
+                entity_prompt_embs=entity_prompt_embs,
+                entity_prompt_emb_masks=entity_prompt_emb_masks,
+                entity_masks=entity_masks,
             )
         if not batch_cfg:
             # cfg by predict noise one by one

@@ -389,14 +475,22 @@ class QwenImagePipeline(BasePipeline):
                 image_latents,
                 timestep,
                 prompt_emb,
-
+                prompt_emb_mask,
+                context_latents=context_latents,
+                entity_prompt_embs=entity_prompt_embs,
+                entity_prompt_emb_masks=entity_prompt_emb_masks,
+                entity_masks=entity_masks,
             )
             negative_noise_pred = self.predict_noise(
                 latents,
                 image_latents,
                 timestep,
                 negative_prompt_emb,
-
+                negative_prompt_emb_mask,
+                context_latents=context_latents,
+                entity_prompt_embs=negative_entity_prompt_embs,
+                entity_prompt_emb_masks=negative_entity_prompt_emb_masks,
+                entity_masks=entity_masks,
             )
             comb_pred = negative_noise_pred + cfg_scale * (positive_noise_pred - negative_noise_pred)
             cond_norm = torch.norm(self.dit.patchify(positive_noise_pred), dim=-1, keepdim=True)
@@ -406,18 +500,32 @@ class QwenImagePipeline(BasePipeline):
         else:
             # cfg by predict noise in one batch
            bs, _, h, w = latents.shape
-            prompt_emb =
-
+            prompt_emb = pad_and_concat(prompt_emb, negative_prompt_emb)
+            prompt_emb_mask = pad_and_concat(prompt_emb_mask, negative_prompt_emb_mask)
+            if entity_prompt_embs is not None:
+                entity_prompt_embs = [
+                    torch.cat([x, y], dim=0) for x, y in zip(entity_prompt_embs, negative_entity_prompt_embs)
+                ]
+                entity_prompt_emb_masks = [
+                    torch.cat([x, y], dim=0) for x, y in zip(entity_prompt_emb_masks, negative_entity_prompt_emb_masks)
+                ]
+                entity_masks = [torch.cat([mask, mask], dim=0) for mask in entity_masks]
             latents = torch.cat([latents, latents], dim=0)
             if image_latents is not None:
-                image_latents = torch.cat([
+                image_latents = [torch.cat([image_latent, image_latent], dim=0) for image_latent in image_latents]
+            if context_latents is not None:
+                context_latents = torch.cat([context_latents, context_latents], dim=0)
             timestep = torch.cat([timestep, timestep], dim=0)
             noise_pred = self.predict_noise(
                 latents,
                 image_latents,
                 timestep,
                 prompt_emb,
-
+                prompt_emb_mask,
+                context_latents=context_latents,
+                entity_prompt_embs=entity_prompt_embs,
+                entity_prompt_emb_masks=entity_prompt_emb_masks,
+                entity_masks=entity_masks,
             )
             positive_noise_pred, negative_noise_pred = noise_pred[:bs], noise_pred[bs:]
             comb_pred = negative_noise_pred + cfg_scale * (positive_noise_pred - negative_noise_pred)
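In the batched branch, pad_and_concat (added in diffsynth_engine/pipelines/utils.py, see the last hunk) aligns positive and negative prompt lengths so a single forward pass covers both; the prediction is then split and combined with the usual true-CFG formula. A minimal sketch of that combine (shapes illustrative; the norm rescaling that follows in the real code is only partially visible in this hunk):

# Minimal sketch of the batched true-CFG combine above.
import torch

bs, cfg_scale = 1, 4.0
noise_pred = torch.randn(2 * bs, 16, 128, 128)   # one forward pass over [positive; negative]
positive_noise_pred, negative_noise_pred = noise_pred[:bs], noise_pred[bs:]
comb_pred = negative_noise_pred + cfg_scale * (positive_noise_pred - negative_noise_pred)
print(comb_pred.shape)                           # torch.Size([1, 16, 128, 128])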
@@ -432,15 +540,27 @@ class QwenImagePipeline(BasePipeline):
         image_latents: torch.Tensor,
         timestep: torch.Tensor,
         prompt_emb: torch.Tensor,
-
+        prompt_emb_mask: torch.Tensor,
+        # in_context
+        context_latents: torch.Tensor = None,
+        # eligen
+        entity_prompt_embs: Optional[List[torch.Tensor]] = None,
+        entity_prompt_emb_masks: Optional[List[torch.Tensor]] = None,
+        entity_masks: Optional[List[torch.Tensor]] = None,
     ):
         self.load_models_to_device(["dit"])
+        attn_kwargs = self.get_attn_kwargs(latents)
         noise_pred = self.dit(
             image=latents,
             edit=image_latents,
-            text=prompt_emb,
             timestep=timestep,
-
+            text=prompt_emb,
+            text_seq_lens=prompt_emb_mask.sum(dim=1),
+            context_latents=context_latents,
+            entity_text=entity_prompt_embs,
+            entity_seq_lens=[mask.sum(dim=1) for mask in entity_prompt_emb_masks] if entity_prompt_emb_masks else None,
+            entity_masks=entity_masks,
+            attn_kwargs=attn_kwargs,
         )
         return noise_pred

@@ -457,6 +577,20 @@ class QwenImagePipeline(BasePipeline):
         image_latents = image_latents.squeeze(2).to(device=self.device)
         return image_latents

+    def prepare_eligen(self, entity_prompts, entity_masks, width, height):
+        entity_masks = [mask.resize((width // 8, height // 8), resample=Image.NEAREST) for mask in entity_masks]
+        entity_masks = [self.preprocess_image(mask).mean(dim=1, keepdim=True) > 0 for mask in entity_masks]
+        entity_masks = [mask.to(device=self.device, dtype=self.dtype) for mask in entity_masks]
+        prompt_embs, prompt_emb_masks = [], []
+        negative_prompt_embs, negative_prompt_emb_masks = [], []
+        for entity_prompt in entity_prompts:
+            prompt_emb, prompt_emb_mask = self.encode_prompt(entity_prompt, 1, 512)
+            prompt_embs.append(prompt_emb)
+            prompt_emb_masks.append(prompt_emb_mask)
+            negative_prompt_embs.append(torch.zeros_like(prompt_emb))
+            negative_prompt_emb_masks.append(torch.zeros_like(prompt_emb_mask))
+        return prompt_embs, prompt_emb_masks, negative_prompt_embs, negative_prompt_emb_masks, entity_masks
+
     def calculate_dimensions(self, target_area, ratio):
         width = math.sqrt(target_area * ratio)
         height = width / ratio
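The two visible lines of calculate_dimensions pick a width/height pair that matches the requested aspect ratio and target pixel area; any rounding to valid sizes happens outside this hunk. A worked example of the visible arithmetic:

# Worked example of the visible arithmetic only (rounding, if any, is not shown here).
import math

def calculate_dimensions(target_area, ratio):
    width = math.sqrt(target_area * ratio)
    height = width / ratio
    return width, height

print(calculate_dimensions(1024 * 1024, 16 / 9))   # (~1365.3, 768.0)
print(calculate_dimensions(384 * 384, 1.0))        # (384.0, 384.0)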
@@ -469,21 +603,51 @@ class QwenImagePipeline(BasePipeline):
         self,
         prompt: str,
         negative_prompt: str = "",
-
+        # single image for edit, list for edit plus(QwenImageEdit2509)
+        input_image: List[Image.Image] | Image.Image | None = None,
         cfg_scale: float = 4.0,  # true cfg
-        height: int =
-        width: int =
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         seed: int | None = None,
+        controlnet_params: List[QwenImageControlNetParams] | QwenImageControlNetParams = [],
         progress_callback: Optional[Callable] = None,  # def progress_callback(current, total, status)
+        # eligen
+        entity_prompts: Optional[List[str]] = None,
+        entity_masks: Optional[List[Image.Image]] = None,
     ):
-
-
-            width, height = self.calculate_dimensions(1024 * 1024, width / height)
-            input_image = input_image.resize((width, height), Image.LANCZOS)
+        assert (height is None) == (width is None), "height and width should be set together"
+        is_edit_plus = isinstance(input_image, list)

+        if input_image is not None:
+            if not isinstance(input_image, list):
+                input_image = [input_image]
+            condition_images = []
+            vae_images = []
+            for img in input_image:
+                img_width, img_height = img.size
+                condition_width, condition_height = self.calculate_dimensions(384 * 384, img_width / img_height)
+                vae_width, vae_height = self.calculate_dimensions(1024 * 1024, img_width / img_height)
+                condition_images.append(img.resize((condition_width, condition_height), Image.LANCZOS))
+                vae_images.append(img.resize((vae_width, vae_height), Image.LANCZOS))
+            if width is None and height is None:
+                width, height = vae_images[-1].size
+
+        if width is None and height is None:
+            width, height = 1328, 1328
         self.validate_image_size(height, width, minimum=64, multiple_of=16)

+        if not isinstance(controlnet_params, list):
+            controlnet_params = [controlnet_params]
+
+        context_latents = None
+        for param in controlnet_params:
+            self.load_lora(param.model, param.scale, fused=False, save_original_weight=False)
+            if param.control_type == QwenImageControlType.in_context:
+                width, height = param.image.size
+                self.validate_image_size(height, width, minimum=64, multiple_of=16)
+                context_latents = self.prepare_image_latents(param.image.resize((width, height), Image.LANCZOS))
+
         noise = self.generate_noise((1, 16, height // 8, width // 8), seed=seed, device="cpu", dtype=self.dtype).to(
             device=self.device
         )
@@ -496,39 +660,60 @@ class QwenImagePipeline(BasePipeline):

         self.load_models_to_device(["vae"])
         if input_image:
-            image_latents = self.prepare_image_latents(
+            image_latents = [self.prepare_image_latents(img) for img in vae_images]
         else:
             image_latents = None

         self.load_models_to_device(["encoder"])
         if image_latents is not None:
-
+            prompt_emb, prompt_emb_mask = self.encode_prompt_with_image(
+                prompt, vae_images, condition_images, 1, 4096, is_edit_plus
+            )
             if cfg_scale > 1.0 and negative_prompt != "":
-
-                    negative_prompt,
+                negative_prompt_emb, negative_prompt_emb_mask = self.encode_prompt_with_image(
+                    negative_prompt, vae_images, condition_images, 1, 4096, is_edit_plus
                 )
             else:
-
+                negative_prompt_emb, negative_prompt_emb_mask = None, None
         else:
-
+            prompt_emb, prompt_emb_mask = self.encode_prompt(prompt, 1, 4096)
             if cfg_scale > 1.0 and negative_prompt != "":
-
+                negative_prompt_emb, negative_prompt_emb_mask = self.encode_prompt(negative_prompt, 1, 4096)
             else:
-
+                negative_prompt_emb, negative_prompt_emb_mask = None, None
+
+        entity_prompt_embs, entity_prompt_emb_masks = None, None
+        negative_entity_prompt_embs, negative_entity_prompt_emb_masks = None, None
+        if entity_prompts is not None and entity_masks is not None:
+            assert len(entity_prompts) == len(entity_masks), "entity_prompts and entity_masks must have the same length"
+            (
+                entity_prompt_embs,
+                entity_prompt_emb_masks,
+                negative_entity_prompt_embs,
+                negative_entity_prompt_emb_masks,
+                entity_masks,
+            ) = self.prepare_eligen(entity_prompts, entity_masks, width, height)
+
         self.model_lifecycle_finish(["encoder"])

+        self.load_models_to_device(["dit"])
         hide_progress = dist.is_initialized() and dist.get_rank() != 0
-
         for i, timestep in enumerate(tqdm(timesteps, disable=hide_progress)):
             timestep = timestep.unsqueeze(0).to(dtype=self.dtype)
             noise_pred = self.predict_noise_with_cfg(
                 latents=latents,
                 image_latents=image_latents,
                 timestep=timestep,
-                prompt_emb=
-                negative_prompt_emb=
-
-
+                prompt_emb=prompt_emb,
+                negative_prompt_emb=negative_prompt_emb,
+                prompt_emb_mask=prompt_emb_mask,
+                negative_prompt_emb_mask=negative_prompt_emb_mask,
+                context_latents=context_latents,
+                entity_prompt_embs=entity_prompt_embs,
+                entity_prompt_emb_masks=entity_prompt_emb_masks,
+                negative_entity_prompt_embs=negative_entity_prompt_embs,
+                negative_entity_prompt_emb_masks=negative_entity_prompt_emb_masks,
+                entity_masks=entity_masks,
                 cfg_scale=cfg_scale,
                 batch_cfg=self.config.batch_cfg,
             )
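Taken together, the __call__ changes above cover single-image edit, multi-image edit plus, and EliGen entity control. A hypothetical usage sketch, assuming `pipe` is an already constructed QwenImagePipeline and that the call returns the generated image (neither the constructor nor the return type appears in this diff; file names and argument values are illustrative):

# Hypothetical usage sketch based on the __call__ signature shown above.
from PIL import Image

ref_a = Image.open("sofa.png")
ref_b = Image.open("lamp.png")

# Edit-plus mode: a list of input images triggers the "Picture N" prompt template
# and the 384*384-area conditioning resize shown in the hunks above.
result = pipe(
    prompt="place the lamp next to the sofa",
    input_image=[ref_a, ref_b],
    num_inference_steps=30,
    cfg_scale=4.0,
    seed=42,
)

# EliGen mode: one prompt and one mask per entity; masks are downsampled by 8x
# in prepare_eligen() before being passed to the DiT.
mask = Image.open("subject_mask.png")
result = pipe(
    prompt="a cozy living room",
    entity_prompts=["a red velvet sofa"],
    entity_masks=[mask],
    width=1328,
    height=1328,
)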
diffsynth_engine/pipelines/sdxl_image.py

@@ -181,7 +181,7 @@ class SDXLImagePipeline(BasePipeline):

     @classmethod
     def from_state_dict(cls, state_dicts: SDXLStateDicts, config: SDXLPipelineConfig) -> "SDXLImagePipeline":
-        init_device = "cpu" if config.offload_mode else config.device
+        init_device = "cpu" if config.offload_mode is not None else config.device
         tokenizer = CLIPTokenizer.from_pretrained(SDXL_TOKENIZER_CONF_PATH)
         tokenizer_2 = CLIPTokenizer.from_pretrained(SDXL_TOKENIZER_2_CONF_PATH)
         with LoRAContext():
diffsynth_engine/pipelines/utils.py

@@ -1,3 +1,7 @@
+import torch
+import torch.nn.functional as F
+
+
 def accumulate(result, new_item):
     if result is None:
         return new_item
@@ -17,3 +21,51 @@ def calculate_shift(
     b = base_shift - m * base_seq_len
     mu = image_seq_len * m + b
     return mu
+
+
+def pad_and_concat(
+    tensor1: torch.Tensor,
+    tensor2: torch.Tensor,
+    concat_dim: int = 0,
+    pad_dim: int = 1,
+) -> torch.Tensor:
+    """
+    Concatenate two tensors along a specified dimension after padding along another dimension.
+
+    Assumes input tensors have shape (b, s, d), where:
+    - b: batch dimension
+    - s: sequence dimension (may differ)
+    - d: feature dimension
+
+    Args:
+        tensor1: First tensor with shape (b1, s1, d)
+        tensor2: Second tensor with shape (b2, s2, d)
+        concat_dim: Dimension to concatenate along, default is 0 (batch dimension)
+        pad_dim: Dimension to pad along, default is 1 (sequence dimension)
+
+    Returns:
+        Concatenated tensor, shape depends on concat_dim and pad_dim choices
+    """
+    assert tensor1.dim() == tensor2.dim(), "Both tensors must have the same number of dimensions"
+    assert concat_dim != pad_dim, "concat_dim and pad_dim cannot be the same"
+
+    len1, len2 = tensor1.shape[pad_dim], tensor2.shape[pad_dim]
+    max_len = max(len1, len2)
+
+    # Calculate the position of pad_dim in the padding list
+    # Padding format: from the last dimension, each pair represents (dim_n_left, dim_n_right, ..., dim_0_left, dim_0_right)
+    ndim = tensor1.dim()
+    padding = [0] * (2 * ndim)
+    pad_right_idx = -2 * pad_dim - 1
+
+    if len1 < max_len:
+        pad_len = max_len - len1
+        padding[pad_right_idx] = pad_len
+        tensor1 = F.pad(tensor1, padding, mode="constant", value=0)
+    elif len2 < max_len:
+        pad_len = max_len - len2
+        padding[pad_right_idx] = pad_len
+        tensor2 = F.pad(tensor2, padding, mode="constant", value=0)
+
+    # Concatenate along the specified dimension
+    return torch.cat([tensor1, tensor2], dim=concat_dim)
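A usage sketch for the new helper: the shorter sequence is zero-padded before the concat on the batch axis, which is what lets the batched-CFG path in qwen_image.py combine positive and negative prompt embeddings (and their masks) of different lengths:

# Usage sketch (shapes illustrative); requires the 0.6.x wheel.
import torch
from diffsynth_engine.pipelines.utils import pad_and_concat

pos = torch.randn(1, 52, 3584)   # positive prompt embedding, 52 tokens
neg = torch.randn(1, 9, 3584)    # negative prompt embedding, 9 tokens

batched = pad_and_concat(pos, neg)                           # (2, 52, 3584); neg zero-padded to 52 tokens
masks = pad_and_concat(torch.ones(1, 52), torch.ones(1, 9))  # (2, 52); padded positions stay 0
print(batched.shape, masks.sum(dim=1))                       # torch.Size([2, 52, 3584]) tensor([52., 9.])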