diffsynth-engine 0.4.3.dev9__py3-none-any.whl → 0.4.3.dev11__py3-none-any.whl

This diff shows the changes between publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the package contents as published in the public registry.
@@ -1,10 +1,11 @@
 import json
 import torch
+import torch.distributed as dist
 import math
 from typing import Callable, List, Tuple, Optional, Union, Dict
 from tqdm import tqdm
 from einops import rearrange
-import torch.distributed as dist
+from PIL import Image
 
 from diffsynth_engine.configs import QwenImagePipelineConfig, QwenImageStateDicts
 from diffsynth_engine.models.basic.lora import LoRAContext
@@ -16,13 +17,14 @@ from diffsynth_engine.models.qwen_image import (
     Qwen2_5_VLConfig,
 )
 from diffsynth_engine.models.qwen_image import QwenImageVAE
-from diffsynth_engine.tokenizers import Qwen2TokenizerFast
+from diffsynth_engine.tokenizers import Qwen2TokenizerFast, Qwen2VLProcessor
 from diffsynth_engine.pipelines import BasePipeline, LoRAStateDictConverter
 from diffsynth_engine.pipelines.utils import calculate_shift
 from diffsynth_engine.algorithm.noise_scheduler import RecifitedFlowScheduler
 from diffsynth_engine.algorithm.sampler import FlowMatchEulerSampler
 from diffsynth_engine.utils.constants import (
     QWEN_IMAGE_TOKENIZER_CONF_PATH,
+    QWEN_IMAGE_PROCESSOR_CONFIG_FILE,
     QWEN_IMAGE_CONFIG_FILE,
     QWEN_IMAGE_VISION_CONFIG_FILE,
     QWEN_IMAGE_VAE_CONFIG_FILE,
@@ -44,20 +46,23 @@ class QwenImageLoRAConverter(LoRAStateDictConverter):
             lora_a_suffix = None
             if "lora_A.default.weight" in key:
                 lora_a_suffix = "lora_A.default.weight"
+                lora_b_suffix = "lora_B.default.weight"
             elif "lora_A.weight" in key:
                 lora_a_suffix = "lora_A.weight"
+                lora_b_suffix = "lora_B.weight"
+            elif "lora_down.weight" in key:
+                lora_a_suffix = "lora_down.weight"
+                lora_b_suffix = "lora_up.weight"
 
             if lora_a_suffix is None:
                 continue
 
             lora_args = {}
             lora_args["down"] = param
-
-            lora_b_suffix = lora_a_suffix.replace("lora_A", "lora_B")
             lora_args["up"] = lora_state_dict[origin_key.replace(lora_a_suffix, lora_b_suffix)]
 
             lora_args["rank"] = lora_args["up"].shape[1]
-            alpha_key = origin_key.replace("lora_up", "lora_A").replace(lora_a_suffix, "alpha")
+            alpha_key = origin_key.replace(lora_a_suffix, "alpha")
 
             if alpha_key in lora_state_dict:
                 alpha = lora_state_dict[alpha_key]
@@ -83,6 +88,7 @@ class QwenImagePipeline(BasePipeline):
         self,
         config: QwenImagePipelineConfig,
         tokenizer: Qwen2TokenizerFast,
+        processor: Qwen2VLProcessor,
         encoder: Qwen2_5_VLForConditionalGeneration,
         dit: QwenImageDiT,
         vae: QwenImageVAE,
@@ -97,11 +103,15 @@ class QwenImagePipeline(BasePipeline):
         self.config = config
         self.prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
         self.prompt_template_encode_start_idx = 34
+
+        self.edit_prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
+        self.edit_prompt_template_encode_start_idx = 64
         # sampler
         self.noise_scheduler = RecifitedFlowScheduler(shift=3.0, use_dynamic_shifting=True)
         self.sampler = FlowMatchEulerSampler()
         # models
         self.tokenizer = tokenizer
+        self.processor = processor
         self.encoder = encoder
         self.dit = dit
         self.vae = vae
@@ -155,6 +165,10 @@ class QwenImagePipeline(BasePipeline):
 
         init_device = "cpu" if config.parallelism > 1 or config.offload_mode is not None else config.device
         tokenizer = Qwen2TokenizerFast.from_pretrained(QWEN_IMAGE_TOKENIZER_CONF_PATH)
+        processor = Qwen2VLProcessor.from_pretrained(
+            tokenizer_config_path=QWEN_IMAGE_TOKENIZER_CONF_PATH,
+            image_processor_config_path=QWEN_IMAGE_PROCESSOR_CONFIG_FILE,
+        )
         with open(QWEN_IMAGE_VISION_CONFIG_FILE, "r") as f:
             vision_config = Qwen2_5_VLVisionConfig(**json.load(f))
         with open(QWEN_IMAGE_CONFIG_FILE, "r") as f:
@@ -201,6 +215,7 @@ class QwenImagePipeline(BasePipeline):
         pipe = cls(
             config=config,
             tokenizer=tokenizer,
+            processor=processor,
             encoder=encoder,
             dit=dit,
             vae=vae,
@@ -209,7 +224,7 @@ class QwenImagePipeline(BasePipeline):
 
         if config.offload_mode is not None:
             pipe.enable_cpu_offload(config.offload_mode, config.offload_to_disk)
-
+
         if config.model_dtype == torch.float8_e4m3fn:
             pipe.dtype = torch.bfloat16  # compute dtype
             pipe.enable_fp8_autocast(
@@ -302,9 +317,51 @@ class QwenImagePipeline(BasePipeline):
 
         return prompt_embeds, prompt_embeds_mask
 
+    def encode_prompt_with_image(
+        self,
+        prompt: Union[str, List[str]],
+        image: torch.Tensor,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 1024,
+    ):
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        batch_size = len(prompt)
+        template = self.edit_prompt_template_encode
+        drop_idx = self.edit_prompt_template_encode_start_idx
+        texts = [template.format(txt) for txt in prompt]
+
+        model_inputs = self.processor(text=texts, images=image, max_length=max_sequence_length + drop_idx)
+        input_ids, attention_mask, pixel_values, image_grid_thw = (
+            model_inputs["input_ids"].to(self.device),
+            model_inputs["attention_mask"].to(self.device),
+            model_inputs["pixel_values"].to(self.device),
+            model_inputs["image_grid_thw"].to(self.device),
+        )
+        outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_thw,
+        )
+        hidden_states = outputs["hidden_states"]
+        prompt_embeds = hidden_states[:, drop_idx:]
+        prompt_embeds_mask = attention_mask[:, drop_idx:]
+        seq_len = prompt_embeds.shape[1]
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len)
+
+        return prompt_embeds, prompt_embeds_mask
+
     def predict_noise_with_cfg(
         self,
         latents: torch.Tensor,
+        image_latents: torch.Tensor,
         timestep: torch.Tensor,
         prompt_emb: torch.Tensor,
         negative_prompt_emb: torch.Tensor,
@@ -316,6 +373,7 @@ class QwenImagePipeline(BasePipeline):
         if cfg_scale <= 1.0 or negative_prompt_emb is None:
             return self.predict_noise(
                 latents,
+                image_latents,
                 timestep,
                 prompt_emb,
                 prompt_embeds_mask,
@@ -325,12 +383,14 @@ class QwenImagePipeline(BasePipeline):
            h, w = latents.shape[-2:]
            positive_noise_pred = self.predict_noise(
                latents,
+                image_latents,
                timestep,
                prompt_emb,
                prompt_embeds_mask,
            )
            negative_noise_pred = self.predict_noise(
                latents,
+                image_latents,
                timestep,
                negative_prompt_emb,
                negative_prompt_embeds_mask,
@@ -346,9 +406,11 @@ class QwenImagePipeline(BasePipeline):
            prompt_emb = torch.cat([prompt_emb, negative_prompt_emb], dim=0)
            prompt_embeds_mask = torch.cat([prompt_embeds_mask, negative_prompt_embeds_mask], dim=0)
            latents = torch.cat([latents, latents], dim=0)
+            image_latents = torch.cat([image_latents, image_latents], dim=0)
            timestep = torch.cat([timestep, timestep], dim=0)
            noise_pred = self.predict_noise(
                latents,
+                image_latents,
                timestep,
                prompt_emb,
                prompt_embeds_mask,
@@ -363,25 +425,49 @@ class QwenImagePipeline(BasePipeline):
     def predict_noise(
         self,
         latents: torch.Tensor,
+        image_latents: torch.Tensor,
         timestep: torch.Tensor,
         prompt_emb: torch.Tensor,
         prompt_embeds_mask: torch.Tensor,
     ):
         self.load_models_to_device(["dit"])
-
         noise_pred = self.dit(
             image=latents,
+            edit=image_latents,
             text=prompt_emb,
             timestep=timestep,
             txt_seq_lens=prompt_embeds_mask.sum(dim=1),
         )
         return noise_pred
 
+    def prepare_image_latents(self, input_image: Image.Image):
+        image = self.preprocess_image(input_image).to(
+            device=self.device, dtype=self.vae.model.encoder.conv1.weight.dtype
+        )
+        image = image.unsqueeze(2)
+        image_latents = self.vae.encode(
+            image,
+            device=self.device,
+            tiled=self.vae_tiled,
+            tile_size=self.vae_tile_size,
+            tile_stride=self.vae_tile_stride,
+        )
+        image_latents = image_latents.squeeze(2)
+        return image_latents
+
+    def calculate_dimensions(self, target_area, ratio):
+        width = math.sqrt(target_area * ratio)
+        height = width / ratio
+        width = round(width / 32) * 32
+        height = round(height / 32) * 32
+        return width, height
+
     @torch.no_grad()
     def __call__(
         self,
         prompt: str,
         negative_prompt: str = "",
+        input_image: Image.Image | None = None,  # use for img2img
         cfg_scale: float = 4.0,  # true cfg
         height: int = 1328,
         width: int = 1328,
@@ -389,29 +475,51 @@ class QwenImagePipeline(BasePipeline):
         seed: int | None = None,
         progress_callback: Optional[Callable] = None,  # def progress_callback(current, total, status)
     ):
+        if input_image is not None:
+            width, height = input_image.size
+            width, height = self.calculate_dimensions(1024 * 1024, width / height)
+            input_image = input_image.resize((width, height), Image.LANCZOS)
+
+        self.validate_image_size(height, width, minimum=64, multiple_of=16)
+
         noise = self.generate_noise((1, 16, height // 8, width // 8), seed=seed, device="cpu", dtype=self.dtype).to(
             device=self.device
         )
         # dynamic shift
         image_seq_len = math.ceil(height // 16) * math.ceil(width // 16)
         mu = calculate_shift(image_seq_len, max_shift=0.9, max_seq_len=8192)
+        if input_image:
+            image_latents = self.prepare_image_latents(input_image)
+        else:
+            image_latents = None
         init_latents, latents, sigmas, timesteps = self.prepare_latents(noise, num_inference_steps, mu)
         # Initialize sampler
         self.sampler.initialize(sigmas=sigmas)
 
         self.load_models_to_device(["encoder"])
-        prompt_embeds, prompt_embeds_mask = self.encode_prompt(prompt, 1, 4096)
-        if cfg_scale > 1.0 and negative_prompt != "":
-            negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(negative_prompt, 1, 4096)
+        if image_latents is not None:
+            prompt_embeds, prompt_embeds_mask = self.encode_prompt_with_image(prompt, input_image, 1, 4096)
+            if cfg_scale > 1.0 and negative_prompt != "":
+                negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt_with_image(
+                    negative_prompt, input_image, 1, 4096
+                )
+            else:
+                negative_prompt_embeds, negative_prompt_embeds_mask = None, None
         else:
-            negative_prompt_embeds, negative_prompt_embeds_mask = None, None
+            prompt_embeds, prompt_embeds_mask = self.encode_prompt(prompt, 1, 4096)
+            if cfg_scale > 1.0 and negative_prompt != "":
+                negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(negative_prompt, 1, 4096)
+            else:
+                negative_prompt_embeds, negative_prompt_embeds_mask = None, None
         self.model_lifecycle_finish(["encoder"])
 
         hide_progress = dist.is_initialized() and dist.get_rank() != 0
+
         for i, timestep in enumerate(tqdm(timesteps, disable=hide_progress)):
             timestep = timestep.unsqueeze(0).to(dtype=self.dtype)
             noise_pred = self.predict_noise_with_cfg(
                 latents=latents,
+                image_latents=image_latents,
                 timestep=timestep,
                 prompt_emb=prompt_embeds,
                 negative_prompt_emb=negative_prompt_embeds,
@@ -431,12 +539,16 @@ class QwenImagePipeline(BasePipeline):
         latents = rearrange(latents, "B C H W -> B C 1 H W")
         vae_output = rearrange(
             self.vae.decode(
-                latents.to(self.vae.model.encoder.conv1.weight.dtype), device=self.vae.model.encoder.conv1.weight.device
+                latents.to(self.vae.model.encoder.conv1.weight.dtype),
+                device=self.vae.model.encoder.conv1.weight.device,
+                tiled=self.vae_tiled,
+                tile_size=self.vae_tile_size,
+                tile_stride=self.vae_tile_stride,
             )[0],
             "C B H W -> B C H W",
         )
         image = self.vae_output_to_image(vae_output)
         # Offload all models
-        self.model_lifecycle_finish(["vae"])
+        self.model_lifecycle_finish(["vae"])
         self.load_models_to_device([])
         return image
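
Note: the pipeline changes above add an image-editing (img2img) path. When `input_image` is passed to `__call__`, the image is rescaled to roughly a 1024x1024 pixel area (dimensions rounded to multiples of 32 by `calculate_dimensions`), encoded into `image_latents` through the VAE, and the prompt is encoded jointly with the image via the new `encode_prompt_with_image` / `Qwen2VLProcessor` path; the resulting latents are fed to the DiT as its `edit` input. A minimal usage sketch under assumed setup (import path, pipeline construction, and file names are illustrative, not taken from this diff):

    from PIL import Image
    from diffsynth_engine.configs import QwenImagePipelineConfig
    from diffsynth_engine.pipelines.qwen_image import QwenImagePipeline

    # Assumed construction; configure model paths as in the project's existing examples.
    config = QwenImagePipelineConfig(model_path="path/to/qwen_image_dit.safetensors")
    pipe = QwenImagePipeline.from_pretrained(config)

    # New in this version: passing input_image switches __call__ into the editing path.
    edited = pipe(
        prompt="make the sky look like a sunset",
        input_image=Image.open("photo.png"),  # resized internally to ~1024x1024 area
        num_inference_steps=30,
        seed=42,
    )
    edited.save("photo_edited.png")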
@@ -181,21 +181,21 @@ class SDImagePipeline(BasePipeline):
            raise ValueError("`model_path` cannot be empty")
        logger.info(f"loading state dict from {config.model_path} ...")
        state_dicts.model = cls.load_model_checkpoint(config.model_path, device="cpu", dtype=config.model_dtype)
-
+
        if state_dicts.vae is None:
            if config.vae_path is None:
                state_dicts.vae = state_dicts.model
            else:
                logger.info(f"loading state dict from {config.vae_path} ...")
                state_dicts.vae = cls.load_model_checkpoint(config.vae_path, device="cpu", dtype=config.vae_dtype)
-
+
        if state_dicts.clip is None:
            if config.clip_path is None:
                state_dicts.clip = state_dicts.model
            else:
                logger.info(f"loading state dict from {config.clip_path} ...")
                state_dicts.clip = cls.load_model_checkpoint(config.clip_path, device="cpu", dtype=config.clip_dtype)
-
+
        init_device = "cpu" if config.offload_mode is not None else config.device
        tokenizer = CLIPTokenizer.from_pretrained(SDXL_TOKENIZER_CONF_PATH)
        with LoRAContext():
@@ -159,28 +159,32 @@ class SDXLImagePipeline(BasePipeline):
            raise ValueError("`model_path` cannot be empty")
        logger.info(f"loading state dict from {config.model_path} ...")
        state_dicts.model = cls.load_model_checkpoint(config.model_path, device="cpu", dtype=config.model_dtype)
-
+
        if state_dicts.vae is None:
            if config.vae_path is None:
                state_dicts.vae = state_dicts.model
            else:
                logger.info(f"loading state dict from {config.vae_path} ...")
                state_dicts.vae = cls.load_model_checkpoint(config.vae_path, device="cpu", dtype=config.vae_dtype)
-
+
        if state_dicts.clip_l is None:
            if config.clip_l_path is None:
                state_dicts.clip_l = state_dicts.model
            else:
                logger.info(f"loading state dict from {config.clip_l_path} ...")
-                state_dicts.clip_l = cls.load_model_checkpoint(config.clip_l_path, device="cpu", dtype=config.clip_l_dtype)
-
+                state_dicts.clip_l = cls.load_model_checkpoint(
+                    config.clip_l_path, device="cpu", dtype=config.clip_l_dtype
+                )
+
        if state_dicts.clip_g is None:
            if config.clip_g_path is None:
                state_dicts.clip_g = state_dicts.model
            else:
                logger.info(f"loading state dict from {config.clip_g_path} ...")
-                state_dicts.clip_g = cls.load_model_checkpoint(config.clip_g_path, device="cpu", dtype=config.clip_g_dtype)
-
+                state_dicts.clip_g = cls.load_model_checkpoint(
+                    config.clip_g_path, device="cpu", dtype=config.clip_g_dtype
+                )
+
        init_device = "cpu" if config.offload_mode else config.device
        tokenizer = CLIPTokenizer.from_pretrained(SDXL_TOKENIZER_CONF_PATH)
        tokenizer_2 = CLIPTokenizer.from_pretrained(SDXL_TOKENIZER_2_CONF_PATH)
@@ -3,6 +3,8 @@ from .clip import CLIPTokenizer
 from .t5 import T5TokenizerFast
 from .wan import WanT5Tokenizer
 from .qwen2 import Qwen2TokenizerFast
+from .qwen2_vl_image_processor import Qwen2VLImageProcessor
+from .qwen2_vl_processor import Qwen2VLProcessor
 
 __all__ = [
     "BaseTokenizer",
@@ -10,4 +12,6 @@ __all__ = [
     "T5TokenizerFast",
     "WanT5Tokenizer",
     "Qwen2TokenizerFast",
+    "Qwen2VLImageProcessor",
+    "Qwen2VLProcessor",
 ]
@@ -0,0 +1,157 @@
+# modified from transformers.models.qwen2_vl.image_processing_qwen2_vl
+import os
+import json
+import logging
+import numpy as np
+from typing import List, Optional
+from PIL import Image
+
+from diffsynth_engine.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
+from diffsynth_engine.utils.image import (
+    ChannelDimension,
+    convert_to_rgb,
+    get_image_size,
+    infer_channel_dimension_format,
+    rescale_image,
+    resize_image,
+    smart_resize,
+    normalize_image,
+    to_channel_dimension_format,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2VLImageProcessor:
+    def __init__(
+        self,
+        do_resize: bool = True,
+        resample: Image.Resampling = Image.Resampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: float = 1.0 / 255,
+        do_normalize: bool = True,
+        image_mean: List[float] = OPENAI_CLIP_MEAN,
+        image_std: List[float] = OPENAI_CLIP_STD,
+        do_convert_rgb: bool = True,
+        min_pixels: int = 56 * 56,
+        max_pixels: int = 28 * 28 * 1280,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        merge_size: int = 2,
+        **kwargs,
+    ):
+        self.do_resize = do_resize
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.do_convert_rgb = do_convert_rgb
+        self.size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.patch_size = patch_size
+        self.merge_size = merge_size
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.temporal_patch_size = temporal_patch_size
+
+    @classmethod
+    def from_pretrained(cls, config_file_path: str | os.PathLike, **kwargs):
+        init_kwargs = {}
+        if not os.path.exists(config_file_path):
+            logger.warning(f"Cannot find {config_file_path}, init processor with default parameters")
+        else:
+            with open(config_file_path, "r", encoding="utf-8") as kwargs_handler:
+                init_kwargs = json.load(kwargs_handler)
+
+        init_kwargs.update(**kwargs)
+        return cls(**init_kwargs)
+
+    def __call__(
+        self,
+        images: Image.Image | List[Image.Image],
+        videos: Optional[List[List[Image.Image]]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+    ):
+        pixel_values, image_grid_thws = None, None
+        if images is not None:
+            if isinstance(images, Image.Image):
+                images = [images]
+            pixel_values, image_grid_thws = [], []
+            for image in images:
+                flatten_patches, image_grid_thw = self._preprocess([image], data_format)
+                pixel_values.extend(flatten_patches)
+                image_grid_thws.append(image_grid_thw)
+            pixel_values = np.array(pixel_values)
+            image_grid_thws = np.array(image_grid_thws)
+
+        vision_pixel_values, vision_grid_thws = None, None
+        if videos is not None:
+            vision_pixel_values, vision_grid_thws = [], []
+            for images in videos:
+                flatten_patches, video_grid_thw = self._preprocess(images, data_format)
+                vision_pixel_values.append(flatten_patches)
+                vision_grid_thws.append(video_grid_thw)
+            vision_pixel_values = np.array(vision_pixel_values)
+            vision_grid_thws = np.array(vision_grid_thws)
+
+        return pixel_values, image_grid_thws, vision_pixel_values, vision_grid_thws
+
+    def _preprocess(self, images: List[Image.Image], data_format: Optional[ChannelDimension] = ChannelDimension.FIRST):
+        images = [convert_to_rgb(image) for image in images]
+        image_nps = [np.array(image) for image in images]
+        input_data_format = infer_channel_dimension_format(image_nps[0])
+        height, width = get_image_size(image_nps[0], input_data_format)
+        resized_height, resized_width = height, width
+
+        processed_image_nps = []
+        for image_np in image_nps:
+            if self.do_resize:
+                resized_height, resized_width = smart_resize(
+                    height,
+                    width,
+                    factor=self.patch_size * self.merge_size,
+                    min_pixels=self.min_pixels,
+                    max_pixels=self.max_pixels,
+                )
+                image_np = resize_image(
+                    image_np, resized_height, resized_width, self.resample, input_data_format=input_data_format
+                )
+
+            if self.do_rescale:
+                image_np = rescale_image(image_np, self.rescale_factor)
+
+            if self.do_normalize:
+                image_np = normalize_image(
+                    image_np, self.image_mean, self.image_std, input_data_format=input_data_format
+                )
+            image_np = to_channel_dimension_format(image_np, data_format, input_data_format)
+            processed_image_nps.append(image_np)
+
+        patches = np.array(processed_image_nps)
+        if data_format == ChannelDimension.LAST:
+            patches = patches.transpose(0, 3, 1, 2)
+        if patches.shape[0] % self.temporal_patch_size != 0:
+            repeats = np.repeat(patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0)
+            patches = np.concatenate([patches, repeats], axis=0)
+        num_channel = patches.shape[1]
+        grid_t = patches.shape[0] // self.temporal_patch_size
+        grid_h = resized_height // self.patch_size
+        grid_w = resized_width // self.patch_size
+        patches = patches.reshape(
+            grid_t,
+            self.temporal_patch_size,
+            num_channel,
+            grid_h // self.merge_size,
+            self.merge_size,
+            self.patch_size,
+            grid_w // self.merge_size,
+            self.merge_size,
+            self.patch_size,
+        )
+        patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
+        flatten_patches = patches.reshape(
+            grid_t * grid_h * grid_w, num_channel * self.temporal_patch_size * self.patch_size * self.patch_size
+        )
+
+        return flatten_patches, (grid_t, grid_h, grid_w)
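
Note: the reshape above follows the upstream Qwen2-VL image processor this file is modified from: images are resized so height and width are multiples of patch_size * merge_size (14 * 2 = 28), cut into 14x14 patches, and a single image is duplicated to fill temporal_patch_size = 2. The returned grid (grid_t, grid_h, grid_w) later determines how many <|image_pad|> tokens the processor expands to (grid product divided by merge_size ** 2). A small arithmetic sketch with the defaults (illustrative, not part of the package):

    patch_size, merge_size, temporal_patch_size = 14, 2, 2

    resized_height, resized_width = 1036, 1036      # already multiples of 28
    grid_t = 1                                      # one image, repeated to fill temporal_patch_size
    grid_h = resized_height // patch_size           # 74
    grid_w = resized_width // patch_size            # 74

    num_patches = grid_t * grid_h * grid_w                          # 5476 flattened patches
    patch_dim = 3 * temporal_patch_size * patch_size * patch_size   # 1176 values per patch
    num_image_tokens = num_patches // merge_size ** 2               # 1369 <|image_pad|> tokens after 2x2 merging
    print(num_patches, patch_dim, num_image_tokens)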
@@ -0,0 +1,100 @@
+import os
+import re
+import torch
+import logging
+from PIL import Image
+from typing import List, Dict, Optional
+
+from diffsynth_engine.tokenizers.qwen2_vl_image_processor import Qwen2VLImageProcessor
+from diffsynth_engine.tokenizers.qwen2 import Qwen2TokenizerFast
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2VLProcessor:
+    def __init__(
+        self,
+        tokenizer: Qwen2TokenizerFast,
+        image_processor: Qwen2VLImageProcessor,
+        image_token: str = "<|image_pad|>",
+        **kwargs,
+    ):
+        self.tokenizer = tokenizer
+        self.image_processor = image_processor
+        self.image_token = image_token
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        tokenizer_config_path: str | os.PathLike,
+        image_processor_config_path: str | os.PathLike,
+        **kwargs,
+    ):
+        tokenizer = Qwen2TokenizerFast.from_pretrained(tokenizer_config_path)
+        image_processor = Qwen2VLImageProcessor.from_pretrained(image_processor_config_path)
+        return cls(tokenizer=tokenizer, image_processor=image_processor, **kwargs)
+
+    def batch_decode(
+        self,
+        ids: List[List[int]] | List[torch.Tensor],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+    ):
+        if isinstance(ids[0], torch.Tensor):
+            ids = [id_.tolist() for id_ in ids]
+        decoded = self.tokenizer.batch_decode(ids, skip_special_tokens, clean_up_tokenization_spaces)
+        pattern = r"<\|vision_start\|>.*?<\|vision_end\|>"
+        decoded_with_image_tag = [re.sub(pattern, "<image>", d, flags=re.DOTALL) for d in decoded]
+        decoded_with_image_tag = [re.sub(r"<\|im_end\|>", "", d) for d in decoded_with_image_tag]
+        return decoded_with_image_tag
+
+    def __call__(
+        self,
+        text: str | List[str],
+        images: Optional[List[Image.Image]] = None,
+        videos: Optional[List[List[Image.Image]]] = None,
+        max_length: Optional[int] = None,
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
+        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
+        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
+
+        Args:
+            text (`List[str]`):
+                The sequence or batch of sequences to be encoded.
+            images (`List[PIL.Image.Image]`):
+                The batch of images to be prepared.
+            videos (`List[List[PIL.Image.Image]]`):
+                The batch of videos to be prepared.
+        """
+        images_pixel_values, images_grid_thws, video_pixels_values, video_grid_thws = self.image_processor(
+            images, videos
+        )
+
+        if not isinstance(text, list):
+            text = [text]
+        if images_grid_thws is not None:
+            merge_length = self.image_processor.merge_size**2
+            index = 0
+            for i in range(len(text)):
+                while self.image_token in text[i]:
+                    text[i] = text[i].replace(
+                        self.image_token, "<|placeholder|>" * (images_grid_thws[index].prod() // merge_length), 1
+                    )
+                    index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.image_token)
+        text_inputs = self.tokenizer(text, max_length=max_length)
+
+        processed_inputs = text_inputs
+        if images_pixel_values is not None:
+            processed_inputs["pixel_values"] = torch.from_numpy(images_pixel_values)
+        if images_grid_thws is not None:
+            processed_inputs["image_grid_thw"] = torch.from_numpy(images_grid_thws)
+        if video_pixels_values is not None:
+            processed_inputs["pixel_values_videos"] = video_pixels_values
+        if video_grid_thws is not None:
+            processed_inputs["video_grid_thw"] = video_grid_thws
+
+        return processed_inputs
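
Note: Qwen2VLProcessor pairs the existing Qwen2TokenizerFast with the new Qwen2VLImageProcessor: each <|image_pad|> placeholder in the text is expanded to one token per merged image patch before tokenization, and the pixel values and grid shapes are returned alongside the token ids. A standalone sketch using the bundled config paths (the image file name is a placeholder):

    from PIL import Image
    from diffsynth_engine.tokenizers import Qwen2VLProcessor
    from diffsynth_engine.utils.constants import (
        QWEN_IMAGE_TOKENIZER_CONF_PATH,
        QWEN_IMAGE_PROCESSOR_CONFIG_FILE,
    )

    processor = Qwen2VLProcessor.from_pretrained(
        tokenizer_config_path=QWEN_IMAGE_TOKENIZER_CONF_PATH,
        image_processor_config_path=QWEN_IMAGE_PROCESSOR_CONFIG_FILE,
    )

    # The editing template in QwenImagePipeline wraps the image like this before the instruction.
    text = "<|vision_start|><|image_pad|><|vision_end|>make the sky look like a sunset"
    inputs = processor(text=[text], images=Image.open("photo.png"), max_length=1024)
    # inputs contains input_ids, attention_mask, pixel_values and image_grid_thw tensors.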
@@ -5,6 +5,7 @@ REPO_ROOT = os.path.dirname(PACKAGE_ROOT)
 
 # conf
 CONF_PATH = os.path.join(PACKAGE_ROOT, "conf")
+
 # tokenizers
 FLUX_TOKENIZER_1_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "flux", "tokenizer_1")
 FLUX_TOKENIZER_2_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "flux", "tokenizer_2")
@@ -12,6 +13,8 @@ SDXL_TOKENIZER_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "sdxl", "tokeni
 SDXL_TOKENIZER_2_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "sdxl", "tokenizer_2")
 WAN_TOKENIZER_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "wan", "umt5-xxl")
 QWEN_IMAGE_TOKENIZER_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "qwen_image", "tokenizer")
+QWEN_IMAGE_PROCESSOR_CONFIG_FILE = os.path.join(CONF_PATH, "tokenizers", "qwen_image", "qwen2_vl_image_processor.json")
+
 # models
 VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "components", "vae.json")
 FLUX_DIT_CONFIG_FILE = os.path.join(CONF_PATH, "models", "flux", "flux_dit.json")
@@ -46,3 +49,6 @@ KB = 1024
 MB = 1024 * KB
 GB = 1024 * MB
 TB = 1024 * GB
+
+OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
+OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]