diffsynth-engine 0.6.1.dev41__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -307,6 +307,8 @@ class ZImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig,
     vae_dtype: torch.dtype = torch.bfloat16
     encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
     encoder_dtype: torch.dtype = torch.bfloat16
+    image_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    image_encoder_dtype: torch.dtype = torch.bfloat16
 
     @classmethod
     def basic_config(
@@ -314,6 +316,7 @@ class ZImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig,
         model_path: str | os.PathLike | List[str | os.PathLike],
         encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None,
         vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None,
+        image_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None,
         device: str = "cuda",
         parallelism: int = 1,
         offload_mode: Optional[str] = None,
@@ -324,6 +327,7 @@ class ZImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig,
             device=device,
             encoder_path=encoder_path,
             vae_path=vae_path,
+            image_encoder_path=image_encoder_path,
             parallelism=parallelism,
             use_cfg_parallel=True if parallelism > 1 else False,
             use_fsdp=True if parallelism > 1 else False,
@@ -391,6 +395,7 @@ class ZImageStateDicts:
     model: Dict[str, torch.Tensor]
     encoder: Dict[str, torch.Tensor]
     vae: Dict[str, torch.Tensor]
+    image_encoder: Optional[Dict[str, torch.Tensor]] = None
 
 
 def init_parallel_config(config: FluxPipelineConfig | QwenImagePipelineConfig | WanPipelineConfig | ZImagePipelineConfig):
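Taken together, the hunks above add an optional SigLIP2 image-encoder slot to ZImagePipelineConfig (a path plus a dtype defaulting to torch.bfloat16), thread the new image_encoder_path argument through basic_config, and reserve an optional image_encoder entry in ZImageStateDicts. A minimal sketch of how the extended basic_config might be called follows; the import path and the checkpoint filenames are placeholders, not values taken from this diff.

# Hypothetical call to the extended basic_config; import path and file names are assumptions.
from diffsynth_engine.configs import ZImagePipelineConfig

config = ZImagePipelineConfig.basic_config(
    model_path="checkpoints/z_image_dit.safetensors",              # placeholder path
    encoder_path="checkpoints/text_encoder.safetensors",           # placeholder path
    vae_path="checkpoints/vae.safetensors",                        # placeholder path
    image_encoder_path="checkpoints/siglip2_vision.safetensors",   # new in 0.7.0, placeholder path
    device="cuda",
)
# Per the diff, basic_config does not expose image_encoder_dtype; it keeps its torch.bfloat16 default.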
@@ -3,9 +3,13 @@ from .qwen3 import (
     Qwen3Config,
 )
 from .z_image_dit import ZImageDiT
+from .z_image_dit_omni_base import ZImageOmniBaseDiT
+from .siglip import Siglip2ImageEncoder
 
 __all__ = [
     "Qwen3Model",
     "Qwen3Config",
     "ZImageDiT",
+    "ZImageOmniBaseDiT",
+    "Siglip2ImageEncoder",
 ]
@@ -0,0 +1,72 @@
+from transformers import Siglip2VisionModel, Siglip2VisionConfig, Siglip2ImageProcessorFast
+import torch
+
+
+class Siglip2ImageEncoder(Siglip2VisionModel):
+    def __init__(self, **kwargs):
+        config = Siglip2VisionConfig(
+            attention_dropout = 0.0,
+            dtype = "bfloat16",
+            hidden_act = "gelu_pytorch_tanh",
+            hidden_size = 1152,
+            intermediate_size = 4304,
+            layer_norm_eps = 1e-06,
+            model_type = "siglip2_vision_model",
+            num_attention_heads = 16,
+            num_channels = 3,
+            num_hidden_layers = 27,
+            num_patches = 256,
+            patch_size = 16,
+            transformers_version = "4.57.1"
+        )
+        super().__init__(config)
+        self.processor = Siglip2ImageProcessorFast(
+            **{
+                "data_format": "channels_first",
+                "default_to_square": True,
+                "device": None,
+                "disable_grouping": None,
+                "do_convert_rgb": None,
+                "do_normalize": True,
+                "do_pad": None,
+                "do_rescale": True,
+                "do_resize": True,
+                "image_mean": [
+                    0.5,
+                    0.5,
+                    0.5
+                ],
+                "image_processor_type": "Siglip2ImageProcessorFast",
+                "image_std": [
+                    0.5,
+                    0.5,
+                    0.5
+                ],
+                "input_data_format": None,
+                "max_num_patches": 256,
+                "pad_size": None,
+                "patch_size": 16,
+                "processor_class": "Siglip2Processor",
+                "resample": 2,
+                "rescale_factor": 0.00392156862745098,
+                "return_tensors": None,
+            }
+        )
+
+    def forward(self, image, torch_dtype=torch.bfloat16, device="cuda"):
+        siglip_inputs = self.processor(images=[image], return_tensors="pt").to(device)
+        shape = siglip_inputs.spatial_shapes[0]
+        hidden_state = super().forward(**siglip_inputs).last_hidden_state
+        B, N, C = hidden_state.shape
+        hidden_state = hidden_state[:, : shape[0] * shape[1]]
+        hidden_state = hidden_state.view(shape[0], shape[1], C)
+        hidden_state = hidden_state.to(torch_dtype)
+        return hidden_state
+
+    @classmethod
+    def from_state_dict(cls, state_dict, device: str, dtype: torch.dtype):
+        model = cls()
+        model.requires_grad_(False)
+        model.load_state_dict(state_dict, assign=True)
+        model.to(device=device, dtype=dtype, non_blocking=True)
+        return model
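The new siglip.py module wraps Siglip2VisionModel with a hard-coded SigLIP2 vision configuration and its fast image processor; forward preprocesses a single image, runs the vision tower, and returns the patch-grid features as a (height_patches, width_patches, 1152) tensor in the requested dtype, while from_state_dict builds the module directly from pre-loaded weights. A rough usage sketch follows; the safetensors checkpoint path and the PIL input are assumptions, not part of the diff.

# Hypothetical usage of Siglip2ImageEncoder; checkpoint path and PIL input are assumptions.
import torch
from PIL import Image
from safetensors.torch import load_file

state_dict = load_file("checkpoints/siglip2_vision.safetensors")  # placeholder path
encoder = Siglip2ImageEncoder.from_state_dict(state_dict, device="cuda", dtype=torch.bfloat16)

image = Image.open("reference.png").convert("RGB")  # placeholder image
features = encoder(image)  # invokes forward(); shape (h_patches, w_patches, 1152)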