PyPI - diffusers - Versions diffs - 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl - Mend

diffusers 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (389) hide show

diffusers/pipelines/cogview4/pipeline_output.py ADDED Viewed

@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+from typing import List, Union
+import numpy as np
+import PIL.Image
+from ...utils import BaseOutput
+@dataclass
+class CogView4PipelineOutput(BaseOutput):
+    """
+    Output class for CogView4 pipelines.
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. The PIL images or numpy array represent the denoised images of the diffusion pipeline.
+    """
+    images: Union[List[PIL.Image.Image], np.ndarray]

diffusers/pipelines/consisid/__init__.py ADDED Viewed

@@ -0,0 +1,49 @@
+from typing import TYPE_CHECKING
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_opencv_available,
+    is_torch_available,
+    is_transformers_available,
+)
+_dummy_objects = {}
+_import_structure = {}
+try:
+    if not (is_transformers_available() and is_torch_available() and is_opencv_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_and_opencv_objects  # noqa F403
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_opencv_objects))
+else:
+    _import_structure["pipeline_consisid"] = ["ConsisIDPipeline"]
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_consisid import ConsisIDPipeline
+else:
+    import sys
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)

diffusers/pipelines/consisid/consisid_utils.py ADDED Viewed

@@ -0,0 +1,357 @@
+import importlib.util
+import os
+import cv2
+import numpy as np
+import torch
+from PIL import Image, ImageOps
+from torchvision.transforms import InterpolationMode
+from torchvision.transforms.functional import normalize, resize
+from ...utils import get_logger, load_image
+logger = get_logger(__name__)
+_insightface_available = importlib.util.find_spec("insightface") is not None
+_consisid_eva_clip_available = importlib.util.find_spec("consisid_eva_clip") is not None
+_facexlib_available = importlib.util.find_spec("facexlib") is not None
+if _insightface_available:
+    import insightface
+    from insightface.app import FaceAnalysis
+else:
+    raise ImportError("insightface is not available. Please install it using 'pip install insightface'.")
+if _consisid_eva_clip_available:
+    from consisid_eva_clip import create_model_and_transforms
+    from consisid_eva_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
+else:
+    raise ImportError("consisid_eva_clip is not available. Please install it using 'pip install consisid_eva_clip'.")
+if _facexlib_available:
+    from facexlib.parsing import init_parsing_model
+    from facexlib.utils.face_restoration_helper import FaceRestoreHelper
+else:
+    raise ImportError("facexlib is not available. Please install it using 'pip install facexlib'.")
+def resize_numpy_image_long(image, resize_long_edge=768):
+    """
+    Resize the input image to a specified long edge while maintaining aspect ratio.
+    Args:
+        image (numpy.ndarray): Input image (H x W x C or H x W).
+        resize_long_edge (int): The target size for the long edge of the image. Default is 768.
+    Returns:
+        numpy.ndarray: Resized image with the long edge matching `resize_long_edge`, while maintaining the aspect
+        ratio.
+    """
+    h, w = image.shape[:2]
+    if max(h, w) <= resize_long_edge:
+        return image
+    k = resize_long_edge / max(h, w)
+    h = int(h * k)
+    w = int(w * k)
+    image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)
+    return image
+def img2tensor(imgs, bgr2rgb=True, float32=True):
+    """Numpy array to tensor.
+    Args:
+        imgs (list[ndarray] | ndarray): Input images.
+        bgr2rgb (bool): Whether to change bgr to rgb.
+        float32 (bool): Whether to change to float32.
+    Returns:
+        list[tensor] | tensor: Tensor images. If returned results only have
+            one element, just return tensor.
+    """
+    def _totensor(img, bgr2rgb, float32):
+        if img.shape[2] == 3 and bgr2rgb:
+            if img.dtype == "float64":
+                img = img.astype("float32")
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        img = torch.from_numpy(img.transpose(2, 0, 1))
+        if float32:
+            img = img.float()
+        return img
+    if isinstance(imgs, list):
+        return [_totensor(img, bgr2rgb, float32) for img in imgs]
+    return _totensor(imgs, bgr2rgb, float32)
+def to_gray(img):
+    """
+    Converts an RGB image to grayscale by applying the standard luminosity formula.
+    Args:
+        img (torch.Tensor): The input image tensor with shape (batch_size, channels, height, width).
+                             The image is expected to be in RGB format (3 channels).
+    Returns:
+        torch.Tensor: The grayscale image tensor with shape (batch_size, 3, height, width).
+                      The grayscale values are replicated across all three channels.
+    """
+    x = 0.299 * img[:, 0:1] + 0.587 * img[:, 1:2] + 0.114 * img[:, 2:3]
+    x = x.repeat(1, 3, 1, 1)
+    return x
+def process_face_embeddings(
+    face_helper_1,
+    clip_vision_model,
+    face_helper_2,
+    eva_transform_mean,
+    eva_transform_std,
+    app,
+    device,
+    weight_dtype,
+    image,
+    original_id_image=None,
+    is_align_face=True,
+):
+    """
+    Process face embeddings from an image, extracting relevant features such as face embeddings, landmarks, and parsed
+    face features using a series of face detection and alignment tools.
+    Args:
+        face_helper_1: Face helper object (first helper) for alignment and landmark detection.
+        clip_vision_model: Pre-trained CLIP vision model used for feature extraction.
+        face_helper_2: Face helper object (second helper) for embedding extraction.
+        eva_transform_mean: Mean values for image normalization before passing to EVA model.
+        eva_transform_std: Standard deviation values for image normalization before passing to EVA model.
+        app: Application instance used for face detection.
+        device: Device (CPU or GPU) where the computations will be performed.
+        weight_dtype: Data type of the weights for precision (e.g., `torch.float32`).
+        image: Input image in RGB format with pixel values in the range [0, 255].
+        original_id_image: (Optional) Original image for feature extraction if `is_align_face` is False.
+        is_align_face: Boolean flag indicating whether face alignment should be performed.
+    Returns:
+        Tuple:
+            - id_cond: Concatenated tensor of Ante face embedding and CLIP vision embedding
+            - id_vit_hidden: Hidden state of the CLIP vision model, a list of tensors.
+            - return_face_features_image_2: Processed face features image after normalization and parsing.
+            - face_kps: Keypoints of the face detected in the image.
+    """
+    face_helper_1.clean_all()
+    image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+    # get antelopev2 embedding
+    face_info = app.get(image_bgr)
+    if len(face_info) > 0:
+        face_info = sorted(face_info, key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]))[
+            -1
+        ]  # only use the maximum face
+        id_ante_embedding = face_info["embedding"]  # (512,)
+        face_kps = face_info["kps"]
+    else:
+        id_ante_embedding = None
+        face_kps = None
+    # using facexlib to detect and align face
+    face_helper_1.read_image(image_bgr)
+    face_helper_1.get_face_landmarks_5(only_center_face=True)
+    if face_kps is None:
+        face_kps = face_helper_1.all_landmarks_5[0]
+    face_helper_1.align_warp_face()
+    if len(face_helper_1.cropped_faces) == 0:
+        raise RuntimeError("facexlib align face fail")
+    align_face = face_helper_1.cropped_faces[0]  # (512, 512, 3)  # RGB
+    # incase insightface didn't detect face
+    if id_ante_embedding is None:
+        logger.warning("Failed to detect face using insightface. Extracting embedding with align face")
+        id_ante_embedding = face_helper_2.get_feat(align_face)
+    id_ante_embedding = torch.from_numpy(id_ante_embedding).to(device, weight_dtype)  # torch.Size([512])
+    if id_ante_embedding.ndim == 1:
+        id_ante_embedding = id_ante_embedding.unsqueeze(0)  # torch.Size([1, 512])
+    # parsing
+    if is_align_face:
+        input = img2tensor(align_face, bgr2rgb=True).unsqueeze(0) / 255.0  # torch.Size([1, 3, 512, 512])
+        input = input.to(device)
+        parsing_out = face_helper_1.face_parse(normalize(input, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))[0]
+        parsing_out = parsing_out.argmax(dim=1, keepdim=True)  # torch.Size([1, 1, 512, 512])
+        bg_label = [0, 16, 18, 7, 8, 9, 14, 15]
+        bg = sum(parsing_out == i for i in bg_label).bool()
+        white_image = torch.ones_like(input)  # torch.Size([1, 3, 512, 512])
+        # only keep the face features
+        return_face_features_image = torch.where(bg, white_image, to_gray(input))  # torch.Size([1, 3, 512, 512])
+        return_face_features_image_2 = torch.where(bg, white_image, input)  # torch.Size([1, 3, 512, 512])
+    else:
+        original_image_bgr = cv2.cvtColor(original_id_image, cv2.COLOR_RGB2BGR)
+        input = img2tensor(original_image_bgr, bgr2rgb=True).unsqueeze(0) / 255.0  # torch.Size([1, 3, 512, 512])
+        input = input.to(device)
+        return_face_features_image = return_face_features_image_2 = input
+    # transform img before sending to eva-clip-vit
+    face_features_image = resize(
+        return_face_features_image, clip_vision_model.image_size, InterpolationMode.BICUBIC
+    )  # torch.Size([1, 3, 336, 336])
+    face_features_image = normalize(face_features_image, eva_transform_mean, eva_transform_std)
+    id_cond_vit, id_vit_hidden = clip_vision_model(
+        face_features_image.to(weight_dtype), return_all_features=False, return_hidden=True, shuffle=False
+    )  # torch.Size([1, 768]),  list(torch.Size([1, 577, 1024]))
+    id_cond_vit_norm = torch.norm(id_cond_vit, 2, 1, True)
+    id_cond_vit = torch.div(id_cond_vit, id_cond_vit_norm)
+    id_cond = torch.cat(
+        [id_ante_embedding, id_cond_vit], dim=-1
+    )  # torch.Size([1, 512]), torch.Size([1, 768])  ->  torch.Size([1, 1280])
+    return (
+        id_cond,
+        id_vit_hidden,
+        return_face_features_image_2,
+        face_kps,
+    )  # torch.Size([1, 1280]), list(torch.Size([1, 577, 1024]))
+def process_face_embeddings_infer(
+    face_helper_1,
+    clip_vision_model,
+    face_helper_2,
+    eva_transform_mean,
+    eva_transform_std,
+    app,
+    device,
+    weight_dtype,
+    img_file_path,
+    is_align_face=True,
+):
+    """
+    Process face embeddings from an input image for inference, including alignment, feature extraction, and embedding
+    concatenation.
+    Args:
+        face_helper_1: Face helper object (first helper) for alignment and landmark detection.
+        clip_vision_model: Pre-trained CLIP vision model used for feature extraction.
+        face_helper_2: Face helper object (second helper) for embedding extraction.
+        eva_transform_mean: Mean values for image normalization before passing to EVA model.
+        eva_transform_std: Standard deviation values for image normalization before passing to EVA model.
+        app: Application instance used for face detection.
+        device: Device (CPU or GPU) where the computations will be performed.
+        weight_dtype: Data type of the weights for precision (e.g., `torch.float32`).
+        img_file_path: Path to the input image file (string) or a numpy array representing an image.
+        is_align_face: Boolean flag indicating whether face alignment should be performed (default: True).
+    Returns:
+        Tuple:
+            - id_cond: Concatenated tensor of Ante face embedding and CLIP vision embedding.
+            - id_vit_hidden: Hidden state of the CLIP vision model, a list of tensors.
+            - image: Processed face image after feature extraction and alignment.
+            - face_kps: Keypoints of the face detected in the image.
+    """
+    # Load and preprocess the input image
+    if isinstance(img_file_path, str):
+        image = np.array(load_image(image=img_file_path).convert("RGB"))
+    else:
+        image = np.array(ImageOps.exif_transpose(Image.fromarray(img_file_path)).convert("RGB"))
+    # Resize image to ensure the longer side is 1024 pixels
+    image = resize_numpy_image_long(image, 1024)
+    original_id_image = image
+    # Process the image to extract face embeddings and related features
+    id_cond, id_vit_hidden, align_crop_face_image, face_kps = process_face_embeddings(
+        face_helper_1,
+        clip_vision_model,
+        face_helper_2,
+        eva_transform_mean,
+        eva_transform_std,
+        app,
+        device,
+        weight_dtype,
+        image,
+        original_id_image,
+        is_align_face,
+    )
+    # Convert the aligned cropped face image (torch tensor) to a numpy array
+    tensor = align_crop_face_image.cpu().detach()
+    tensor = tensor.squeeze()
+    tensor = tensor.permute(1, 2, 0)
+    tensor = tensor.numpy() * 255
+    tensor = tensor.astype(np.uint8)
+    image = ImageOps.exif_transpose(Image.fromarray(tensor))
+    return id_cond, id_vit_hidden, image, face_kps
+def prepare_face_models(model_path, device, dtype):
+    """
+    Prepare all face models for the facial recognition task.
+    Parameters:
+    - model_path: Path to the directory containing model files.
+    - device: The device (e.g., 'cuda', 'cpu') where models will be loaded.
+    - dtype: Data type (e.g., torch.float32) for model inference.
+    Returns:
+    - face_helper_1: First face restoration helper.
+    - face_helper_2: Second face restoration helper.
+    - face_clip_model: CLIP model for face extraction.
+    - eva_transform_mean: Mean value for image normalization.
+    - eva_transform_std: Standard deviation value for image normalization.
+    - face_main_model: Main face analysis model.
+    """
+    # get helper model
+    face_helper_1 = FaceRestoreHelper(
+        upscale_factor=1,
+        face_size=512,
+        crop_ratio=(1, 1),
+        det_model="retinaface_resnet50",
+        save_ext="png",
+        device=device,
+        model_rootpath=os.path.join(model_path, "face_encoder"),
+    )
+    face_helper_1.face_parse = None
+    face_helper_1.face_parse = init_parsing_model(
+        model_name="bisenet", device=device, model_rootpath=os.path.join(model_path, "face_encoder")
+    )
+    face_helper_2 = insightface.model_zoo.get_model(
+        f"{model_path}/face_encoder/models/antelopev2/glintr100.onnx", providers=["CUDAExecutionProvider"]
+    )
+    face_helper_2.prepare(ctx_id=0)
+    # get local facial extractor part 1
+    model, _, _ = create_model_and_transforms(
+        "EVA02-CLIP-L-14-336",
+        os.path.join(model_path, "face_encoder", "EVA02_CLIP_L_336_psz14_s6B.pt"),
+        force_custom_clip=True,
+    )
+    face_clip_model = model.visual
+    eva_transform_mean = getattr(face_clip_model, "image_mean", OPENAI_DATASET_MEAN)
+    eva_transform_std = getattr(face_clip_model, "image_std", OPENAI_DATASET_STD)
+    if not isinstance(eva_transform_mean, (list, tuple)):
+        eva_transform_mean = (eva_transform_mean,) * 3
+    if not isinstance(eva_transform_std, (list, tuple)):
+        eva_transform_std = (eva_transform_std,) * 3
+    eva_transform_mean = eva_transform_mean
+    eva_transform_std = eva_transform_std
+    # get local facial extractor part 2
+    face_main_model = FaceAnalysis(
+        name="antelopev2", root=os.path.join(model_path, "face_encoder"), providers=["CUDAExecutionProvider"]
+    )
+    face_main_model.prepare(ctx_id=0, det_size=(640, 640))
+    # move face models to device
+    face_helper_1.face_det.eval()
+    face_helper_1.face_parse.eval()
+    face_clip_model.eval()
+    face_helper_1.face_det.to(device)
+    face_helper_1.face_parse.to(device)
+    face_clip_model.to(device, dtype=dtype)
+    return face_helper_1, face_helper_2, face_clip_model, face_main_model, eva_transform_mean, eva_transform_std

diffusers 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

diffusers 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl