PyPI - diffusers - Versions diffs - 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl - Mend

diffusers 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (389) hide show

diffusers/utils/remote_utils.py ADDED Viewed

@@ -0,0 +1,425 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import io
+import json
+from typing import List, Literal, Optional, Union, cast
+import requests
+from .deprecation_utils import deprecate
+from .import_utils import is_safetensors_available, is_torch_available
+if is_torch_available():
+    import torch
+    from ..image_processor import VaeImageProcessor
+    from ..video_processor import VideoProcessor
+    if is_safetensors_available():
+        import safetensors.torch
+    DTYPE_MAP = {
+        "float16": torch.float16,
+        "float32": torch.float32,
+        "bfloat16": torch.bfloat16,
+        "uint8": torch.uint8,
+    }
+from PIL import Image
+def detect_image_type(data: bytes) -> str:
+    """
+    Identify an image container format from the leading magic bytes of `data`.
+    Returns `"jpeg"`, `"png"`, `"gif"` or `"bmp"` when `data` starts with the matching signature, and `"unknown"`
+    otherwise (an empty input is therefore `"unknown"`).
+    """
+    signatures = (
+        ("jpeg", (b"\xff\xd8",)),
+        ("png", (b"\x89PNG\r\n\x1a\n",)),
+        ("gif", (b"GIF87a", b"GIF89a")),
+        ("bmp", (b"BM",)),
+    )
+    for image_type, prefixes in signatures:
+        # `bytes.startswith` accepts a tuple and succeeds if any of the prefixes matches.
+        if data.startswith(prefixes):
+            return image_type
+    return "unknown"
+def check_inputs_decode(
+    endpoint: str,
+    tensor: "torch.Tensor",
+    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
+    do_scaling: bool = True,
+    scaling_factor: Optional[float] = None,
+    shift_factor: Optional[float] = None,
+    output_type: Literal["mp4", "pil", "pt"] = "pil",
+    return_type: Literal["mp4", "pil", "pt"] = "pil",
+    image_format: Literal["png", "jpg"] = "jpg",
+    partial_postprocess: bool = False,
+    input_tensor_type: Literal["binary"] = "binary",
+    output_tensor_type: Literal["binary"] = "binary",
+    height: Optional[int] = None,
+    width: Optional[int] = None,
+):
+    """
+    Validate the argument combination given to `remote_decode` before a request is sent.
+    Only `tensor`, `processor`, `do_scaling`, `scaling_factor`, `output_type`, `return_type`, `partial_postprocess`,
+    `height` and `width` are inspected; the other parameters mirror the `remote_decode` signature.
+    Raises:
+        ValueError: If `tensor` has 3 dimensions and both `height` and `width` are `None`, or if
+            `output_type="pt"` must be returned as `"pil"` without `partial_postprocess` while `processor` is not a
+            `VaeImageProcessor` or `VideoProcessor`.
+    A deprecation notice is issued through `deprecate` when `do_scaling` is truthy and `scaling_factor` is `None`.
+    """
+    packed_without_size = tensor.ndim == 3 and height is None and width is None
+    if packed_without_size:
+        raise ValueError("`height` and `width` required for packed latents.")
+    # The tensor returned by the endpoint has to be converted to PIL locally, which needs a processor.
+    converts_tensor_locally = output_type == "pt" and return_type == "pil" and not partial_postprocess
+    if converts_tensor_locally and not isinstance(processor, (VaeImageProcessor, VideoProcessor)):
+        raise ValueError("`processor` is required.")
+    if scaling_factor is None and do_scaling:
+        deprecate(
+            "do_scaling",
+            "1.0.0",
+            "`do_scaling` is deprecated, pass `scaling_factor` and `shift_factor` if required.",
+            standard_warn=False,
+        )
+def postprocess_decode(
+    response: requests.Response,
+    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
+    output_type: Literal["mp4", "pil", "pt"] = "pil",
+    return_type: Literal["mp4", "pil", "pt"] = "pil",
+    partial_postprocess: bool = False,
+):
+    """
+    Convert the endpoint `response` of a remote decode request into the object selected by `return_type`.
+    Args:
+        response (`requests.Response`):
+            Endpoint response. For tensor payloads the `shape` header (a JSON list) and the `dtype` header (a key of
+            `DTYPE_MAP`) describe the raw bytes in `response.content`.
+        processor (`VaeImageProcessor` or `VideoProcessor`, *optional*):
+            With `output_type="pt"` and `partial_postprocess=False` it converts the tensor to PIL output. With
+            `output_type="pil"` any non-`None` value makes the payload be read as a tensor instead of image bytes.
+        output_type (`"mp4"` or `"pil"` or `"pt"`, default `"pil"`):
+            Payload type the endpoint was asked to produce.
+        return_type (`"mp4"` or `"pil"` or `"pt"`, default `"pil"`):
+            Type of the object to return.
+        partial_postprocess (`bool`, default `False`):
+            With `output_type="pt"`, whether the tensor can be turned into images directly with `Image.fromarray`.
+    Returns:
+        A `PIL.Image.Image`, a list of them, a `torch.Tensor`, or the raw `bytes` of the response.
+    Raises:
+        ValueError: If the argument combination matches none of the supported conversions.
+    """
+    # Stays `None` only when no branch below handles the requested combination.
+    output = None
+    if output_type == "pt" or (output_type == "pil" and processor is not None):
+        # The payload is a raw tensor buffer; its shape and dtype are carried in the response headers.
+        parameters = response.headers
+        shape = json.loads(parameters["shape"])
+        torch_dtype = DTYPE_MAP[parameters["dtype"]]
+        # Copying into a `bytearray` hands `torch.frombuffer` a writable buffer.
+        output_tensor = torch.frombuffer(bytearray(response.content), dtype=torch_dtype).reshape(shape)
+    if output_type == "pt":
+        if partial_postprocess:
+            if return_type == "pil":
+                output = [Image.fromarray(image.numpy()) for image in output_tensor]
+                # A batch of one is unwrapped to a single image.
+                if len(output) == 1:
+                    output = output[0]
+            elif return_type == "pt":
+                output = output_tensor
+        else:
+            if processor is None or return_type == "pt":
+                output = output_tensor
+            else:
+                if isinstance(processor, VideoProcessor):
+                    output = cast(
+                        List[Image.Image],
+                        processor.postprocess_video(output_tensor, output_type="pil")[0],
+                    )
+                else:
+                    output = cast(
+                        Image.Image,
+                        processor.postprocess(output_tensor, output_type="pil")[0],
+                    )
+    elif output_type == "pil" and return_type == "pil" and processor is None:
+        # The payload is an encoded image; `convert` drops the format, so it is re-attached from the magic bytes.
+        output = Image.open(io.BytesIO(response.content)).convert("RGB")
+        detected_format = detect_image_type(response.content)
+        output.format = detected_format
+    elif output_type == "pil" and processor is not None:
+        if return_type == "pil":
+            # Channels are moved to the last axis and values scaled by 255 before building one image per item.
+            output = [
+                Image.fromarray(image)
+                for image in (output_tensor.permute(0, 2, 3, 1).float().numpy() * 255).round().astype("uint8")
+            ]
+        elif return_type == "pt":
+            output = output_tensor
+    elif output_type == "mp4" and return_type == "mp4":
+        output = response.content
+    if output is None:
+        # Previously this fell through to `return output` and failed with an opaque UnboundLocalError.
+        raise ValueError(
+            f"Unsupported combination: output_type={output_type!r}, return_type={return_type!r}, "
+            f"partial_postprocess={partial_postprocess!r}, "
+            f"processor {'provided' if processor is not None else 'not provided'}."
+        )
+    return output
+def prepare_decode(
+    tensor: "torch.Tensor",
+    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
+    do_scaling: bool = True,
+    scaling_factor: Optional[float] = None,
+    shift_factor: Optional[float] = None,
+    output_type: Literal["mp4", "pil", "pt"] = "pil",
+    image_format: Literal["png", "jpg"] = "jpg",
+    partial_postprocess: bool = False,
+    height: Optional[int] = None,
+    width: Optional[int] = None,
+):
+    """
+    Build the keyword arguments (`data`, `params`, `headers`) for the `requests.post` call of a remote decode.
+    Args:
+        tensor (`torch.Tensor`):
+            Tensor to send; its shape and dtype name are added to the request parameters.
+        processor (`VaeImageProcessor` or `VideoProcessor`, *optional*):
+            Only its presence is used: with `output_type="pil"` and no processor an image MIME type is requested.
+        do_scaling (`bool`, default `True`):
+            When falsy, neither `scaling_factor`, `shift_factor` nor `do_scaling` is sent. When truthy and
+            `scaling_factor` is `None`, `do_scaling` itself is sent.
+        scaling_factor (`float`, *optional*):
+            Sent when `do_scaling` is truthy and the value is not `None`.
+        shift_factor (`float`, *optional*):
+            Sent when `do_scaling` is truthy and the value is not `None`.
+        output_type (`"mp4"` or `"pil"` or `"pt"`, default `"pil"`):
+            Sent as a parameter and used to pick the `Accept` header.
+        image_format (`"png"` or `"jpg"`, default `"jpg"`):
+            Sent as a parameter and used to pick the `Accept` header.
+        partial_postprocess (`bool`, default `False`):
+            Sent as a parameter.
+        height (`int`, *optional*), width (`int`, *optional*):
+            Sent only when both are not `None`.
+    Returns:
+        `dict` with the serialized tensor under `"data"`, the query parameters under `"params"` and the request
+        headers under `"headers"`.
+    """
+    parameters = {
+        "image_format": image_format,
+        "output_type": output_type,
+        "partial_postprocess": partial_postprocess,
+        "shape": list(tensor.shape),
+        "dtype": str(tensor.dtype).split(".")[-1],
+    }
+    if do_scaling:
+        if scaling_factor is not None:
+            parameters["scaling_factor"] = scaling_factor
+        if shift_factor is not None:
+            parameters["shift_factor"] = shift_factor
+        # Deprecated path: without an explicit factor the flag itself is forwarded. The former extra `elif`
+        # repeating this assignment could never run, because its condition was a subset of this one.
+        if scaling_factor is None:
+            parameters["do_scaling"] = do_scaling
+    if height is not None and width is not None:
+        parameters["height"] = height
+        parameters["width"] = width
+    # A tensor response is requested unless an encoded image or a video is expected back.
+    accept = "tensor/binary"
+    if output_type == "pil" and processor is None:
+        if image_format == "jpg":
+            accept = "image/jpeg"
+        elif image_format == "png":
+            accept = "image/png"
+    elif output_type == "mp4":
+        accept = "text/plain"
+    headers = {"Content-Type": "tensor/binary", "Accept": accept}
+    # Serialize a contiguous tensor, as `prepare_encode` does; `shape` and `dtype` above are unaffected.
+    tensor_data = safetensors.torch._tobytes(tensor.contiguous(), "tensor")
+    return {"data": tensor_data, "params": parameters, "headers": headers}
+def remote_decode(
+    endpoint: str,
+    tensor: "torch.Tensor",
+    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
+    do_scaling: bool = True,
+    scaling_factor: Optional[float] = None,
+    shift_factor: Optional[float] = None,
+    output_type: Literal["mp4", "pil", "pt"] = "pil",
+    return_type: Literal["mp4", "pil", "pt"] = "pil",
+    image_format: Literal["png", "jpg"] = "jpg",
+    partial_postprocess: bool = False,
+    input_tensor_type: Literal["binary"] = "binary",
+    output_tensor_type: Literal["binary"] = "binary",
+    height: Optional[int] = None,
+    width: Optional[int] = None,
+) -> Union[Image.Image, List[Image.Image], bytes, "torch.Tensor"]:
+    """
+    Hugging Face Hybrid Inference that allows running VAE decode remotely.
+    Args:
+        endpoint (`str`):
+            Endpoint for Remote Decode.
+        tensor (`torch.Tensor`):
+            Tensor to be decoded.
+        processor (`VaeImageProcessor` or `VideoProcessor`, *optional*):
+            Used with `output_type="pt"` to build the `return_type="pil"` result when `partial_postprocess=False`,
+            and as a flag with `output_type="pil"` for Video models.
+        do_scaling (`bool`, default `True`, *optional*):
+            **DEPRECATED**. **pass `scaling_factor`/`shift_factor` instead.** **still set
+            do_scaling=None/do_scaling=False for no scaling until option is removed** When `True` scaling e.g. `latents
+            / self.vae.config.scaling_factor` is applied remotely. If `False`, input must be passed with scaling
+            applied.
+        scaling_factor (`float`, *optional*):
+            Scaling is applied when passed e.g. [`latents /
+            self.vae.config.scaling_factor`](https://github.com/huggingface/diffusers/blob/7007febae5cff000d4df9059d9cf35133e8b2ca9/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L1083C37-L1083C77).
+            - SD v1: 0.18215
+            - SD XL: 0.13025
+            - Flux: 0.3611
+            If `None`, input must be passed with scaling applied.
+        shift_factor (`float`, *optional*):
+            Shift is applied when passed e.g. `latents + self.vae.config.shift_factor`.
+            - Flux: 0.1159
+            If `None`, input must be passed with scaling applied.
+        output_type (`"mp4"` or `"pil"` or `"pt"`, default `"pil"`):
+            **Endpoint** output type. Subject to change. Report feedback on preferred type.
+            `"mp4"`: Supported by video models. Endpoint returns `bytes` of video. `"pil"`: Supported by image and
+            video models.
+                Image models: Endpoint returns `bytes` of an image in `image_format`. Video models: Endpoint returns
+                `torch.Tensor` with partial `postprocessing` applied.
+                    Requires `processor` as a flag (any non-`None` value will work).
+            `"pt"`: Supported by image and video models. Endpoint returns `torch.Tensor`.
+                With `partial_postprocess=True` the tensor is postprocessed `uint8` image tensor.
+            Recommendations:
+                `"pt"` with `partial_postprocess=True` is the smallest transfer for full quality. `"pt"` with
+                `partial_postprocess=False` is the most compatible with third party code. `"pil"` with
+                `image_format="jpg"` is the smallest transfer overall.
+        return_type (`"mp4"` or `"pil"` or `"pt"`, default `"pil"`):
+            **Function** return type.
+            `"mp4"`: Function returns `bytes` of video. `"pil"`: Function returns `PIL.Image.Image`.
+                With `output_type="pil"` no further processing is applied. With `output_type="pt"` a
+                `PIL.Image.Image` is created.
+                    `partial_postprocess=False` `processor` is required. `partial_postprocess=True` `processor` is
+                    **not** required.
+            `"pt"`: Function returns `torch.Tensor`.
+                `processor` is **not** required. `partial_postprocess=False` tensor is `float16` or `bfloat16`, without
+                denormalization. `partial_postprocess=True` tensor is `uint8`, denormalized.
+        image_format (`"png"` or `"jpg"`, default `"jpg"`):
+            Used with `output_type="pil"`. Endpoint returns `jpg` or `png`.
+        partial_postprocess (`bool`, default `False`):
+            Used with `output_type="pt"`. `partial_postprocess=False` tensor is `float16` or `bfloat16`, without
+            denormalization. `partial_postprocess=True` tensor is `uint8`, denormalized.
+        input_tensor_type (`"binary"`, default `"binary"`):
+            Tensor transfer type.
+        output_tensor_type (`"binary"`, default `"binary"`):
+            Tensor transfer type.
+        height (`int`, *optional*):
+            Required for `"packed"` latents.
+        width (`int`, *optional*):
+            Required for `"packed"` latents.
+    Returns:
+        output (`Image.Image` or `List[Image.Image]` or `bytes` or `torch.Tensor`).
+    Raises:
+        ValueError: If `check_inputs_decode` rejects the argument combination.
+        RuntimeError: If the endpoint response is not successful; the message carries the decoded JSON error body,
+            or the raw response text when the body is not valid JSON.
+    """
+    # The former `"base64"` transfer type is still accepted, but mapped to `"binary"` with a deprecation notice.
+    if input_tensor_type == "base64":
+        deprecate(
+            "input_tensor_type='base64'",
+            "1.0.0",
+            "input_tensor_type='base64' is deprecated. Using `binary`.",
+            standard_warn=False,
+        )
+        input_tensor_type = "binary"
+    if output_tensor_type == "base64":
+        deprecate(
+            "output_tensor_type='base64'",
+            "1.0.0",
+            "output_tensor_type='base64' is deprecated. Using `binary`.",
+            standard_warn=False,
+        )
+        output_tensor_type = "binary"
+    check_inputs_decode(
+        endpoint,
+        tensor,
+        processor,
+        do_scaling,
+        scaling_factor,
+        shift_factor,
+        output_type,
+        return_type,
+        image_format,
+        partial_postprocess,
+        input_tensor_type,
+        output_tensor_type,
+        height,
+        width,
+    )
+    kwargs = prepare_decode(
+        tensor=tensor,
+        processor=processor,
+        do_scaling=do_scaling,
+        scaling_factor=scaling_factor,
+        shift_factor=shift_factor,
+        output_type=output_type,
+        image_format=image_format,
+        partial_postprocess=partial_postprocess,
+        height=height,
+        width=width,
+    )
+    response = requests.post(endpoint, **kwargs)
+    if not response.ok:
+        # An error body is not guaranteed to be JSON; fall back to the raw text so the caller still receives a
+        # RuntimeError describing the failure rather than a JSON decoding error.
+        try:
+            error_detail = response.json()
+        except ValueError:
+            error_detail = response.text
+        raise RuntimeError(error_detail)
+    output = postprocess_decode(
+        response=response,
+        processor=processor,
+        output_type=output_type,
+        return_type=return_type,
+        partial_postprocess=partial_postprocess,
+    )
+    return output
+def check_inputs_encode(
+    endpoint: str,
+    image: Union["torch.Tensor", Image.Image],
+    scaling_factor: Optional[float] = None,
+    shift_factor: Optional[float] = None,
+):
+    """
+    Validation hook called by `remote_encode` before the request is built.
+    It currently performs no checks: every argument is accepted as given and `None` is returned.
+    """
+    return None
+def postprocess_encode(
+    response: requests.Response,
+):
+    """
+    Rebuild the tensor carried by the endpoint `response` of a remote encode request.
+    The `shape` header (a JSON list) and the `dtype` header (a key of `DTYPE_MAP`) describe the raw bytes in
+    `response.content`.
+    Returns:
+        `torch.Tensor` with the announced dtype and shape.
+    """
+    payload = response.content
+    headers = response.headers
+    tensor_shape = json.loads(headers["shape"])
+    tensor_dtype = DTYPE_MAP[headers["dtype"]]
+    # The bytes are copied into a `bytearray` before being viewed as a tensor.
+    return torch.frombuffer(bytearray(payload), dtype=tensor_dtype).reshape(tensor_shape)
+def prepare_encode(
+    image: Union["torch.Tensor", Image.Image],
+    scaling_factor: Optional[float] = None,
+    shift_factor: Optional[float] = None,
+):
+    """
+    Build the keyword arguments (`data`, `params`, `headers`) for the `requests.post` call of a remote encode.
+    A `torch.Tensor` is made contiguous and serialized, and its shape and dtype name are added to the parameters.
+    Any other `image` is saved as PNG bytes through its `save` method. `scaling_factor` and `shift_factor` are
+    sent only when they are not `None`. The returned headers are empty.
+    """
+    parameters = {
+        name: value
+        for name, value in (("scaling_factor", scaling_factor), ("shift_factor", shift_factor))
+        if value is not None
+    }
+    if not isinstance(image, torch.Tensor):
+        with io.BytesIO() as png_buffer:
+            image.save(png_buffer, format="PNG")
+            data = png_buffer.getvalue()
+    else:
+        data = safetensors.torch._tobytes(image.contiguous(), "tensor")
+        parameters["shape"] = list(image.shape)
+        parameters["dtype"] = str(image.dtype).split(".")[-1]
+    return {"data": data, "params": parameters, "headers": {}}
+def remote_encode(
+    endpoint: str,
+    image: Union["torch.Tensor", Image.Image],
+    scaling_factor: Optional[float] = None,
+    shift_factor: Optional[float] = None,
+) -> "torch.Tensor":
+    """
+    Hugging Face Hybrid Inference that allows running VAE encode remotely.
+    Args:
+        endpoint (`str`):
+            Endpoint for Remote Encode.
+        image (`torch.Tensor` or `PIL.Image.Image`):
+            Image to be encoded.
+        scaling_factor (`float`, *optional*):
+            Scaling is applied when passed e.g. [`latents * self.vae.config.scaling_factor`].
+            - SD v1: 0.18215
+            - SD XL: 0.13025
+            - Flux: 0.3611
+            If `None`, no `scaling_factor` parameter is sent to the endpoint.
+        shift_factor (`float`, *optional*):
+            Shift is applied when passed e.g. `latents - self.vae.config.shift_factor`.
+            - Flux: 0.1159
+            If `None`, no `shift_factor` parameter is sent to the endpoint.
+    Returns:
+        output (`torch.Tensor`).
+    Raises:
+        RuntimeError: If the endpoint response is not successful; the message carries the decoded JSON error body,
+            or the raw response text when the body is not valid JSON.
+    """
+    check_inputs_encode(
+        endpoint,
+        image,
+        scaling_factor,
+        shift_factor,
+    )
+    kwargs = prepare_encode(
+        image=image,
+        scaling_factor=scaling_factor,
+        shift_factor=shift_factor,
+    )
+    response = requests.post(endpoint, **kwargs)
+    if not response.ok:
+        # An error body is not guaranteed to be JSON; fall back to the raw text so the caller still receives a
+        # RuntimeError describing the failure rather than a JSON decoding error.
+        try:
+            error_detail = response.json()
+        except ValueError:
+            error_detail = response.text
+        raise RuntimeError(error_detail)
+    output = postprocess_encode(
+        response=response,
+    )
+    return output

diffusers/utils/source_code_parsing_utils.py ADDED Viewed

@@ -0,0 +1,52 @@
+import ast
+import importlib
+import inspect
+import textwrap
+class ReturnNameVisitor(ast.NodeVisitor):
+    """
+    AST visitor that collects a textual name for every value found in the visited `return` statements.
+    Thanks to ChatGPT for pairing.
+    """
+    def __init__(self):
+        # One entry per returned value, appended in visiting order.
+        self.return_names = []
+    @staticmethod
+    def _label(expression):
+        """Return the identifier of a plain name, otherwise the unparsed source (or `str()` as a fallback)."""
+        if isinstance(expression, ast.Name):
+            return expression.id
+        try:
+            return ast.unparse(expression)
+        except Exception:
+            # Best-effort fallback for anything `ast.unparse` cannot render.
+            return str(expression)
+    def visit_Return(self, node):
+        """Record one name per element of a returned tuple, or a single name for any other return value."""
+        returned = node.value
+        expressions = returned.elts if isinstance(returned, ast.Tuple) else [returned]
+        self.return_names.extend(self._label(expression) for expression in expressions)
+        self.generic_visit(node)
+    def _determine_parent_module(self, cls):
+        """Map `cls` to `"pipelines"` or `"models"` according to its diffusers base class."""
+        from diffusers import DiffusionPipeline
+        from diffusers.models.modeling_utils import ModelMixin
+        for base_class, module_name in ((DiffusionPipeline, "pipelines"), (ModelMixin, "models")):
+            if issubclass(cls, base_class):
+                return module_name
+        raise NotImplementedError
+    def get_ast_tree(self, cls, attribute_name="encode_prompt"):
+        """Parse the dedented source of `cls.<attribute_name>`, looked up through the parent diffusers module."""
+        parent_module = importlib.import_module(f"diffusers.{self._determine_parent_module(cls)}")
+        target = getattr(getattr(parent_module, cls.__name__), attribute_name)
+        return ast.parse(textwrap.dedent(inspect.getsource(target)))

diffusers/utils/state_dict_utils.py CHANGED Viewed

@@ -17,9 +17,14 @@ State dict utilities: utility methods for converting state dicts easily
 import enum
+from .import_utils import is_torch_available
 from .logging import get_logger
+if is_torch_available():
+    import torch
 logger = get_logger(__name__)
@@ -329,7 +334,16 @@ def convert_state_dict_to_kohya(state_dict, original_type=None, **kwargs):
         kohya_key = kohya_key.replace(peft_adapter_name, "")  # Kohya doesn't take names
         kohya_ss_state_dict[kohya_key] = weight
         if "lora_down" in kohya_key:
-            alpha_key = f'{kohya_key.split(".")[0]}.alpha'
+            alpha_key = f"{kohya_key.split('.')[0]}.alpha"
             kohya_ss_state_dict[alpha_key] = torch.tensor(len(weight))
     return kohya_ss_state_dict
+def state_dict_all_zero(state_dict, filter_str=None):
+    """
+    Tell whether every selected tensor in `state_dict` contains only zeros.
+    Args:
+        state_dict (`dict`):
+            Mapping from parameter name to tensor.
+        filter_str (`str` or iterable of `str`, *optional*):
+            When given, only entries whose key contains at least one of these substrings are checked.
+    Returns:
+        `bool`: `True` if all checked tensors are entirely zero, which includes the case where nothing is selected.
+    """
+    if filter_str is None:
+        selected = state_dict.values()
+    else:
+        needles = [filter_str] if isinstance(filter_str, str) else filter_str
+        selected = [value for name, value in state_dict.items() if any(needle in name for needle in needles)]
+    for param in selected:
+        if not torch.all(param == 0).item():
+            return False
+    return True

diffusers 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

diffusers 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl