PyPI - coreml-diffusion - Versions diffs - 0.1.0__py3-none-any.whl - Mend

coreml-diffusion 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

coreml_diffusion/__init__.py +108 -0
coreml_diffusion/attention.py +5 -0
coreml_diffusion/cli.py +114 -0
coreml_diffusion/conversion/__init__.py +9 -0
coreml_diffusion/conversion/attention.py +245 -0
coreml_diffusion/conversion/shapes.py +20 -0
coreml_diffusion/conversion/trace.py +61 -0
coreml_diffusion/conversion/unet.py +54 -0
coreml_diffusion/convert.py +348 -0
coreml_diffusion/logger.py +5 -0
coreml_diffusion/model_version.py +8 -0
coreml_diffusion/naming.py +73 -0
coreml_diffusion-0.1.0.dist-info/METADATA +98 -0
coreml_diffusion-0.1.0.dist-info/RECORD +17 -0
coreml_diffusion-0.1.0.dist-info/WHEEL +4 -0
coreml_diffusion-0.1.0.dist-info/entry_points.txt +2 -0
coreml_diffusion-0.1.0.dist-info/licenses/LICENSE +21 -0

coreml_diffusion/__init__.py ADDED Viewed

@@ -0,0 +1,108 @@
+"""coreml_diffusion — framework-free Core ML diffusion conversion.
+Converts diffusion-model checkpoints (SD1.5/SDXL today) into Core ML
+``.mlpackage`` artifacts for Apple Neural Engine, with no ComfyUI dependency.
+Usable as a library, via the ``coreml-diffusion`` CLI, or embedded in on-device
+(iOS) tooling. The public surface is the discovery API below plus ``convert``,
+``compose_out_name`` and ``ModelVersion``.
+This package MUST stay free of ``comfy`` / ``folder_paths`` / ``comfy_extras``;
+``import coreml_diffusion`` works in a comfy-free environment.
+Discovery contract (consumed by ComfyUI-CoreMLSuite): the node populates its
+dropdowns by calling ``list_*`` here, so installing a newer ``coreml_diffusion``
+surfaces new conversion types in the old node with no Suite change and no Suite
+version bump. The identifiers returned here are an ADDITIVE-ONLY contract:
+- adding an identifier, or promoting EXPERIMENTAL -> VERIFIED  => minor bump
+- removing/renaming an identifier, or demoting VERIFIED        => MAJOR bump + note
+because a saved workflow JSON references these strings verbatim.
+"""
+from enum import Enum
+from coreml_diffusion.attention import ATTENTION_IMPLEMENTATIONS
+from coreml_diffusion.model_version import ModelVersion
+from coreml_diffusion.naming import (
+    QUANT_NBITS_VALUES,
+    compose_out_name,
+    lora_names_from_params,
+)
+__all__ = [
+    "ModelVersion",
+    "Status",
+    "list_model_versions",
+    "list_attention_impls",
+    "list_quant_modes",
+    "CONTRACT_VERSION",
+    "compose_out_name",
+    "lora_names_from_params",
+    "convert",
+]
+class Status(Enum):
+    """Verification status attached to each convertible model version."""
+    VERIFIED = "verified"  # has a golden anchor + passing [M2-ANE] check
+    EXPERIMENTAL = "experimental"  # convertible, not yet anchored/verified
+# Single source of truth for which conversions the Suite may surface. The Suite
+# gates on this status, NOT on a hardcoded node list: promoting a model to
+# VERIFIED expands the node's dropdown with no Suite change.
+#
+# Keyed by the ModelVersion MEMBER so ``list_model_versions`` can emit ``.name``
+# ("SD15", "SDXL"). The node reverses the dropdown string via ``ModelVersion[...]``
+# (name lookup, nodes.py), so emitting ``.value`` ("sd15") would raise KeyError on
+# every saved workflow. See seam.md §5.
+# Insertion order here is also the order ``list_model_versions`` returns names
+# in, because that function iterates ``_MODEL_STATUS.items()`` directly.
+_MODEL_STATUS = {
+    ModelVersion.SD15: Status.VERIFIED,
+    ModelVersion.SDXL: Status.VERIFIED,
+    ModelVersion.SDXL_REFINER: Status.EXPERIMENTAL,  # -> VERIFIED after a refiner golden anchor
+    ModelVersion.LCM: Status.EXPERIMENTAL,  # -> VERIFIED after E-LCM golden anchor
+}
+def list_model_versions(include_experimental: bool = False) -> list[str]:
+    """Return model versions by ``.name`` (e.g. ``["SD15", "SDXL"]``).
+    Only VERIFIED versions are listed by default, which is what the converter
+    node gets when it calls this with no arguments. Passing
+    ``include_experimental=True`` (the power-user/CLI path) also lists versions
+    that convert but have no golden anchor yet.
+    """
+    accepted = {Status.VERIFIED}
+    if include_experimental:
+        accepted.add(Status.EXPERIMENTAL)
+    names = []
+    for version, status in _MODEL_STATUS.items():
+        if status in accepted:
+            names.append(version.name)
+    return names
+def list_attention_impls() -> list[str]:
+    """Return a fresh list of the supported attention implementation names."""
+    return [*ATTENTION_IMPLEMENTATIONS]
+def list_quant_modes() -> list[str]:
+    """Return a fresh list of palettization modes ("none" = unquantized)."""
+    return [*QUANT_NBITS_VALUES]
+# Discovery-contract version. Bump per the additive-only rules in this module's
+# docstring and CONVERTER_EXTRACTION_SPEC.md "Interface contract".
+CONTRACT_VERSION = "1.0"
+def __getattr__(name):
+    """Module-level attribute hook that resolves ``convert`` on first access.
+    ``convert`` imports coremltools/diffusers, so it is imported here rather
+    than at module import time; that keeps a plain ``import coreml_diffusion``
+    free of the heavy stack. Any other unknown attribute raises
+    ``AttributeError`` as usual.
+    """
+    if name != "convert":
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    from coreml_diffusion.convert import convert as _convert
+    return _convert

coreml_diffusion/attention.py ADDED Viewed

@@ -0,0 +1,5 @@
+# Supported attention implementation identifiers. Order matters: index 0 is
+# used as the default by the CLI ``--attn-impl`` option and by
+# ``convert`` / ``convert_unet``.
+ATTENTION_IMPLEMENTATIONS = (
+    "SPLIT_EINSUM",
+    "SPLIT_EINSUM_V2",
+    "ORIGINAL",
+)

coreml_diffusion/cli.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""Command-line entry point for coreml_diffusion.
+Mirrors ``coreml_diffusion.convert`` so the package can produce a Core ML
+``.mlpackage`` with no ComfyUI involved — for headless and on-device (iOS)
+conversion workflows. The heavy import (coremltools/diffusers, pulled by
+``convert``) is deferred into the handler, so ``--help`` and argument parsing
+stay light and the arg→call mapping is testable on plain Linux.
+Example:
+    coreml-diffusion convert --ckpt model.safetensors --model-version SD15 \\
+        --out unet.mlpackage --height 512 --width 512 --attn-impl SPLIT_EINSUM
+"""
+import argparse
+import coreml_diffusion
+def _parse_lora(spec):
+    """Parse a ``PATH[:STRENGTH]`` lora spec into ``(path, float_strength)``.
+    The spec is split on the LAST ':' only. The text after it counts as the
+    strength only when it parses as a float; otherwise that ':' is taken to be
+    part of the path and the strength defaults to 1.0. This is what lets a
+    Windows drive letter survive when no strength is given
+    (``C:/loras/a.safetensors`` -> ``("C:/loras/a.safetensors", 1.0)``).
+    """
+    path, sep, strength = spec.rpartition(":")
+    if not sep:
+        return spec, 1.0
+    try:
+        return path, float(strength)
+    except ValueError:
+        # The trailing piece is not a number, so the last ':' belongs to the
+        # path (e.g. a drive letter) rather than introducing a strength.
+        return spec, 1.0
+def _convert_cmd(args):
+    """Translate parsed CLI arguments into one ``coreml_diffusion.convert`` call.
+    ``--height`` / ``--width`` are floor-divided by 8 to form the
+    ``sample_size`` pair, and an absent or empty ``--lora`` list is passed on
+    as ``None``.
+    """
+    loras = [_parse_lora(item) for item in (args.lora or [])]
+    options = {
+        "batch_size": args.batch_size,
+        "sample_size": (args.height // 8, args.width // 8),
+        "controlnet_support": args.controlnet,
+        "lora_weights": loras or None,
+        "attn_impl": args.attn_impl,
+        "config_path": args.config,
+        "quantize_nbits": args.quantize,
+    }
+    run = coreml_diffusion.convert
+    run(
+        args.ckpt,
+        coreml_diffusion.ModelVersion[args.model_version],
+        args.out,
+        **options,
+    )
+def build_parser():
+    """Build the ``coreml-diffusion`` argument parser.
+    The parser has one required sub-command, ``convert``, whose handler is
+    stored on the namespace as ``func`` (set to ``_convert_cmd``). The choices
+    for ``--model-version``, ``--attn-impl`` and ``--quantize`` come from the
+    package's ``list_*`` discovery functions rather than being hard-coded here.
+    """
+    parser = argparse.ArgumentParser(
+        prog="coreml-diffusion",
+        description="Convert diffusion checkpoints to Core ML for Apple Neural Engine.",
+    )
+    sub = parser.add_subparsers(dest="command", required=True)
+    conv = sub.add_parser("convert", help="Convert a checkpoint's UNet to a .mlpackage")
+    conv.add_argument(
+        "--ckpt", required=True, help="Path to the source .safetensors checkpoint"
+    )
+    conv.add_argument(
+        "--model-version",
+        required=True,
+        # include experimental: the CLI is the power-user path. Experimental
+        # versions (LCM, SDXL_REFINER) convert but are not golden-verified.
+        choices=coreml_diffusion.list_model_versions(include_experimental=True),
+        help="Model architecture (verified: SD15, SDXL; experimental otherwise)",
+    )
+    conv.add_argument("--out", required=True, help="Output .mlpackage path to write")
+    conv.add_argument(
+        "--height", type=int, default=512, help="Target image height (default 512)"
+    )
+    conv.add_argument(
+        "--width", type=int, default=512, help="Target image width (default 512)"
+    )
+    conv.add_argument(
+        "--batch-size", type=int, default=1, help="Batch size (default 1)"
+    )
+    conv.add_argument(
+        "--attn-impl",
+        choices=coreml_diffusion.list_attention_impls(),
+        # The default is whatever the discovery list puts first; the help text
+        # below spells that name out by hand.
+        default=coreml_diffusion.list_attention_impls()[0],
+        help="Attention implementation (default SPLIT_EINSUM)",
+    )
+    conv.add_argument(
+        "--controlnet",
+        action="store_true",
+        help="Add ControlNet residual inputs to the converted UNet",
+    )
+    conv.add_argument(
+        "--lora",
+        action="append",
+        metavar="PATH[:STRENGTH]",
+        help="LoRA to fuse before conversion; repeatable. STRENGTH defaults to 1.0",
+    )
+    conv.add_argument(
+        "--config", default=None, help="Optional original-config YAML path"
+    )
+    conv.add_argument(
+        "--quantize",
+        choices=coreml_diffusion.list_quant_modes(),
+        default="none",
+        help="K-means weight palettization bits (default none = unquantized)",
+    )
+    conv.set_defaults(func=_convert_cmd)
+    return parser
+def main(argv=None):
+    """Parse ``argv`` and run the selected sub-command's handler.
+    Returns whatever the handler returns.
+    """
+    namespace = build_parser().parse_args(argv)
+    handler = namespace.func
+    return handler(namespace)
+if __name__ == "__main__":
+    main()

coreml_diffusion/conversion/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Core ML conversion helpers.
+The conversion approach originates from Apple's ml-stable-diffusion
+(https://github.com/apple/ml-stable-diffusion). This implementation has since
+diverged: it runs natively on diffusers' UNet2DConditionModel with its own
+SPLIT_EINSUM / SPLIT_EINSUM_V2 attention processors and no longer depends on
+that package. The intent is to keep iterating on these methods independently
+while tracking current tooling.
+"""

coreml_diffusion/conversion/attention.py ADDED Viewed

@@ -0,0 +1,245 @@
+import logging
+import torch
+logger = logging.getLogger(__name__)
+CHUNK_SIZE = 512
+def apply_attention_implementation(unet, attention_implementation):
+    """Install the attention processor named by ``attention_implementation``.
+    ``"ORIGINAL"`` leaves ``unet`` untouched; ``"SPLIT_EINSUM"`` and
+    ``"SPLIT_EINSUM_V2"`` set the matching processor on it. The same ``unet``
+    object is returned. Any other name raises ``ValueError``.
+    """
+    if attention_implementation == "ORIGINAL":
+        return unet
+    if attention_implementation == "SPLIT_EINSUM":
+        processor_cls = SplitEinsumAttnProcessor
+    elif attention_implementation == "SPLIT_EINSUM_V2":
+        processor_cls = SplitEinsumV2AttnProcessor
+    else:
+        raise ValueError(
+            f"Unsupported attention implementation: {attention_implementation}"
+        )
+    unet.set_attn_processor(processor_cls())
+    return unet
+class SplitEinsumAttnProcessor:
+    """Attention processor that routes through ``split_einsum``.
+    Extra positional and keyword arguments are accepted and ignored.
+    """
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        return _attention_forward(
+            attn=attn,
+            hidden_states=hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            temb=temb,
+            attention_fn=split_einsum,
+        )
+class SplitEinsumV2AttnProcessor:
+    """Attention processor that routes through ``split_einsum_v2``.
+    Extra positional and keyword arguments are accepted and ignored.
+    """
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        return _attention_forward(
+            attn=attn,
+            hidden_states=hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            temb=temb,
+            attention_fn=split_einsum_v2,
+        )
+def _attention_forward(
+    attn,
+    hidden_states,
+    encoder_hidden_states,
+    attention_mask,
+    temb,
+    attention_fn,
+):
+    """Shared attention forward pass for the split-einsum processors.
+    Steps, in order: optional spatial norm, flatten a 4-D input to
+    (batch, height * width, channel), optional mask preparation, optional
+    group norm, q/k/v projections, ``attention_fn``, the two ``to_out``
+    modules, restore the 4-D layout, then optional residual add and rescale.
+    ``attention_fn`` is called as
+    ``attention_fn(query, key, value, attention_mask, attn.heads, dim_head)``
+    with q/k/v laid out as (batch, channels, 1, sequence).
+    """
+    residual = hidden_states
+    if attn.spatial_norm is not None:
+        hidden_states = attn.spatial_norm(hidden_states, temb)
+    input_ndim = hidden_states.ndim
+    if input_ndim == 4:
+        # Image-shaped input: flatten the spatial axes into one sequence axis.
+        batch_size, channel, height, width = hidden_states.shape
+        hidden_states = hidden_states.view(
+            batch_size, channel, height * width
+        ).transpose(1, 2)
+    else:
+        batch_size, _, channel = hidden_states.shape
+        height = None
+        width = None
+    # Key length comes from the encoder states for cross-attention and from
+    # the hidden states themselves for self-attention.
+    batch_size, key_sequence_length, _ = (
+        hidden_states.shape
+        if encoder_hidden_states is None
+        else encoder_hidden_states.shape
+    )
+    if attention_mask is not None:
+        attention_mask = attn.prepare_attention_mask(
+            attention_mask,
+            key_sequence_length,
+            batch_size,
+        )
+        attention_mask = _prepare_split_einsum_mask(
+            attention_mask,
+            batch_size,
+            attn.heads,
+            key_sequence_length,
+        )
+    if attn.group_norm is not None:
+        # group_norm is applied channel-first, so transpose in and back out.
+        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+    query = attn.to_q(hidden_states)
+    if encoder_hidden_states is None:
+        encoder_hidden_states = hidden_states
+    elif attn.norm_cross:
+        encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+    key = attn.to_k(encoder_hidden_states)
+    value = attn.to_v(encoder_hidden_states)
+    batch_size = query.shape[0]
+    dim_head = attn.inner_kv_dim // attn.heads
+    # (batch, sequence, channels) -> (batch, channels, 1, sequence)
+    query = _linear_projection_to_bchw(query)
+    key = _linear_projection_to_bchw(key)
+    value = _linear_projection_to_bchw(value)
+    hidden_states = attention_fn(
+        query,
+        key,
+        value,
+        attention_mask,
+        attn.heads,
+        dim_head,
+    )
+    # Drop the singleton axis and go back to (batch, sequence, channels).
+    hidden_states = hidden_states.squeeze(2).transpose(1, 2)
+    hidden_states = hidden_states.reshape(batch_size, -1, attn.inner_dim)
+    # NOTE(review): to_out[0] / to_out[1] are presumably the output projection
+    # and dropout, as in diffusers' Attention - confirm against that class.
+    hidden_states = attn.to_out[0](hidden_states)
+    hidden_states = attn.to_out[1](hidden_states)
+    if input_ndim == 4:
+        hidden_states = hidden_states.transpose(-1, -2).reshape(
+            batch_size,
+            channel,
+            height,
+            width,
+        )
+    if attn.residual_connection:
+        hidden_states = hidden_states + residual
+    hidden_states = hidden_states / attn.rescale_output_factor
+    return hidden_states
+def split_einsum(q, k, v, mask, heads, dim_head):
+    """Per-head attention computed with explicit einsums.
+    ``q``, ``k`` and ``v`` are sliced into ``heads`` pieces of ``dim_head``
+    channels each. Per the einsum subscripts, each head's weights have layout
+    (batch, key, 1, query); softmax runs over the key axis (dim 1). ``mask``,
+    when given, is added to every head's weights before the softmax. The
+    per-head outputs are concatenated back along the channel axis (dim 1).
+    """
+    q_heads = _split_heads(q, heads, dim_head)
+    # Move k's channel axis last so each head slice matches the "bkhc" operand.
+    k = k.transpose(1, 3)
+    k_heads = [
+        k[:, :, :, head_idx * dim_head : (head_idx + 1) * dim_head]
+        for head_idx in range(heads)
+    ]
+    v_heads = _split_heads(v, heads, dim_head)
+    # Scaled dot-product scores, one tensor per head.
+    weights = [
+        torch.einsum("bchq,bkhc->bkhq", query, key) * (dim_head**-0.5)
+        for query, key in zip(q_heads, k_heads)
+    ]
+    if mask is not None:
+        weights = [weight + mask for weight in weights]
+    weights = [weight.softmax(dim=1) for weight in weights]
+    outputs = [
+        torch.einsum("bkhq,bchk->bchq", weight, value)
+        for weight, value in zip(weights, v_heads)
+    ]
+    return torch.cat(outputs, dim=1)
+def split_einsum_v2(q, k, v, mask, heads, dim_head):
+    """Split-einsum attention with the query axis processed in chunks.
+    Each head's queries are cut into slices of at most ``CHUNK_SIZE`` positions
+    along the last axis, attention is computed per slice, and the slices are
+    concatenated back so the output covers every query position. Query
+    sequences shorter than ``CHUNK_SIZE`` fall back to ``split_einsum``.
+    Layouts match ``split_einsum``: weights are (batch, key, 1, query) with
+    softmax over the key axis, and head outputs are concatenated on dim 1.
+    """
+    query_length = q.size(3)
+    if query_length < CHUNK_SIZE:
+        logger.info(
+            "SPLIT_EINSUM_V2 query sequence is shorter than %s; using SPLIT_EINSUM.",
+            CHUNK_SIZE,
+        )
+        return split_einsum(q, k, v, mask, heads, dim_head)
+    # Ceil division: a trailing partial chunk must be attended too. Flooring
+    # here dropped the last ``query_length % CHUNK_SIZE`` query positions, so
+    # the output came back shorter than the input for non-multiple lengths.
+    num_chunks = -(-query_length // CHUNK_SIZE)
+    q_heads = _split_heads(q, heads, dim_head)
+    q_chunks = [
+        [
+            # Slicing clips at the end of the axis, so the last chunk may be
+            # shorter than CHUNK_SIZE.
+            head[..., chunk_idx * CHUNK_SIZE : (chunk_idx + 1) * CHUNK_SIZE]
+            for chunk_idx in range(num_chunks)
+        ]
+        for head in q_heads
+    ]
+    k = k.transpose(1, 3)
+    k_heads = [
+        k[:, :, :, head_idx * dim_head : (head_idx + 1) * dim_head]
+        for head_idx in range(heads)
+    ]
+    v_heads = _split_heads(v, heads, dim_head)
+    head_outputs = []
+    for query_chunks, key, value in zip(q_chunks, k_heads, v_heads):
+        chunk_outputs = []
+        for query_chunk in query_chunks:
+            weights = torch.einsum("bchq,bkhc->bkhq", query_chunk, key)
+            weights = weights * (dim_head**-0.5)
+            if mask is not None:
+                weights = weights + mask
+            weights = weights.softmax(dim=1)
+            chunk_outputs.append(torch.einsum("bkhq,bchk->bchq", weights, value))
+        # Re-join the query chunks of this head along the query axis.
+        head_outputs.append(torch.cat(chunk_outputs, dim=3))
+    return torch.cat(head_outputs, dim=1)
+def _split_heads(x, heads, dim_head):
+    """Slice dim 1 of ``x`` into ``heads`` consecutive pieces of width ``dim_head``."""
+    pieces = []
+    for head_idx in range(heads):
+        start = head_idx * dim_head
+        pieces.append(x[:, start : start + dim_head, :, :])
+    return pieces
+def _linear_projection_to_bchw(x):
+    """Swap dims 1 and 2 of ``x`` and insert a singleton axis at position 2."""
+    swapped = x.transpose(1, 2)
+    return swapped.unsqueeze(2)
+def _prepare_split_einsum_mask(mask, batch_size, heads, key_sequence_length):
+    """Reshape an attention mask so it broadcasts against split-einsum weights.
+    The split-einsum weights have layout (batch, key, 1, query). A 2-D mask is
+    first given a query axis; a mask whose leading dim is ``batch_size * heads``
+    is collapsed to one mask per batch item by keeping head 0; the resulting
+    3-D (batch, query, key) mask is then returned as (batch, key, 1, query).
+    Masks of any other rank are returned unchanged.
+    """
+    if mask.ndim == 2:
+        mask = mask[:, None, :]
+    if mask.shape[0] == batch_size * heads:
+        mask = mask.reshape(batch_size, heads, -1, key_sequence_length)
+        mask = mask[:, 0]
+    if mask.ndim == 3:
+        # (batch, query, key) -> (batch, key, 1, query). Indexing with
+        # ``[:, :, None, None]`` instead produced a 5-D (batch, query, 1, 1, key)
+        # tensor, which does not line up with the 4-D weights.
+        mask = mask.transpose(1, 2).unsqueeze(2)
+    return mask

coreml_diffusion/conversion/shapes.py ADDED Viewed

@@ -0,0 +1,20 @@
+def conv2d_output_shape(height, width, conv):
+    """Return ``(out_h, out_w)`` for a torch.nn.Conv2d-like module.
+    ``conv`` must expose ``kernel_size``, ``stride``, ``padding`` and
+    ``dilation``, each either a 2-tuple (height, width) or a single value
+    applied to both axes.
+    """
+    kernel_h, kernel_w = _pair(conv.kernel_size)
+    stride_h, stride_w = _pair(conv.stride)
+    pad_h, pad_w = _pair(conv.padding)
+    dilation_h, dilation_w = _pair(conv.dilation)
+    return (
+        _conv_output_dim(height, kernel_h, stride_h, pad_h, dilation_h),
+        _conv_output_dim(width, kernel_w, stride_w, pad_w, dilation_w),
+    )
+def _conv_output_dim(size, kernel, stride, padding, dilation):
+    """Output length along one axis, using floor division by ``stride``."""
+    effective_kernel = dilation * (kernel - 1) + 1
+    return (size + 2 * padding - effective_kernel) // stride + 1
+def _pair(value):
+    """Return ``value`` unchanged if it is a tuple, else ``(value, value)``."""
+    return value if isinstance(value, tuple) else (value, value)

coreml_diffusion/conversion/trace.py ADDED Viewed

@@ -0,0 +1,61 @@
+from types import MethodType
+from diffusers.models.transformers.transformer_2d import Transformer2DModel
+def prepare_unet_for_coreml_trace(unet):
+    """Rebind two continuous-input helpers on every ``Transformer2DModel``.
+    Each matching sub-module of ``unet`` gets this file's
+    ``_operate_on_continuous_inputs`` and ``_get_output_for_continuous_inputs``
+    bound to it as instance methods. The same ``unet`` is returned.
+    """
+    overrides = {
+        "_operate_on_continuous_inputs": _operate_on_continuous_inputs,
+        "_get_output_for_continuous_inputs": _get_output_for_continuous_inputs,
+    }
+    for module in unet.modules():
+        if not isinstance(module, Transformer2DModel):
+            continue
+        for attr_name, function in overrides.items():
+            setattr(module, attr_name, MethodType(function, module))
+    return unet
+def _operate_on_continuous_inputs(self, hidden_states):
+    """Normalize, project in and flatten to (batch, sequence, channels).
+    With ``use_linear_projection`` the input is flattened before ``proj_in``
+    and ``inner_dim`` is read from the normalized tensor's dim 1; otherwise
+    ``proj_in`` runs first and ``self.inner_dim`` is returned.
+    """
+    hidden_states = self.norm(hidden_states)
+    if self.use_linear_projection:
+        inner_dim = hidden_states.shape[1]
+        flattened = hidden_states.flatten(2).transpose(1, 2)
+        return self.proj_in(flattened), inner_dim
+    projected = self.proj_in(hidden_states)
+    inner_dim = self.inner_dim
+    return projected.flatten(2).transpose(1, 2), inner_dim
+def _get_output_for_continuous_inputs(
+    self,
+    hidden_states,
+    residual,
+    batch_size,
+    height,
+    width,
+    inner_dim,
+):
+    """Project out, restore (batch, inner_dim, height, width), add ``residual``.
+    ``proj_out`` runs before the reshape when ``use_linear_projection`` is set
+    and after it otherwise.
+    """
+    linear = self.use_linear_projection
+    if linear:
+        hidden_states = self.proj_out(hidden_states)
+    hidden_states = hidden_states.transpose(1, 2).reshape(
+        batch_size,
+        inner_dim,
+        height,
+        width,
+    )
+    if not linear:
+        hidden_states = self.proj_out(hidden_states)
+    return hidden_states + residual

coreml_diffusion/conversion/unet.py ADDED Viewed

@@ -0,0 +1,54 @@
+import torch
+class CoreMLUNetWrapper(torch.nn.Module):
+    """Adapt diffusers UNet inputs to CoreMLSuite's stable Core ML contract.
+    Positional ``extra_inputs`` are consumed in this order: ``timestep_cond``
+    (LCM only), then ``time_ids`` and ``text_embeds`` (SDXL / SDXL_REFINER
+    only), then any additional residuals, the last of which is passed as the
+    mid-block residual and the rest as the down-block residuals.
+    """
+    def __init__(self, unet, model_version):
+        super().__init__()
+        self.unet = unet
+        self.model_version = model_version
+    def forward(self, sample, timestep, encoder_hidden_states, *extra_inputs):
+        """Call the wrapped UNet and return the first element of its output."""
+        pending = list(extra_inputs)
+        timestep_cond = pending.pop(0) if self._is_lcm else None
+        added_cond_kwargs = None
+        if self._is_sdxl:
+            time_ids = pending.pop(0)
+            text_embeds = pending.pop(0)
+            added_cond_kwargs = {
+                "time_ids": time_ids,
+                "text_embeds": text_embeds,
+            }
+        down_residuals = None
+        mid_residual = None
+        if pending:
+            # Whatever is left is the residual list; the final entry is the
+            # mid-block residual.
+            *down, mid_residual = pending
+            down_residuals = tuple(down)
+        outputs = self.unet(
+            sample,
+            timestep,
+            encoder_hidden_states=encoder_hidden_states,
+            timestep_cond=timestep_cond,
+            added_cond_kwargs=added_cond_kwargs,
+            down_block_additional_residuals=down_residuals,
+            mid_block_additional_residual=mid_residual,
+            return_dict=False,
+        )
+        return outputs[0]
+    @property
+    def _is_lcm(self):
+        return self.model_version.name == "LCM"
+    @property
+    def _is_sdxl(self):
+        return self.model_version.name in ("SDXL", "SDXL_REFINER")

coreml_diffusion/convert.py ADDED Viewed

@@ -0,0 +1,348 @@
+"""Core ML UNet conversion mechanics — framework-free.
+Moved from ``coreml_suite/converter.py`` in extraction phase E2. This module
+produces a ``.mlpackage`` on disk and stops there: it must NOT import ``comfy``,
+``folder_paths``, or ``comfy_extras``. Output paths are inputs, not resolved here.
+``get_sample_input`` carries an optional ``scheduler`` so the LCM path (which
+derives the trace timestep from an LCM scheduler) shares this single
+implementation instead of keeping a near-duplicate copy.
+"""
+import gc
+import os
+import time
+import coremltools as ct
+import numpy as np
+import torch
+from diffusers import UNet2DConditionModel
+from coreml_diffusion.attention import ATTENTION_IMPLEMENTATIONS
+from coreml_diffusion.conversion.attention import apply_attention_implementation
+from coreml_diffusion.conversion.shapes import conv2d_output_shape
+from coreml_diffusion.conversion.trace import prepare_unet_for_coreml_trace
+from coreml_diffusion.conversion.unet import CoreMLUNetWrapper
+from coreml_diffusion.logger import logger
+from coreml_diffusion.model_version import ModelVersion
+DEFAULT_TRACE_TIMESTEP = 999.0
+TEXT_TOKEN_SEQUENCE_LENGTH = 77
+def get_unet(model_version: ModelVersion, ref_unet, attention_implementation):
+    """Prepare ``ref_unet`` for tracing and wrap it in ``CoreMLUNetWrapper``.
+    The UNet is patched for tracing, switched to eval mode, given the requested
+    attention implementation, and then wrapped together with ``model_version``.
+    """
+    prepared = prepare_unet_for_coreml_trace(ref_unet)
+    prepared = prepared.eval()
+    with_attention = apply_attention_implementation(
+        prepared,
+        attention_implementation,
+    )
+    return CoreMLUNetWrapper(with_attention, model_version)
+def get_encoder_hidden_states_shape(ref_unet, batch_size):
+    """Return ``(batch_size, TEXT_TOKEN_SEQUENCE_LENGTH, cross_attention_dim)``."""
+    cross_attention_dim = ref_unet.config.cross_attention_dim
+    return (batch_size, TEXT_TOKEN_SEQUENCE_LENGTH, cross_attention_dim)
+def get_coreml_inputs(sample_inputs):
+    """Describe each sample input as a ``ct.TensorType``.
+    Every tensor is converted to a float16 numpy array first, so each returned
+    spec carries the input's name, its shape, and the float16 dtype.
+    """
+    as_float16 = {}
+    for name, tensor in sample_inputs.items():
+        as_float16[name] = tensor.numpy().astype(np.float16)
+    specs = []
+    for name, array in as_float16.items():
+        # ``array`` is always a numpy array here, so its own dtype is used.
+        specs.append(ct.TensorType(name=name, shape=array.shape, dtype=array.dtype))
+    return specs
+def load_coreml_model(out_path):
+    """Load the Core ML model at ``out_path``, logging how long it took."""
+    logger.info(f"Loading model from {out_path}")
+    started_at = time.time()
+    model = ct.models.MLModel(out_path)
+    elapsed = time.time() - started_at
+    logger.info(f"Loading {out_path} took {elapsed:.1f} seconds")
+    return model
+def convert_to_coreml(
+    submodule_name, torchscript_module, sample_inputs, output_names, out_path
+):
+    """Convert a traced module to a Core ML ``mlprogram``, or load an existing one.
+    If ``out_path`` already exists the model there is loaded and returned
+    without converting. Otherwise the module is converted with float32 outputs
+    named by ``output_names``; nothing is written to ``out_path`` here.
+    """
+    if os.path.exists(out_path):
+        logger.info(f"Skipping export because {out_path} already exists")
+        return load_coreml_model(out_path)
+    logger.info(f"Converting {submodule_name} to CoreML..")
+    output_types = [
+        ct.TensorType(name=name, dtype=np.float32) for name in output_names
+    ]
+    coreml_model = ct.convert(
+        torchscript_module,
+        convert_to="mlprogram",
+        minimum_deployment_target=ct.target.macOS13,
+        inputs=sample_inputs,
+        outputs=output_types,
+        skip_model_load=True,
+    )
+    del torchscript_module
+    gc.collect()
+    return coreml_model
+def get_sample_input(
+    batch_size, encoder_hidden_states_shape, sample_shape, scheduler=None
+):
+    """Build the example inputs used to JIT-trace the UNet.
+    The trace timestep is ``scheduler.timesteps[0]`` when a ``scheduler`` is
+    given and ``DEFAULT_TRACE_TIMESTEP`` otherwise, repeated ``batch_size``
+    times as float32. ``sample`` and ``encoder_hidden_states`` are random
+    placeholders of the requested shapes. Keys are returned in the order
+    ``sample``, ``timestep``, ``encoder_hidden_states``.
+    """
+    if scheduler is None:
+        timestep_value = DEFAULT_TRACE_TIMESTEP
+    else:
+        timestep_value = scheduler.timesteps[0].item()
+    timesteps = torch.tensor([timestep_value] * batch_size).to(torch.float32)
+    return {
+        "sample": torch.rand(*sample_shape),
+        "timestep": timesteps,
+        "encoder_hidden_states": torch.rand(*encoder_hidden_states_shape),
+    }
+def lcm_inputs(sample_unet_inputs):
+    """Return a random float32 ``timestep_cond`` of shape (batch, 256).
+    The batch size is read from dim 0 of ``sample_unet_inputs["sample"]``.
+    """
+    rows = sample_unet_inputs["sample"].shape[0]
+    placeholder = torch.randn(rows, 256)
+    return {"timestep_cond": placeholder.to(torch.float32)}
+def sdxl_inputs(sample_unet_inputs, ref_unet, model_version):
+    """Build the extra SDXL trace inputs ``time_ids`` and ``text_embeds``.
+    The image size is the sample's dims 2 and 3 multiplied by 8. ``time_ids``
+    is (original size, crop top-left (0, 0), then the target size or, for the
+    refiner, an aesthetic score of 6.0), repeated per batch item as int64.
+    ``text_embeds`` is a random float32 tensor whose width comes from
+    ``get_sdxl_text_embeds_dim``.
+    """
+    latent_shape = sample_unet_inputs["sample"].shape
+    batch_size = latent_shape[0]
+    image_size = (latent_shape[2] * 8, latent_shape[3] * 8)
+    if model_version == ModelVersion.SDXL_REFINER:
+        trailing = (6.0,)
+    else:
+        trailing = image_size
+    time_ids_list = list(image_size + (0, 0) + trailing)
+    time_ids = torch.tensor(time_ids_list).repeat(batch_size, 1).to(torch.int64)
+    embeds_width = get_sdxl_text_embeds_dim(ref_unet, len(time_ids_list))
+    text_embeds = torch.randn(batch_size, embeds_width).to(torch.float32)
+    return {
+        "time_ids": time_ids,
+        "text_embeds": text_embeds,
+    }
+def get_sdxl_text_embeds_dim(ref_unet, time_ids_dim):
+    """Return the projection input width left over after the time-id embeddings.
+    Computed as ``projection_class_embeddings_input_dim`` minus
+    ``time_ids_dim * addition_time_embed_dim``, both read from
+    ``ref_unet.config``.
+    """
+    config = ref_unet.config
+    projection_dim = config.projection_class_embeddings_input_dim
+    time_ids_width = time_ids_dim * config.addition_time_embed_dim
+    return projection_dim - time_ids_width
+def get_inputs_spec(inputs):
+    """Map each input name to its ``(shape, dtype)`` pair."""
+    spec = {}
+    for name, tensor in inputs.items():
+        spec[name] = (tensor.shape, tensor.dtype)
+    return spec
+def add_cnet_support(sample_shape, reference_unet):
+    """Build placeholder ControlNet residual inputs for tracing.
+    Walks ``conv_in``, every down block (one entry per resnet, plus one entry
+    per block that has downsamplers) and finally the mid block, recording a
+    (batch, channels, height, width) shape for each. Returns a dict of random
+    tensors keyed ``additional_residual_<i>`` in that walk order.
+    """
+    additional_residuals_shapes = []
+    batch_size = sample_shape[0]
+    h, w = sample_shape[2:]
+    # conv_in
+    out_h, out_w = conv2d_output_shape(
+        h,
+        w,
+        reference_unet.conv_in,
+    )
+    additional_residuals_shapes.append(
+        (batch_size, reference_unet.conv_in.out_channels, out_h, out_w)
+    )
+    # down_blocks
+    for down_block in reference_unet.down_blocks:
+        # Resnets keep the current spatial size; only downsamplers change it.
+        additional_residuals_shapes += [
+            (batch_size, resnet.out_channels, out_h, out_w)
+            for resnet in down_block.resnets
+        ]
+        if hasattr(down_block, "downsamplers") and down_block.downsamplers is not None:
+            for downsampler in down_block.downsamplers:
+                out_h, out_w = conv2d_output_shape(out_h, out_w, downsampler.conv)
+            # One entry per block, using the size after all of its downsamplers
+            # and the channel count of the last one.
+            additional_residuals_shapes.append(
+                (
+                    batch_size,
+                    down_block.downsamplers[-1].conv.out_channels,
+                    out_h,
+                    out_w,
+                )
+            )
+    # mid_block
+    additional_residuals_shapes.append(
+        (batch_size, reference_unet.mid_block.resnets[-1].out_channels, out_h, out_w)
+    )
+    additional_inputs = {}
+    for i, shape in enumerate(additional_residuals_shapes):
+        sample_residual_input = torch.rand(*shape)
+        additional_inputs[f"additional_residual_{i}"] = sample_residual_input
+    return additional_inputs
+def convert_unet(
+    ref_unet,
+    model_version: ModelVersion,
+    unet_out_path: str,
+    batch_size: int = 1,
+    sample_size: tuple[int, int] = (64, 64),
+    controlnet_support: bool = False,
+    attention_implementation: str = ATTENTION_IMPLEMENTATIONS[0],
+    quantize_nbits: str = "none",
+):
+    """Trace ``ref_unet``, convert it to Core ML and save it to ``unet_out_path``.
+    Sample inputs are assembled in a fixed order (base inputs, then LCM, SDXL
+    and ControlNet extras as applicable); that order is the positional input
+    order used for tracing. When ``quantize_nbits`` is not ``"none"`` the
+    converted model's weights are k-means palettized to ``int(quantize_nbits)``
+    bits before saving. Returns None.
+    """
+    coreml_unet = get_unet(model_version, ref_unet, attention_implementation)
+    sample_shape = (
+        batch_size,  # B
+        ref_unet.config.in_channels,  # C
+        sample_size[0],  # H
+        sample_size[1],  # W
+    )
+    encoder_hidden_states_shape = get_encoder_hidden_states_shape(ref_unet, batch_size)
+    sample_inputs = get_sample_input(
+        batch_size, encoder_hidden_states_shape, sample_shape
+    )
+    if model_version == ModelVersion.LCM:
+        sample_inputs |= lcm_inputs(sample_inputs)
+    if model_version in {ModelVersion.SDXL, ModelVersion.SDXL_REFINER}:
+        sample_inputs |= sdxl_inputs(sample_inputs, ref_unet, model_version)
+    if controlnet_support:
+        sample_inputs |= add_cnet_support(sample_shape, ref_unet)
+    sample_inputs_spec = get_inputs_spec(sample_inputs)
+    logger.info(f"Sample UNet inputs spec: {sample_inputs_spec}")
+    logger.info("JIT tracing..")
+    # The dict's insertion order becomes the positional order of the inputs.
+    traced_unet = torch.jit.trace(
+        coreml_unet, example_inputs=list(sample_inputs.values())
+    )
+    logger.info("Done.")
+    coreml_sample_inputs = get_coreml_inputs(sample_inputs)
+    # From here on ``coreml_unet`` names the converted Core ML model, not the
+    # torch wrapper it was bound to above.
+    coreml_unet = convert_to_coreml(
+        "unet", traced_unet, coreml_sample_inputs, ["noise_pred"], unet_out_path
+    )
+    del traced_unet
+    gc.collect()
+    if quantize_nbits != "none":
+        # Opt-in k-means weight palettization. The default path
+        # (quantize_nbits="none") leaves the traced UNet untouched.
+        from coremltools.optimize.coreml import (
+            OpPalettizerConfig,
+            OptimizationConfig,
+            palettize_weights,
+        )
+        nbits = int(quantize_nbits)
+        logger.info(f"Palettizing UNet weights to {nbits}-bit (kmeans)..")
+        t0 = time.time()
+        cfg = OptimizationConfig(
+            global_config=OpPalettizerConfig(mode="kmeans", nbits=nbits)
+        )
+        coreml_unet = palettize_weights(coreml_unet, config=cfg)
+        logger.info(f"Palettization took {time.time() - t0:.1f}s")
+    coreml_unet.save(unet_out_path)
+    logger.info(f"Saved unet into {unet_out_path}")
+def convert(
+    ckpt_path: str,
+    model_version: ModelVersion,
+    out_path: str,
+    *,
+    batch_size: int = 1,
+    sample_size: tuple[int, int] = (64, 64),
+    controlnet_support: bool = False,
+    lora_weights: list[tuple[str | os.PathLike, float]] | None = None,
+    attn_impl: str = ATTENTION_IMPLEMENTATIONS[0],
+    config_path: str | None = None,
+    quantize_nbits: str = "none",
+):
+    """Convert a single-file checkpoint's UNet to a Core ML ``.mlpackage``.
+    Keyword-only past the three required positionals so the package can add
+    capabilities (new keyword args) without breaking an older caller — the
+    versioned interface contract. Writes ``out_path``; returns None.
+    If ``out_path`` already exists the call logs that and returns without
+    validating or converting anything. Each ``(path, strength)`` pair in
+    ``lora_weights`` is loaded, activated at that strength and fused into the
+    UNet, in list order, before conversion.
+    Raises:
+        ValueError: ``attn_impl`` is not in ``ATTENTION_IMPLEMENTATIONS``, or
+            ``quantize_nbits`` is neither ``"none"`` nor parseable as an int.
+    """
+    if os.path.exists(out_path):
+        logger.info(f"Found existing model at {out_path}! Skipping..")
+        return
+    if attn_impl not in ATTENTION_IMPLEMENTATIONS:
+        raise ValueError(
+            f"Unsupported attention implementation {attn_impl!r}. "
+            f"Expected one of {ATTENTION_IMPLEMENTATIONS}."
+        )
+    if quantize_nbits != "none":
+        # Fail fast: otherwise a malformed value only surfaces at int() inside
+        # convert_unet, after the checkpoint load, trace and conversion.
+        try:
+            int(quantize_nbits)
+        except ValueError as err:
+            raise ValueError(
+                f"Unsupported quantize_nbits {quantize_nbits!r}. "
+                'Expected "none" or a bit width such as "8", "6" or "4".'
+            ) from err
+    ref_unet = load_unet(ckpt_path, config_path)
+    for i, (lora_path, strength) in enumerate(lora_weights or []):
+        adapter_name = f"lora_{i}"
+        ref_unet.load_lora_adapter(lora_path, adapter_name=adapter_name)
+        ref_unet.set_adapters([adapter_name], weights=[strength])
+        ref_unet.fuse_lora()
+    convert_unet(
+        ref_unet,
+        model_version,
+        out_path,
+        batch_size,
+        sample_size,
+        controlnet_support,
+        attention_implementation=attn_impl,
+        quantize_nbits=quantize_nbits,
+    )
+def load_unet(ckpt_path, config_path):
+    """Load a ``UNet2DConditionModel`` from a single-file checkpoint.
+    ``config_path`` is forwarded as ``original_config``.
+    """
+    loader = UNet2DConditionModel.from_single_file
+    return loader(ckpt_path, original_config=config_path)

coreml_diffusion/logger.py ADDED Viewed

@@ -0,0 +1,5 @@
+import logging
+# NOTE(review): basicConfig() runs at import time, so importing this module
+# may attach a default handler to the host application's root logger (the
+# stdlib call is a no-op when the root logger already has handlers). Confirm
+# this is intended for library use and not only for the CLI.
+logging.basicConfig()
+# Package-wide logger object; it is named after this module (``__name__``)
+# and has INFO enabled on it explicitly.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)

coreml_diffusion/model_version.py ADDED Viewed

@@ -0,0 +1,8 @@
+from enum import Enum
+class ModelVersion(Enum):
+    """Model families that can be selected for conversion.
+    Each member's value is its lowercase string identifier.
+    """
+    SD15 = "sd15"
+    SDXL = "sdxl"
+    # Per the package README, the refiner and LCM variants convert but are
+    # not yet golden-verified (experimental).
+    SDXL_REFINER = "sdxl_refiner"
+    LCM = "lcm"

coreml_diffusion/naming.py ADDED Viewed

@@ -0,0 +1,73 @@
+"""Pure out_name composition for the Core ML UNet artifact.
+Extracted from CoreMLConverter.convert so the filename contract
+can be tested + reused without instantiating the node. The string is the
+cache key: every workflow that references a converted .mlpackage depends
+on it staying byte-for-byte identical.
+"""
+from typing import Iterable, Tuple
+# Short filename tag for each attention implementation name. compose_out_name
+# looks the name up here to build the `_se` / `_se2` / `_orig` suffix.
+ATTN_SUFFIX = {
+    "SPLIT_EINSUM": "se",
+    "SPLIT_EINSUM_V2": "se2",
+    "ORIGINAL": "orig",
+}
+# Palettization bits. "none" = no quantization (default; keeps the
+# unquantized filename intact so existing workflows still resolve their
+# cached .mlpackage). Numeric values append a `_q<bits>` suffix.
+QUANT_NBITS_VALUES = ("none", "8", "6", "4")
+def compose_out_name(
+    *,
+    ckpt_name: str,
+    batch_size: int,
+    width: int,
+    height: int,
+    controlnet_support: bool,
+    attention_implementation: str,
+    lora_names: Iterable[str] = (),
+    quantize_nbits: str = "none",
+) -> str:
+    """Build the .mlpackage stem from convert() parameters.
+    Locked behaviour (characterization tests):
+      - first '.' in ckpt_name wins (`a.b.c.safetensors` -> `a`)
+      - spaces collapse to underscores
+      - LoRA names are taken stem-only, sorted, joined with '_' and
+        prefixed with '_' when present (caller is expected to pass a
+        sorted list; we sort defensively)
+      - controlnet adds `_cn`
+      - attn suffix is `_se` | `_se2` | `_orig`
+    Quantization:
+      - quantize_nbits "none" (default) appends nothing — existing
+        unquantized .mlpackages keep the old filename
+      - "4" / "6" / "8" appends `_q<bits>` after the attn suffix
+    """
+    if quantize_nbits not in QUANT_NBITS_VALUES:
+        raise ValueError(
+            f"quantize_nbits={quantize_nbits!r} not in {QUANT_NBITS_VALUES}"
+        )
+    stem = ckpt_name.split(".")[0]
+    sorted_names = sorted(lora_names)
+    lora_str = (
+        "_" + "_".join(name.split(".")[0] for name in sorted_names)
+        if sorted_names
+        else ""
+    )
+    cn_suffix = "_cn" if controlnet_support else ""
+    attn_suffix = "_" + ATTN_SUFFIX[attention_implementation]
+    quant_suffix = f"_q{quantize_nbits}" if quantize_nbits != "none" else ""
+    out_name = (
+        f"{stem}{lora_str}_{batch_size}x{width}x{height}"
+        f"{cn_suffix}{attn_suffix}{quant_suffix}"
+    )
+    return out_name.replace(" ", "_")
+def lora_names_from_params(lora_params: Iterable[Tuple[str, float]]) -> list[str]:
+    """Mirror the sort applied inside CoreMLConverter.convert."""
+    return [name for name, _ in sorted(lora_params, key=lambda pair: pair[0])]

coreml_diffusion-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,98 @@
+Metadata-Version: 2.4
+Name: coreml-diffusion
+Version: 0.1.0
+Summary: Convert diffusion-model checkpoints (SD1.5/SDXL) to Core ML for Apple Neural Engine — framework-free, ComfyUI-independent.
+Project-URL: Homepage, https://github.com/aszc-dev/coreml-diffusion
+Project-URL: Repository, https://github.com/aszc-dev/coreml-diffusion
+Project-URL: Issues, https://github.com/aszc-dev/coreml-diffusion/issues
+Author-email: Adrian Szczepański <hi@aszc.dev>
+License-Expression: MIT
+License-File: LICENSE
+Keywords: ane,apple-neural-engine,comfyui,core-ml,coreml,diffusers,diffusion,sdxl,stable-diffusion
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Operating System :: MacOS
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Multimedia :: Graphics :: Graphics Conversion
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Typing :: Typed
+Requires-Python: <3.13,>=3.12
+Requires-Dist: coremltools<10,>=9
+Requires-Dist: diffusers>=0.30
+Requires-Dist: numpy<3,>=2
+Requires-Dist: omegaconf>=2.3
+Requires-Dist: peft>=0.13
+Requires-Dist: torch<2.8,>=2.7
+Requires-Dist: transformers>=4.44
+Description-Content-Type: text/markdown
+# coreml-diffusion
+Convert diffusion-model checkpoints into Core ML `.mlpackage` artifacts for the
+Apple Neural Engine (ANE) — framework-free and independent of ComfyUI.
+`coreml-diffusion` takes a single-file Stable Diffusion checkpoint and produces a
+Core ML UNet you can run on-device (macOS/iOS) or load back into ComfyUI via
+[ComfyUI-CoreMLSuite](https://github.com/aszc-dev/ComfyUI-CoreMLSuite), which
+depends on this package for its conversion path.
+## Positioning
+The niche is **diffusion models on the Apple Neural Engine via Core ML** — inside
+ComfyUI and on-device. ANE is the differentiator: low-power, GPU-free, embeddable
+in a Swift/iOS app. This is about feasibility and power efficiency for SD1.5/SDXL
+on ANE, not a raw-throughput claim against desktop GPUs.
+Supported today: SD1.5 and SDXL (verified). SDXL refiner and LCM convert but are
+not yet golden-verified (experimental). The scope is diffusion architectures
+generally, not Stable Diffusion specifically.
+## Install
+```sh
+uv pip install coreml-diffusion          # from PyPI (planned)
+uv pip install -e .                       # from a checkout
+```
+Requires Python 3.12 and (for conversion) `coremltools` 9 — conversion runs on
+macOS; the package imports, and its CLI parses arguments, on any platform.
+## CLI
+```sh
+coreml-diffusion convert \
+    --ckpt path/to/model.safetensors \
+    --model-version SD15 \
+    --out unet.mlpackage \
+    --height 512 --width 512 \
+    --attn-impl SPLIT_EINSUM \
+    --quantize none
+```
+Options: `--batch-size`, `--controlnet`, `--lora PATH[:STRENGTH]` (repeatable),
+`--config` (original-config YAML). `--quantize {none,8,6,4}` applies k-means
+weight palettization. Run `coreml-diffusion convert --help` for the full list.
+The output `.mlpackage` is the deliverable: load it natively in Swift/Core ML, or
+through ComfyUI-CoreMLSuite.
+## Library
+```python
+import coreml_diffusion
+from coreml_diffusion import ModelVersion
+coreml_diffusion.convert(
+    "model.safetensors", ModelVersion.SD15, "unet.mlpackage",
+    sample_size=(64, 64), attn_impl="SPLIT_EINSUM",
+)
+```
+Discovery API (`list_model_versions`, `list_attention_impls`, `list_quant_modes`,
+`CONTRACT_VERSION`) reports what this build can convert; the identifiers are an
+additive-only contract (removing/renaming one is a major bump).
+## License
+MIT

coreml_diffusion-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,17 @@
+coreml_diffusion/__init__.py,sha256=IVtKAGpIHR25M-9r2kF4RIGZf599PGzuSdBykA_7jD8,4118
+coreml_diffusion/attention.py,sha256=adbE3AmV7uR-FRyILYGy-_3tBtFnSny4ZGutIVfOnPc,91
+coreml_diffusion/cli.py,sha256=TIoClej1JMdlNh9BXtmSls2NcCTAXTZogiFnrWfEt9s,3896
+coreml_diffusion/convert.py,sha256=8Q4IcXbaTQ9AGD89J_eeCI1h1f55CsypsL5OHYg6c1Y,11153
+coreml_diffusion/logger.py,sha256=PE9S6WFmT-3UxG4830IqAeCDOdw0laUNwwu_Q65pYPw,105
+coreml_diffusion/model_version.py,sha256=wjjfLqMRU8LITzylY7k1o0r9L5gEtgg117DC50fBcDY,136
+coreml_diffusion/naming.py,sha256=SDLAI2PnBCyPnZ0ufjUR1oagRJbwur2M1PYNFYulI2w,2589
+coreml_diffusion/conversion/__init__.py,sha256=veWEFzP7tsjSIukFeDIL1H1H6BRMF38rzyd7XE5E2TQ,443
+coreml_diffusion/conversion/attention.py,sha256=VxoO2-unK8iXYMdj8oFNereZUHfk9AGH2xmj5RMLhBA,6710
+coreml_diffusion/conversion/shapes.py,sha256=kJP0lIh5ty2JwLc70va67Neovu-wtP6LXMQDycTPhDM,726
+coreml_diffusion/conversion/trace.py,sha256=iiIh0ZzaULyz5PP8EUN14rlogZ5a9jmAMBJ9LKxWUug,1707
+coreml_diffusion/conversion/unet.py,sha256=nljZgNMY667vbAxDZPC_dS2fF861fzgVSsDygXwEPpU,1701
+coreml_diffusion-0.1.0.dist-info/METADATA,sha256=zjlX9MaUEoShe7Ks-Z6sagvM3EPOpMzOjNZn7a3Exio,3600
+coreml_diffusion-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+coreml_diffusion-0.1.0.dist-info/entry_points.txt,sha256=oYMr6Rre4ErwzBzfgxeFQ1isiwKOeGHswipc0IDB38o,63
+coreml_diffusion-0.1.0.dist-info/licenses/LICENSE,sha256=0L46frKmxey5OMCRWgckyvNBwVT1t4YXMNLs0ZUh5bI,1081
+coreml_diffusion-0.1.0.dist-info/RECORD,,

coreml_diffusion-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.29.0
+Root-Is-Purelib: true
+Tag: py3-none-any

coreml_diffusion-0.1.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[console_scripts]
+coreml-diffusion = coreml_diffusion.cli:main

coreml_diffusion-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2023-2026 Adrian Szczepański
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.