PyPI - coreml-diffusion - Versions diffs - 0.1.0__tar.gz → 0.1.2__tar.gz - Mend

coreml-diffusion 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

coreml_diffusion-0.1.2/.github/workflows/release-please.yml ADDED Viewed

@@ -0,0 +1,27 @@
+name: Release Please
+# Manages the release cycle: maintains a Release PR that bumps the version in
+# pyproject.toml and curates CHANGELOG.md from Conventional Commits (only the
+# user-facing types in release-please-config.json's changelog-sections are
+# surfaced). Merging that PR tags + publishes a GitHub Release.
+#
+# Runs with GH_CI_PAT (not the default GITHUB_TOKEN) so the Release it creates
+# triggers publish-pypi.yml — events made with GITHUB_TOKEN do not start other
+# workflows.
+on:
+  push:
+    branches:
+      - main
+permissions:
+  contents: write
+  pull-requests: write
+jobs:
+  release-please:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: googleapis/release-please-action@v4
+        with:
+          token: ${{ secrets.GH_CI_PAT }}

coreml_diffusion-0.1.2/.github/workflows/tier2.yml ADDED Viewed

@@ -0,0 +1,74 @@
+name: Tier 2 — M2 / ANE (self-hosted)
+on:
+  pull_request:
+    # `labeled` fires when run-m2 is first added; `synchronize`/`reopened`
+    # re-run on every subsequent push while the label is present, so the result
+    # tracks the PR head instead of going stale. The `if` below keeps the run
+    # gated on the run-m2 label for all pull_request events.
+    types: [labeled, synchronize, reopened]
+  schedule:
+    # Nightly at 04:00 UTC (~05/06 in PL). Keeps the ANE path honest without
+    # burning the runner on every PR.
+    - cron: "0 4 * * *"
+  workflow_dispatch:
+jobs:
+  m2:
+    if: |
+      github.event_name == 'schedule' ||
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' &&
+       contains(github.event.pull_request.labels.*.name, 'run-m2'))
+    # Self-hosted Apple Silicon runner (shared with ComfyUI-CoreMLSuite's Tier 2,
+    # same `coreml` label). The runner's environment MUST export
+    # COREML_DIFFUSION_TEST_CKPT: an absolute path to a cached single-file SD1.5
+    # checkpoint. The gate converts it fresh and runs the comfy-free inference
+    # golden — no ComfyUI involved.
+    runs-on: [self-hosted, macOS, ARM64, coreml]
+    timeout-minutes: 90
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+      - name: uv sync
+        run: uv sync
+      - name: Check cached checkpoint
+        # The runner's .env must export COREML_DIFFUSION_TEST_CKPT — an absolute
+        # path to a cached single-file SD1.5 checkpoint.
+        run: |
+          set -euo pipefail
+          if [ -z "${COREML_DIFFUSION_TEST_CKPT:-}" ]; then
+            echo "COREML_DIFFUSION_TEST_CKPT unset — add it to the runner's .env."
+            exit 1
+          fi
+          test -f "$COREML_DIFFUSION_TEST_CKPT" || {
+            echo "checkpoint not found: $COREML_DIFFUSION_TEST_CKPT"; exit 1; }
+          echo "Tier 2: checkpoint \`$COREML_DIFFUSION_TEST_CKPT\`" >> "$GITHUB_STEP_SUMMARY"
+      - name: Convert UNet fresh (batch=2 for CFG)
+        # Convert on every run. The .mlpackage cache key is conversion
+        # *parameters* only, not the conversion code or toolchain — a stale model
+        # would let a conversion regression pass. batch=2 because guided CFG feeds
+        # uncond+cond in a single forward pass, and ANE input shapes are fixed at
+        # convert time.
+        run: |
+          set -euo pipefail
+          MLPKG="$RUNNER_TEMP/sd15_b2.mlpackage"
+          rm -rf "$MLPKG"
+          uv run coreml-diffusion convert \
+            --ckpt "$COREML_DIFFUSION_TEST_CKPT" \
+            --model-version SD15 \
+            --out "$MLPKG" \
+            --batch-size 2 --height 512 --width 512 --attn-impl SPLIT_EINSUM
+          echo "COREML_DIFFUSION_TEST_MLPACKAGE=$MLPKG" >> "$GITHUB_ENV"
+      - name: Run Tier 2 (m2 marker)
+        # Builds a stock diffusers pipeline around the converted UNet and asserts
+        # the generated image against the committed golden (exact match, else
+        # PSNR >= GOLDEN_PSNR_MIN_DB). VAE/text encoder on torch, UNet on the ANE.
+        run: uv run --no-sync pytest -m m2 tests/ -v

{coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/.gitignore RENAMED Viewed

@@ -6,3 +6,4 @@ __pycache__/
 dist/
 build/
 *.egg-info/
+CLAUDE.md

coreml_diffusion-0.1.2/.release-please-manifest.json ADDED Viewed

@@ -0,0 +1,3 @@
+{
+  ".": "0.1.2"
+}

coreml_diffusion-0.1.2/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,8 @@
+# Changelog
+## [0.1.2](https://github.com/aszc-dev/coreml-diffusion/compare/v0.1.1...v0.1.2) (2026-05-27)
+### 🐛 Bug Fixes
+* **attention:** convertible fp32 ORIGINAL attention for the Core ML GPU path ([#2](https://github.com/aszc-dev/coreml-diffusion/issues/2)) ([28e56fc](https://github.com/aszc-dev/coreml-diffusion/commit/28e56fcf8c2242ebbe4c05abd05f7e796069d7d1))

coreml_diffusion-0.1.2/PKG-INFO ADDED Viewed

@@ -0,0 +1,135 @@
+Metadata-Version: 2.4
+Name: coreml-diffusion
+Version: 0.1.2
+Summary: Convert diffusion-model checkpoints (SD1.5/SDXL) to Core ML for Apple Neural Engine — framework-free, ComfyUI-independent.
+Project-URL: Homepage, https://github.com/aszc-dev/coreml-diffusion
+Project-URL: Repository, https://github.com/aszc-dev/coreml-diffusion
+Project-URL: Issues, https://github.com/aszc-dev/coreml-diffusion/issues
+Author-email: Adrian Szczepański <hi@aszc.dev>
+License-Expression: MIT
+License-File: LICENSE
+Keywords: ane,apple-neural-engine,comfyui,core-ml,coreml,diffusers,diffusion,sdxl,stable-diffusion
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Operating System :: MacOS
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Multimedia :: Graphics :: Graphics Conversion
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Typing :: Typed
+Requires-Python: <3.13,>=3.12
+Requires-Dist: coremltools<10,>=9
+Requires-Dist: diffusers>=0.30
+Requires-Dist: numpy<3,>=2
+Requires-Dist: omegaconf>=2.3
+Requires-Dist: peft>=0.13
+Requires-Dist: torch>=2.7
+Requires-Dist: transformers<5,>=4.44
+Description-Content-Type: text/markdown
+# coreml-diffusion
+Convert diffusion-model checkpoints into Core ML `.mlpackage` artifacts for the
+Apple Neural Engine (ANE) — framework-free and standalone.
+`coreml-diffusion` takes a single-file Stable Diffusion checkpoint and produces a
+Core ML UNet you can run on-device (macOS/iOS) via Core ML, in a Python pipeline,
+or load into any host that consumes the artifact.
+## What this is
+A standalone toolkit and knowledge base for running diffusion models on the Apple
+Neural Engine via Core ML. The niche is **diffusion on the ANE**: low-power,
+GPU-free, embeddable in a Swift/iOS app. ANE is the differentiator — this is about
+feasibility and power efficiency for SD1.5/SDXL on ANE, not a raw-throughput claim
+against desktop GPUs.
+The scope is diffusion architectures generally, not Stable Diffusion specifically.
+The project aims to gather, in one place: the conversion path, a reproducible
+benchmarking suite for objective comparison, a per-model catalogue documenting the
+quirks of each architecture on the ANE, and the sources behind it all.
+Supported today: SD1.5 and SDXL (verified). SDXL refiner and LCM convert but are
+not yet golden-verified (experimental).
+## Install
+```sh
+uv pip install coreml-diffusion          # from PyPI
+uv pip install -e .                       # from a checkout
+```
+Requires Python 3.12 and (for conversion) `coremltools` 9 — conversion runs on
+macOS; the package imports and its CLI parses on any platform.
+## CLI
+```sh
+coreml-diffusion convert \
+    --ckpt path/to/model.safetensors \
+    --model-version SD15 \
+    --out unet.mlpackage \
+    --height 512 --width 512 \
+    --attn-impl SPLIT_EINSUM \
+    --quantize none
+```
+Options: `--batch-size`, `--controlnet`, `--lora PATH[:STRENGTH]` (repeatable),
+`--config` (original-config YAML). `--quantize {none,8,6,4}` applies k-means
+weight palettization. Run `coreml-diffusion convert --help` for the full list.
+The output `.mlpackage` is the deliverable: load it natively in Swift/Core ML, run
+it through the Python inference pipeline below, or hand it to any consuming host.
+### Model sources
+Register directories so `--ckpt` accepts a bare name instead of a full path:
+```sh
+coreml-diffusion sources add comfy /path/to/ComfyUI/models   # --kind comfy|flat
+coreml-diffusion sources list                                # sources + checkpoints
+coreml-diffusion convert --ckpt v1-5-pruned-emaonly --model-version SD15 --out unet.mlpackage
+```
+Sources are recorded in `~/.config/coreml-diffusion/sources.toml`. `comfy` knows the
+`models/{checkpoints,loras,vae,...}` layout; `flat` is a plain checkpoint directory.
+## Library
+```python
+import coreml_diffusion
+from coreml_diffusion import ModelVersion
+coreml_diffusion.convert(
+    "model.safetensors", ModelVersion.SD15, "unet.mlpackage",
+    height=512, width=512, attn_impl="SPLIT_EINSUM",
+)
+```
+## Inference (in progress)
+A framework-free inference path lets a converted `.mlpackage` generate images with
+no host framework: a `diffusers` pipeline runs the stock VAE / text encoder on
+torch while the UNet is served from Core ML on the ANE. This doubles as the
+package's own regression anchor — the Tier 2 (`m2`) golden image, asserted on an
+Apple Silicon runner — and as the reference for the on-device write-up. See
+`tests/m2/`.
+## Discovery API
+`list_model_versions`, `list_attention_impls`, `list_quant_modes`, and
+`CONTRACT_VERSION` report what this build can convert. The identifiers are an
+additive-only contract: removing or renaming one is a major version bump, because
+downstream consumers reference these strings verbatim.
+## ComfyUI
+[ComfyUI-CoreMLSuite](https://github.com/aszc-dev/ComfyUI-CoreMLSuite) consumes
+this package for its conversion path and drives its node dropdowns from the
+discovery API above — installing a newer `coreml-diffusion` surfaces new
+conversion types in the node with no Suite change. The Suite is one consumer;
+this package neither depends on nor requires ComfyUI.
+## License
+MIT

coreml_diffusion-0.1.2/README.md ADDED Viewed

@@ -0,0 +1,106 @@
+# coreml-diffusion
+Convert diffusion-model checkpoints into Core ML `.mlpackage` artifacts for the
+Apple Neural Engine (ANE) — framework-free and standalone.
+`coreml-diffusion` takes a single-file Stable Diffusion checkpoint and produces a
+Core ML UNet you can run on-device (macOS/iOS) via Core ML, in a Python pipeline,
+or load into any host that consumes the artifact.
+## What this is
+A standalone toolkit and knowledge base for running diffusion models on the Apple
+Neural Engine via Core ML. The niche is **diffusion on the ANE**: low-power,
+GPU-free, embeddable in a Swift/iOS app. ANE is the differentiator — this is about
+feasibility and power efficiency for SD1.5/SDXL on ANE, not a raw-throughput claim
+against desktop GPUs.
+The scope is diffusion architectures generally, not Stable Diffusion specifically.
+The project aims to gather, in one place: the conversion path, a reproducible
+benchmarking suite for objective comparison, a per-model catalogue documenting the
+quirks of each architecture on the ANE, and the sources behind it all.
+Supported today: SD1.5 and SDXL (verified). SDXL refiner and LCM convert but are
+not yet golden-verified (experimental).
+## Install
+```sh
+uv pip install coreml-diffusion          # from PyPI
+uv pip install -e .                       # from a checkout
+```
+Requires Python 3.12 and (for conversion) `coremltools` 9 — conversion runs on
+macOS; the package imports and its CLI parses on any platform.
+## CLI
+```sh
+coreml-diffusion convert \
+    --ckpt path/to/model.safetensors \
+    --model-version SD15 \
+    --out unet.mlpackage \
+    --height 512 --width 512 \
+    --attn-impl SPLIT_EINSUM \
+    --quantize none
+```
+Options: `--batch-size`, `--controlnet`, `--lora PATH[:STRENGTH]` (repeatable),
+`--config` (original-config YAML). `--quantize {none,8,6,4}` applies k-means
+weight palettization. Run `coreml-diffusion convert --help` for the full list.
+The output `.mlpackage` is the deliverable: load it natively in Swift/Core ML, run
+it through the Python inference pipeline below, or hand it to any consuming host.
+### Model sources
+Register directories so `--ckpt` accepts a bare name instead of a full path:
+```sh
+coreml-diffusion sources add comfy /path/to/ComfyUI/models   # --kind comfy|flat
+coreml-diffusion sources list                                # sources + checkpoints
+coreml-diffusion convert --ckpt v1-5-pruned-emaonly --model-version SD15 --out unet.mlpackage
+```
+Sources are recorded in `~/.config/coreml-diffusion/sources.toml`. `comfy` knows the
+`models/{checkpoints,loras,vae,...}` layout; `flat` is a plain checkpoint directory.
+## Library
+```python
+import coreml_diffusion
+from coreml_diffusion import ModelVersion
+coreml_diffusion.convert(
+    "model.safetensors", ModelVersion.SD15, "unet.mlpackage",
+    height=512, width=512, attn_impl="SPLIT_EINSUM",
+)
+```
+## Inference (in progress)
+A framework-free inference path lets a converted `.mlpackage` generate images with
+no host framework: a `diffusers` pipeline runs the stock VAE / text encoder on
+torch while the UNet is served from Core ML on the ANE. This doubles as the
+package's own regression anchor — the Tier 2 (`m2`) golden image, asserted on an
+Apple Silicon runner — and as the reference for the on-device write-up. See
+`tests/m2/`.
+## Discovery API
+`list_model_versions`, `list_attention_impls`, `list_quant_modes`, and
+`CONTRACT_VERSION` report what this build can convert. The identifiers are an
+additive-only contract: removing or renaming one is a major version bump, because
+downstream consumers reference these strings verbatim.
+## ComfyUI
+[ComfyUI-CoreMLSuite](https://github.com/aszc-dev/ComfyUI-CoreMLSuite) consumes
+this package for its conversion path and drives its node dropdowns from the
+discovery API above — installing a newer `coreml-diffusion` surfaces new
+conversion types in the node with no Suite change. The Suite is one consumer;
+this package neither depends on nor requires ComfyUI.
+## License
+MIT

{coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/__init__.py RENAMED Viewed

@@ -40,6 +40,8 @@ __all__ = [
     "compose_out_name",
     "lora_names_from_params",
     "convert",
+    "build_pipeline",
+    "CoreMLUNet",
 ]
@@ -105,4 +107,8 @@ def __getattr__(name):
         from coreml_diffusion.convert import convert as _convert
         return _convert
+    if name in ("build_pipeline", "CoreMLUNet"):
+        from coreml_diffusion import inference
+        return getattr(inference, name)
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

{coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/cli.py RENAMED Viewed

@@ -14,6 +14,7 @@ Example:
 import argparse
 import coreml_diffusion
+from coreml_diffusion import sources
 def _parse_lora(spec):
@@ -31,8 +32,9 @@ def _parse_lora(spec):
 def _convert_cmd(args):
     sample_size = (args.height // 8, args.width // 8)
     lora_weights = [_parse_lora(spec) for spec in (args.lora or [])]
+    ckpt = sources.resolve_checkpoint(args.ckpt, args.source)
     coreml_diffusion.convert(
-        args.ckpt,
+        ckpt,
         coreml_diffusion.ModelVersion[args.model_version],
         args.out,
         batch_size=args.batch_size,
@@ -45,6 +47,31 @@ def _convert_cmd(args):
     )
+def _sources_add_cmd(args):
+    entry = sources.add_source(args.name, args.path, args.kind)
+    print(f"Added source {args.name!r} ({entry['kind']}): {entry['path']}")
+def _sources_list_cmd(args):
+    registered = sources.load_sources()
+    if not registered:
+        print(f"No sources registered. Config: {sources.config_path()}")
+        return
+    for name, entry in sorted(registered.items()):
+        ckpts = sources.iter_checkpoints(entry)
+        print(f"{name} ({entry['kind']}): {entry['path']}")
+        if ckpts:
+            for stem in ckpts:
+                print(f"  - {stem}")
+        else:
+            print("  (no checkpoints found)")
+def _sources_remove_cmd(args):
+    sources.remove_source(args.name)
+    print(f"Removed source {args.name!r}")
 def build_parser():
     parser = argparse.ArgumentParser(
         prog="coreml-diffusion",
@@ -54,7 +81,15 @@ def build_parser():
     conv = sub.add_parser("convert", help="Convert a checkpoint's UNet to a .mlpackage")
     conv.add_argument(
-        "--ckpt", required=True, help="Path to the source .safetensors checkpoint"
+        "--ckpt",
+        required=True,
+        help="Checkpoint path, or a name resolved against registered sources "
+        "(see 'coreml-diffusion sources')",
+    )
+    conv.add_argument(
+        "--source",
+        default=None,
+        help="Restrict --ckpt name resolution to this registered source",
     )
     conv.add_argument(
         "--model-version",
@@ -101,6 +136,28 @@ def build_parser():
         help="K-means weight palettization bits (default none = unquantized)",
     )
     conv.set_defaults(func=_convert_cmd)
+    src = sub.add_parser("sources", help="Manage model source directories")
+    src_sub = src.add_subparsers(dest="sources_command", required=True)
+    s_add = src_sub.add_parser("add", help="Register (or overwrite) a model source")
+    s_add.add_argument("name", help="Short name for the source, e.g. 'comfy'")
+    s_add.add_argument("path", help="Base directory of the source")
+    s_add.add_argument(
+        "--kind",
+        choices=sources.SOURCE_KINDS,
+        default="comfy",
+        help="Directory layout (default comfy: models/{checkpoints,loras,vae,...})",
+    )
+    s_add.set_defaults(func=_sources_add_cmd)
+    s_list = src_sub.add_parser("list", help="List sources and their checkpoints")
+    s_list.set_defaults(func=_sources_list_cmd)
+    s_rm = src_sub.add_parser("remove", help="Unregister a source")
+    s_rm.add_argument("name", help="Source name to remove")
+    s_rm.set_defaults(func=_sources_remove_cmd)
     return parser

{coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/attention.py RENAMED Viewed

@@ -9,6 +9,7 @@ CHUNK_SIZE = 512
 def apply_attention_implementation(unet, attention_implementation):
     if attention_implementation == "ORIGINAL":
+        unet.set_attn_processor(OriginalAttnProcessor())
         return unet
     if attention_implementation == "SPLIT_EINSUM":
@@ -24,6 +25,43 @@ def apply_attention_implementation(unet, attention_implementation):
     )
+class OriginalAttnProcessor:
+    """Full (non-split) multi-head attention with an fp32 score path.
+    The ORIGINAL implementation targets the Core ML GPU path (SPLIT_EINSUM* are
+    the ANE-friendly default). It is *not* diffusers' stock attention: that path
+    routes through ``F.scaled_dot_product_attention`` plus ``view(B, -1, heads,
+    d)`` reshapes that fail to convert under coremltools 9 (the same einsum graph
+    SPLIT_EINSUM uses converts cleanly). Nor is it diffusers' legacy
+    ``AttnProcessor`` — its ``get_attention_scores`` builds the score buffer with
+    ``torch.empty(query.shape[0], ...)``, whose dynamic int shape also fails ct9.
+    So this reuses the SPLIT_EINSUM conversion-safe boilerplate and supplies a
+    plain full-attention kernel that upcasts QK^T + softmax to fp32. Without the
+    upcast, fp16 self-attention at 64x64 latents (4096 query tokens) overflows ->
+    inf -> NaN after softmax.
+    """
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        return _attention_forward(
+            attn,
+            hidden_states,
+            encoder_hidden_states,
+            attention_mask,
+            temb,
+            original,
+        )
 class SplitEinsumAttnProcessor:
     def __call__(
         self,
@@ -158,6 +196,29 @@ def _attention_forward(
     return hidden_states
+def original(q, k, v, mask, heads, dim_head):
+    """Full multi-head attention with the QK^T scaling + softmax in fp32.
+    Same ``[B, C, 1, S]`` channel-major layout and mask convention as
+    ``split_einsum`` (so it slots into ``_attention_forward`` unchanged), but
+    computes the whole score matrix per head in one batched einsum instead of the
+    per-head split. Upcasting the scores to fp32 keeps the softmax stable when the
+    converted model runs in fp16 (QK^T at 4096 tokens overflows fp16 otherwise).
+    """
+    batch = q.size(0)
+    mh_q = q.view(batch, heads, dim_head, -1).float()
+    mh_k = k.view(batch, heads, dim_head, -1).float()
+    mh_v = v.view(batch, heads, dim_head, -1)
+    weights = torch.einsum("becq,beck->bkeq", mh_q, mh_k) * (dim_head**-0.5)
+    if mask is not None:
+        weights = weights + mask
+    weights = weights.softmax(dim=1).to(mh_v.dtype)
+    outputs = torch.einsum("bkeq,beck->becq", weights, mh_v)
+    return outputs.reshape(batch, heads * dim_head, 1, -1)
 def split_einsum(q, k, v, mask, heads, dim_head):
     q_heads = _split_heads(q, heads, dim_head)
     k = k.transpose(1, 3)

{coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/convert.py RENAMED Viewed

@@ -36,7 +36,11 @@ def get_unet(model_version: ModelVersion, ref_unet, attention_implementation):
         ref_unet.eval(),
         attention_implementation,
     )
-    return CoreMLUNetWrapper(unet, model_version)
+    # The freshly built wrapper defaults to training mode; the inner UNet is
+    # already eval, but coremltools inspects the top-level traced module and warns
+    # ("Model is not in eval mode"). eval() on the wrapper silences it and makes
+    # the eval-mode trace explicit (output is unchanged — UNet dropout p=0).
+    return CoreMLUNetWrapper(unet, model_version).eval()
 def get_encoder_hidden_states_shape(ref_unet, batch_size):

coreml-diffusion 0.1.0__tar.gz → 0.1.2__tar.gz

coreml-diffusion 0.1.0__tar.gz → 0.1.2__tar.gz