coreml-diffusion 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coreml_diffusion-0.1.2/.github/workflows/release-please.yml +27 -0
- coreml_diffusion-0.1.2/.github/workflows/tier2.yml +74 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/.gitignore +1 -0
- coreml_diffusion-0.1.2/.release-please-manifest.json +3 -0
- coreml_diffusion-0.1.2/CHANGELOG.md +8 -0
- coreml_diffusion-0.1.2/PKG-INFO +135 -0
- coreml_diffusion-0.1.2/README.md +106 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/__init__.py +6 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/cli.py +59 -2
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/attention.py +61 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/convert.py +5 -1
- coreml_diffusion-0.1.2/coreml_diffusion/inference.py +176 -0
- coreml_diffusion-0.1.2/coreml_diffusion/sources.py +170 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/pyproject.toml +12 -5
- coreml_diffusion-0.1.2/release-please-config.json +22 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/conftest.py +0 -2
- coreml_diffusion-0.1.2/tests/m2/goldens/sd15_astronaut.png +0 -0
- coreml_diffusion-0.1.2/tests/m2/goldens/sd15_astronaut.sha256 +1 -0
- coreml_diffusion-0.1.2/tests/m2/test_inference_golden.py +111 -0
- coreml_diffusion-0.1.2/tests/m2/test_original_gpu.py +106 -0
- coreml_diffusion-0.1.2/tests/smoke/test_original_attention.py +95 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/unit/test_cli.py +8 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/unit/test_conversion_helpers.py +24 -3
- coreml_diffusion-0.1.2/tests/unit/test_sources.py +103 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/uv.lock +13 -93
- coreml_diffusion-0.1.0/PKG-INFO +0 -98
- coreml_diffusion-0.1.0/README.md +0 -69
- coreml_diffusion-0.1.0/tests/inference/test_pipeline_inference.py +0 -26
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/.github/workflows/publish-pypi.yml +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/.github/workflows/tier0.yml +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/.github/workflows/tier1.yml +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/LICENSE +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/attention.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/__init__.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/shapes.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/trace.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/unet.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/logger.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/model_version.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/naming.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/smoke/test_split_einsum_attention.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/smoke/test_synthetic_unet.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/unit/test_characterization_out_name.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/unit/test_discovery_api.py +0 -0
- {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/unit/test_tier0_purity.py +0 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
name: Release Please
|
|
2
|
+
|
|
3
|
+
# Manages the release cycle: maintains a Release PR that bumps the version in
|
|
4
|
+
# pyproject.toml and curates CHANGELOG.md from Conventional Commits (only the
|
|
5
|
+
# user-facing types in release-please-config.json's changelog-sections are
|
|
6
|
+
# surfaced). Merging that PR tags + publishes a GitHub Release.
|
|
7
|
+
#
|
|
8
|
+
# Runs with GH_CI_PAT (not the default GITHUB_TOKEN) so the Release it creates
|
|
9
|
+
# triggers publish-pypi.yml — events made with GITHUB_TOKEN do not start other
|
|
10
|
+
# workflows.
|
|
11
|
+
|
|
12
|
+
on:
|
|
13
|
+
push:
|
|
14
|
+
branches:
|
|
15
|
+
- main
|
|
16
|
+
|
|
17
|
+
permissions:
|
|
18
|
+
contents: write
|
|
19
|
+
pull-requests: write
|
|
20
|
+
|
|
21
|
+
jobs:
|
|
22
|
+
release-please:
|
|
23
|
+
runs-on: ubuntu-latest
|
|
24
|
+
steps:
|
|
25
|
+
- uses: googleapis/release-please-action@v4
|
|
26
|
+
with:
|
|
27
|
+
token: ${{ secrets.GH_CI_PAT }}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
name: Tier 2 — M2 / ANE (self-hosted)
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
# `labeled` fires when run-m2 is first added; `synchronize`/`reopened`
|
|
6
|
+
# re-run on every subsequent push while the label is present, so the result
|
|
7
|
+
# tracks the PR head instead of going stale. The `if` below keeps the run
|
|
8
|
+
# gated on the run-m2 label for all pull_request events.
|
|
9
|
+
types: [labeled, synchronize, reopened]
|
|
10
|
+
schedule:
|
|
11
|
+
# Nightly at 04:00 UTC (~05/06 in PL). Keeps the ANE path honest without
|
|
12
|
+
# burning the runner on every PR.
|
|
13
|
+
- cron: "0 4 * * *"
|
|
14
|
+
workflow_dispatch:
|
|
15
|
+
|
|
16
|
+
jobs:
|
|
17
|
+
m2:
|
|
18
|
+
if: |
|
|
19
|
+
github.event_name == 'schedule' ||
|
|
20
|
+
github.event_name == 'workflow_dispatch' ||
|
|
21
|
+
(github.event_name == 'pull_request' &&
|
|
22
|
+
contains(github.event.pull_request.labels.*.name, 'run-m2'))
|
|
23
|
+
# Self-hosted Apple Silicon runner (shared with ComfyUI-CoreMLSuite's Tier 2,
|
|
24
|
+
# same `coreml` label). The runner's environment MUST export
|
|
25
|
+
# COREML_DIFFUSION_TEST_CKPT: an absolute path to a cached single-file SD1.5
|
|
26
|
+
# checkpoint. The gate converts it fresh and runs the comfy-free inference
|
|
27
|
+
# golden — no ComfyUI involved.
|
|
28
|
+
runs-on: [self-hosted, macOS, ARM64, coreml]
|
|
29
|
+
timeout-minutes: 90
|
|
30
|
+
steps:
|
|
31
|
+
- uses: actions/checkout@v4
|
|
32
|
+
|
|
33
|
+
- uses: astral-sh/setup-uv@v7
|
|
34
|
+
with:
|
|
35
|
+
enable-cache: true
|
|
36
|
+
|
|
37
|
+
- name: uv sync
|
|
38
|
+
run: uv sync
|
|
39
|
+
|
|
40
|
+
- name: Check cached checkpoint
|
|
41
|
+
# The runner's .env must export COREML_DIFFUSION_TEST_CKPT — an absolute
|
|
42
|
+
# path to a cached single-file SD1.5 checkpoint.
|
|
43
|
+
run: |
|
|
44
|
+
set -euo pipefail
|
|
45
|
+
if [ -z "${COREML_DIFFUSION_TEST_CKPT:-}" ]; then
|
|
46
|
+
echo "COREML_DIFFUSION_TEST_CKPT unset — add it to the runner's .env."
|
|
47
|
+
exit 1
|
|
48
|
+
fi
|
|
49
|
+
test -f "$COREML_DIFFUSION_TEST_CKPT" || {
|
|
50
|
+
echo "checkpoint not found: $COREML_DIFFUSION_TEST_CKPT"; exit 1; }
|
|
51
|
+
echo "Tier 2: checkpoint \`$COREML_DIFFUSION_TEST_CKPT\`" >> "$GITHUB_STEP_SUMMARY"
|
|
52
|
+
|
|
53
|
+
- name: Convert UNet fresh (batch=2 for CFG)
|
|
54
|
+
# Convert on every run. The .mlpackage cache key is conversion
|
|
55
|
+
# *parameters* only, not the conversion code or toolchain — a stale model
|
|
56
|
+
# would let a conversion regression pass. batch=2 because guided CFG feeds
|
|
57
|
+
# uncond+cond in a single forward pass, and ANE input shapes are fixed at
|
|
58
|
+
# convert time.
|
|
59
|
+
run: |
|
|
60
|
+
set -euo pipefail
|
|
61
|
+
MLPKG="$RUNNER_TEMP/sd15_b2.mlpackage"
|
|
62
|
+
rm -rf "$MLPKG"
|
|
63
|
+
uv run coreml-diffusion convert \
|
|
64
|
+
--ckpt "$COREML_DIFFUSION_TEST_CKPT" \
|
|
65
|
+
--model-version SD15 \
|
|
66
|
+
--out "$MLPKG" \
|
|
67
|
+
--batch-size 2 --height 512 --width 512 --attn-impl SPLIT_EINSUM
|
|
68
|
+
echo "COREML_DIFFUSION_TEST_MLPACKAGE=$MLPKG" >> "$GITHUB_ENV"
|
|
69
|
+
|
|
70
|
+
- name: Run Tier 2 (m2 marker)
|
|
71
|
+
# Builds a stock diffusers pipeline around the converted UNet and asserts
|
|
72
|
+
# the generated image against the committed golden (exact match, else
|
|
73
|
+
# PSNR >= GOLDEN_PSNR_MIN_DB). VAE/text encoder on torch, UNet on the ANE.
|
|
74
|
+
run: uv run --no-sync pytest -m m2 tests/ -v
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.2](https://github.com/aszc-dev/coreml-diffusion/compare/v0.1.1...v0.1.2) (2026-05-27)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### 🐛 Bug Fixes
|
|
7
|
+
|
|
8
|
+
* **attention:** convertible fp32 ORIGINAL attention for the Core ML GPU path ([#2](https://github.com/aszc-dev/coreml-diffusion/issues/2)) ([28e56fc](https://github.com/aszc-dev/coreml-diffusion/commit/28e56fcf8c2242ebbe4c05abd05f7e796069d7d1))
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: coreml-diffusion
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: Convert diffusion-model checkpoints (SD1.5/SDXL) to Core ML for Apple Neural Engine — framework-free, ComfyUI-independent.
|
|
5
|
+
Project-URL: Homepage, https://github.com/aszc-dev/coreml-diffusion
|
|
6
|
+
Project-URL: Repository, https://github.com/aszc-dev/coreml-diffusion
|
|
7
|
+
Project-URL: Issues, https://github.com/aszc-dev/coreml-diffusion/issues
|
|
8
|
+
Author-email: Adrian Szczepański <hi@aszc.dev>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ane,apple-neural-engine,comfyui,core-ml,coreml,diffusers,diffusion,sdxl,stable-diffusion
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Operating System :: MacOS
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Multimedia :: Graphics :: Graphics Conversion
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Typing :: Typed
|
|
20
|
+
Requires-Python: <3.13,>=3.12
|
|
21
|
+
Requires-Dist: coremltools<10,>=9
|
|
22
|
+
Requires-Dist: diffusers>=0.30
|
|
23
|
+
Requires-Dist: numpy<3,>=2
|
|
24
|
+
Requires-Dist: omegaconf>=2.3
|
|
25
|
+
Requires-Dist: peft>=0.13
|
|
26
|
+
Requires-Dist: torch>=2.7
|
|
27
|
+
Requires-Dist: transformers<5,>=4.44
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# coreml-diffusion
|
|
31
|
+
|
|
32
|
+
Convert diffusion-model checkpoints into Core ML `.mlpackage` artifacts for the
|
|
33
|
+
Apple Neural Engine (ANE) — framework-free and standalone.
|
|
34
|
+
|
|
35
|
+
`coreml-diffusion` takes a single-file Stable Diffusion checkpoint and produces a
|
|
36
|
+
Core ML UNet you can run on-device (macOS/iOS) via Core ML, in a Python pipeline,
|
|
37
|
+
or load into any host that consumes the artifact.
|
|
38
|
+
|
|
39
|
+
## What this is
|
|
40
|
+
|
|
41
|
+
A standalone toolkit and knowledge base for running diffusion models on the Apple
|
|
42
|
+
Neural Engine via Core ML. The niche is **diffusion on the ANE**: low-power,
|
|
43
|
+
GPU-free, embeddable in a Swift/iOS app. ANE is the differentiator — this is about
|
|
44
|
+
feasibility and power efficiency for SD1.5/SDXL on ANE, not a raw-throughput claim
|
|
45
|
+
against desktop GPUs.
|
|
46
|
+
|
|
47
|
+
The scope is diffusion architectures generally, not Stable Diffusion specifically.
|
|
48
|
+
The project aims to gather, in one place: the conversion path, a reproducible
|
|
49
|
+
benchmarking suite for objective comparison, a per-model catalogue documenting the
|
|
50
|
+
quirks of each architecture on the ANE, and the sources behind it all.
|
|
51
|
+
|
|
52
|
+
Supported today: SD1.5 and SDXL (verified). SDXL refiner and LCM convert but are
|
|
53
|
+
not yet golden-verified (experimental).
|
|
54
|
+
|
|
55
|
+
## Install
|
|
56
|
+
|
|
57
|
+
```sh
|
|
58
|
+
uv pip install coreml-diffusion # from PyPI
|
|
59
|
+
uv pip install -e . # from a checkout
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Requires Python 3.12 and (for conversion) `coremltools` 9 — conversion runs on
|
|
63
|
+
macOS; the package imports and its CLI parses on any platform.
|
|
64
|
+
|
|
65
|
+
## CLI
|
|
66
|
+
|
|
67
|
+
```sh
|
|
68
|
+
coreml-diffusion convert \
|
|
69
|
+
--ckpt path/to/model.safetensors \
|
|
70
|
+
--model-version SD15 \
|
|
71
|
+
--out unet.mlpackage \
|
|
72
|
+
--height 512 --width 512 \
|
|
73
|
+
--attn-impl SPLIT_EINSUM \
|
|
74
|
+
--quantize none
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Options: `--batch-size`, `--controlnet`, `--lora PATH[:STRENGTH]` (repeatable),
|
|
78
|
+
`--config` (original-config YAML). `--quantize {none,8,6,4}` applies k-means
|
|
79
|
+
weight palettization. Run `coreml-diffusion convert --help` for the full list.
|
|
80
|
+
|
|
81
|
+
The output `.mlpackage` is the deliverable: load it natively in Swift/Core ML, run
|
|
82
|
+
it through the Python inference pipeline below, or hand it to any consuming host.
|
|
83
|
+
|
|
84
|
+
### Model sources
|
|
85
|
+
|
|
86
|
+
Register directories so `--ckpt` accepts a bare name instead of a full path:
|
|
87
|
+
|
|
88
|
+
```sh
|
|
89
|
+
coreml-diffusion sources add comfy /path/to/ComfyUI/models # --kind comfy|flat
|
|
90
|
+
coreml-diffusion sources list # sources + checkpoints
|
|
91
|
+
coreml-diffusion convert --ckpt v1-5-pruned-emaonly --model-version SD15 --out unet.mlpackage
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Sources are recorded in `~/.config/coreml-diffusion/sources.toml`. `comfy` knows the
|
|
95
|
+
`models/{checkpoints,loras,vae,...}` layout; `flat` is a plain checkpoint directory.
|
|
96
|
+
|
|
97
|
+
## Library
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
import coreml_diffusion
|
|
101
|
+
from coreml_diffusion import ModelVersion
|
|
102
|
+
|
|
103
|
+
coreml_diffusion.convert(
|
|
104
|
+
"model.safetensors", ModelVersion.SD15, "unet.mlpackage",
|
|
105
|
+
height=512, width=512, attn_impl="SPLIT_EINSUM",
|
|
106
|
+
)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Inference (in progress)
|
|
110
|
+
|
|
111
|
+
A framework-free inference path lets a converted `.mlpackage` generate images with
|
|
112
|
+
no host framework: a `diffusers` pipeline runs the stock VAE / text encoder on
|
|
113
|
+
torch while the UNet is served from Core ML on the ANE. This doubles as the
|
|
114
|
+
package's own regression anchor — the Tier 2 (`m2`) golden image, asserted on an
|
|
115
|
+
Apple Silicon runner — and as the reference for the on-device write-up. See
|
|
116
|
+
`tests/m2/`.
|
|
117
|
+
|
|
118
|
+
## Discovery API
|
|
119
|
+
|
|
120
|
+
`list_model_versions`, `list_attention_impls`, `list_quant_modes`, and
|
|
121
|
+
`CONTRACT_VERSION` report what this build can convert. The identifiers are an
|
|
122
|
+
additive-only contract: removing or renaming one is a major version bump, because
|
|
123
|
+
downstream consumers reference these strings verbatim.
|
|
124
|
+
|
|
125
|
+
## ComfyUI
|
|
126
|
+
|
|
127
|
+
[ComfyUI-CoreMLSuite](https://github.com/aszc-dev/ComfyUI-CoreMLSuite) consumes
|
|
128
|
+
this package for its conversion path and drives its node dropdowns from the
|
|
129
|
+
discovery API above — installing a newer `coreml-diffusion` surfaces new
|
|
130
|
+
conversion types in the node with no Suite change. The Suite is one consumer;
|
|
131
|
+
this package neither depends on nor requires ComfyUI.
|
|
132
|
+
|
|
133
|
+
## License
|
|
134
|
+
|
|
135
|
+
MIT
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# coreml-diffusion
|
|
2
|
+
|
|
3
|
+
Convert diffusion-model checkpoints into Core ML `.mlpackage` artifacts for the
|
|
4
|
+
Apple Neural Engine (ANE) — framework-free and standalone.
|
|
5
|
+
|
|
6
|
+
`coreml-diffusion` takes a single-file Stable Diffusion checkpoint and produces a
|
|
7
|
+
Core ML UNet you can run on-device (macOS/iOS) via Core ML, in a Python pipeline,
|
|
8
|
+
or load into any host that consumes the artifact.
|
|
9
|
+
|
|
10
|
+
## What this is
|
|
11
|
+
|
|
12
|
+
A standalone toolkit and knowledge base for running diffusion models on the Apple
|
|
13
|
+
Neural Engine via Core ML. The niche is **diffusion on the ANE**: low-power,
|
|
14
|
+
GPU-free, embeddable in a Swift/iOS app. ANE is the differentiator — this is about
|
|
15
|
+
feasibility and power efficiency for SD1.5/SDXL on ANE, not a raw-throughput claim
|
|
16
|
+
against desktop GPUs.
|
|
17
|
+
|
|
18
|
+
The scope is diffusion architectures generally, not Stable Diffusion specifically.
|
|
19
|
+
The project aims to gather, in one place: the conversion path, a reproducible
|
|
20
|
+
benchmarking suite for objective comparison, a per-model catalogue documenting the
|
|
21
|
+
quirks of each architecture on the ANE, and the sources behind it all.
|
|
22
|
+
|
|
23
|
+
Supported today: SD1.5 and SDXL (verified). SDXL refiner and LCM convert but are
|
|
24
|
+
not yet golden-verified (experimental).
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```sh
|
|
29
|
+
uv pip install coreml-diffusion # from PyPI
|
|
30
|
+
uv pip install -e . # from a checkout
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Requires Python 3.12 and (for conversion) `coremltools` 9 — conversion runs on
|
|
34
|
+
macOS; the package imports and its CLI parses on any platform.
|
|
35
|
+
|
|
36
|
+
## CLI
|
|
37
|
+
|
|
38
|
+
```sh
|
|
39
|
+
coreml-diffusion convert \
|
|
40
|
+
--ckpt path/to/model.safetensors \
|
|
41
|
+
--model-version SD15 \
|
|
42
|
+
--out unet.mlpackage \
|
|
43
|
+
--height 512 --width 512 \
|
|
44
|
+
--attn-impl SPLIT_EINSUM \
|
|
45
|
+
--quantize none
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Options: `--batch-size`, `--controlnet`, `--lora PATH[:STRENGTH]` (repeatable),
|
|
49
|
+
`--config` (original-config YAML). `--quantize {none,8,6,4}` applies k-means
|
|
50
|
+
weight palettization. Run `coreml-diffusion convert --help` for the full list.
|
|
51
|
+
|
|
52
|
+
The output `.mlpackage` is the deliverable: load it natively in Swift/Core ML, run
|
|
53
|
+
it through the Python inference pipeline below, or hand it to any consuming host.
|
|
54
|
+
|
|
55
|
+
### Model sources
|
|
56
|
+
|
|
57
|
+
Register directories so `--ckpt` accepts a bare name instead of a full path:
|
|
58
|
+
|
|
59
|
+
```sh
|
|
60
|
+
coreml-diffusion sources add comfy /path/to/ComfyUI/models # --kind comfy|flat
|
|
61
|
+
coreml-diffusion sources list # sources + checkpoints
|
|
62
|
+
coreml-diffusion convert --ckpt v1-5-pruned-emaonly --model-version SD15 --out unet.mlpackage
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Sources are recorded in `~/.config/coreml-diffusion/sources.toml`. `comfy` knows the
|
|
66
|
+
`models/{checkpoints,loras,vae,...}` layout; `flat` is a plain checkpoint directory.
|
|
67
|
+
|
|
68
|
+
## Library
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
import coreml_diffusion
|
|
72
|
+
from coreml_diffusion import ModelVersion
|
|
73
|
+
|
|
74
|
+
coreml_diffusion.convert(
|
|
75
|
+
"model.safetensors", ModelVersion.SD15, "unet.mlpackage",
|
|
76
|
+
height=512, width=512, attn_impl="SPLIT_EINSUM",
|
|
77
|
+
)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Inference (in progress)
|
|
81
|
+
|
|
82
|
+
A framework-free inference path lets a converted `.mlpackage` generate images with
|
|
83
|
+
no host framework: a `diffusers` pipeline runs the stock VAE / text encoder on
|
|
84
|
+
torch while the UNet is served from Core ML on the ANE. This doubles as the
|
|
85
|
+
package's own regression anchor — the Tier 2 (`m2`) golden image, asserted on an
|
|
86
|
+
Apple Silicon runner — and as the reference for the on-device write-up. See
|
|
87
|
+
`tests/m2/`.
|
|
88
|
+
|
|
89
|
+
## Discovery API
|
|
90
|
+
|
|
91
|
+
`list_model_versions`, `list_attention_impls`, `list_quant_modes`, and
|
|
92
|
+
`CONTRACT_VERSION` report what this build can convert. The identifiers are an
|
|
93
|
+
additive-only contract: removing or renaming one is a major version bump, because
|
|
94
|
+
downstream consumers reference these strings verbatim.
|
|
95
|
+
|
|
96
|
+
## ComfyUI
|
|
97
|
+
|
|
98
|
+
[ComfyUI-CoreMLSuite](https://github.com/aszc-dev/ComfyUI-CoreMLSuite) consumes
|
|
99
|
+
this package for its conversion path and drives its node dropdowns from the
|
|
100
|
+
discovery API above — installing a newer `coreml-diffusion` surfaces new
|
|
101
|
+
conversion types in the node with no Suite change. The Suite is one consumer;
|
|
102
|
+
this package neither depends on nor requires ComfyUI.
|
|
103
|
+
|
|
104
|
+
## License
|
|
105
|
+
|
|
106
|
+
MIT
|
|
@@ -40,6 +40,8 @@ __all__ = [
|
|
|
40
40
|
"compose_out_name",
|
|
41
41
|
"lora_names_from_params",
|
|
42
42
|
"convert",
|
|
43
|
+
"build_pipeline",
|
|
44
|
+
"CoreMLUNet",
|
|
43
45
|
]
|
|
44
46
|
|
|
45
47
|
|
|
@@ -105,4 +107,8 @@ def __getattr__(name):
|
|
|
105
107
|
from coreml_diffusion.convert import convert as _convert
|
|
106
108
|
|
|
107
109
|
return _convert
|
|
110
|
+
if name in ("build_pipeline", "CoreMLUNet"):
|
|
111
|
+
from coreml_diffusion import inference
|
|
112
|
+
|
|
113
|
+
return getattr(inference, name)
|
|
108
114
|
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
@@ -14,6 +14,7 @@ Example:
|
|
|
14
14
|
import argparse
|
|
15
15
|
|
|
16
16
|
import coreml_diffusion
|
|
17
|
+
from coreml_diffusion import sources
|
|
17
18
|
|
|
18
19
|
|
|
19
20
|
def _parse_lora(spec):
|
|
@@ -31,8 +32,9 @@ def _parse_lora(spec):
|
|
|
31
32
|
def _convert_cmd(args):
|
|
32
33
|
sample_size = (args.height // 8, args.width // 8)
|
|
33
34
|
lora_weights = [_parse_lora(spec) for spec in (args.lora or [])]
|
|
35
|
+
ckpt = sources.resolve_checkpoint(args.ckpt, args.source)
|
|
34
36
|
coreml_diffusion.convert(
|
|
35
|
-
|
|
37
|
+
ckpt,
|
|
36
38
|
coreml_diffusion.ModelVersion[args.model_version],
|
|
37
39
|
args.out,
|
|
38
40
|
batch_size=args.batch_size,
|
|
@@ -45,6 +47,31 @@ def _convert_cmd(args):
|
|
|
45
47
|
)
|
|
46
48
|
|
|
47
49
|
|
|
50
|
+
def _sources_add_cmd(args):
|
|
51
|
+
entry = sources.add_source(args.name, args.path, args.kind)
|
|
52
|
+
print(f"Added source {args.name!r} ({entry['kind']}): {entry['path']}")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _sources_list_cmd(args):
|
|
56
|
+
registered = sources.load_sources()
|
|
57
|
+
if not registered:
|
|
58
|
+
print(f"No sources registered. Config: {sources.config_path()}")
|
|
59
|
+
return
|
|
60
|
+
for name, entry in sorted(registered.items()):
|
|
61
|
+
ckpts = sources.iter_checkpoints(entry)
|
|
62
|
+
print(f"{name} ({entry['kind']}): {entry['path']}")
|
|
63
|
+
if ckpts:
|
|
64
|
+
for stem in ckpts:
|
|
65
|
+
print(f" - {stem}")
|
|
66
|
+
else:
|
|
67
|
+
print(" (no checkpoints found)")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _sources_remove_cmd(args):
|
|
71
|
+
sources.remove_source(args.name)
|
|
72
|
+
print(f"Removed source {args.name!r}")
|
|
73
|
+
|
|
74
|
+
|
|
48
75
|
def build_parser():
|
|
49
76
|
parser = argparse.ArgumentParser(
|
|
50
77
|
prog="coreml-diffusion",
|
|
@@ -54,7 +81,15 @@ def build_parser():
|
|
|
54
81
|
|
|
55
82
|
conv = sub.add_parser("convert", help="Convert a checkpoint's UNet to a .mlpackage")
|
|
56
83
|
conv.add_argument(
|
|
57
|
-
"--ckpt",
|
|
84
|
+
"--ckpt",
|
|
85
|
+
required=True,
|
|
86
|
+
help="Checkpoint path, or a name resolved against registered sources "
|
|
87
|
+
"(see 'coreml-diffusion sources')",
|
|
88
|
+
)
|
|
89
|
+
conv.add_argument(
|
|
90
|
+
"--source",
|
|
91
|
+
default=None,
|
|
92
|
+
help="Restrict --ckpt name resolution to this registered source",
|
|
58
93
|
)
|
|
59
94
|
conv.add_argument(
|
|
60
95
|
"--model-version",
|
|
@@ -101,6 +136,28 @@ def build_parser():
|
|
|
101
136
|
help="K-means weight palettization bits (default none = unquantized)",
|
|
102
137
|
)
|
|
103
138
|
conv.set_defaults(func=_convert_cmd)
|
|
139
|
+
|
|
140
|
+
src = sub.add_parser("sources", help="Manage model source directories")
|
|
141
|
+
src_sub = src.add_subparsers(dest="sources_command", required=True)
|
|
142
|
+
|
|
143
|
+
s_add = src_sub.add_parser("add", help="Register (or overwrite) a model source")
|
|
144
|
+
s_add.add_argument("name", help="Short name for the source, e.g. 'comfy'")
|
|
145
|
+
s_add.add_argument("path", help="Base directory of the source")
|
|
146
|
+
s_add.add_argument(
|
|
147
|
+
"--kind",
|
|
148
|
+
choices=sources.SOURCE_KINDS,
|
|
149
|
+
default="comfy",
|
|
150
|
+
help="Directory layout (default comfy: models/{checkpoints,loras,vae,...})",
|
|
151
|
+
)
|
|
152
|
+
s_add.set_defaults(func=_sources_add_cmd)
|
|
153
|
+
|
|
154
|
+
s_list = src_sub.add_parser("list", help="List sources and their checkpoints")
|
|
155
|
+
s_list.set_defaults(func=_sources_list_cmd)
|
|
156
|
+
|
|
157
|
+
s_rm = src_sub.add_parser("remove", help="Unregister a source")
|
|
158
|
+
s_rm.add_argument("name", help="Source name to remove")
|
|
159
|
+
s_rm.set_defaults(func=_sources_remove_cmd)
|
|
160
|
+
|
|
104
161
|
return parser
|
|
105
162
|
|
|
106
163
|
|
|
@@ -9,6 +9,7 @@ CHUNK_SIZE = 512
|
|
|
9
9
|
|
|
10
10
|
def apply_attention_implementation(unet, attention_implementation):
|
|
11
11
|
if attention_implementation == "ORIGINAL":
|
|
12
|
+
unet.set_attn_processor(OriginalAttnProcessor())
|
|
12
13
|
return unet
|
|
13
14
|
|
|
14
15
|
if attention_implementation == "SPLIT_EINSUM":
|
|
@@ -24,6 +25,43 @@ def apply_attention_implementation(unet, attention_implementation):
|
|
|
24
25
|
)
|
|
25
26
|
|
|
26
27
|
|
|
28
|
+
class OriginalAttnProcessor:
|
|
29
|
+
"""Full (non-split) multi-head attention with an fp32 score path.
|
|
30
|
+
|
|
31
|
+
The ORIGINAL implementation targets the Core ML GPU path (SPLIT_EINSUM* are
|
|
32
|
+
the ANE-friendly default). It is *not* diffusers' stock attention: that path
|
|
33
|
+
routes through ``F.scaled_dot_product_attention`` plus ``view(B, -1, heads,
|
|
34
|
+
d)`` reshapes that fail to convert under coremltools 9 (the same einsum graph
|
|
35
|
+
SPLIT_EINSUM uses converts cleanly). Nor is it diffusers' legacy
|
|
36
|
+
``AttnProcessor`` — its ``get_attention_scores`` builds the score buffer with
|
|
37
|
+
``torch.empty(query.shape[0], ...)``, whose dynamic int shape also fails under coremltools 9.
|
|
38
|
+
|
|
39
|
+
So this reuses the SPLIT_EINSUM conversion-safe boilerplate and supplies a
|
|
40
|
+
plain full-attention kernel that upcasts QK^T + softmax to fp32. Without the
|
|
41
|
+
upcast, fp16 self-attention at 64x64 latents (4096 query tokens) overflows ->
|
|
42
|
+
inf -> NaN after softmax.
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
def __call__(
|
|
46
|
+
self,
|
|
47
|
+
attn,
|
|
48
|
+
hidden_states,
|
|
49
|
+
encoder_hidden_states=None,
|
|
50
|
+
attention_mask=None,
|
|
51
|
+
temb=None,
|
|
52
|
+
*args,
|
|
53
|
+
**kwargs,
|
|
54
|
+
):
|
|
55
|
+
return _attention_forward(
|
|
56
|
+
attn,
|
|
57
|
+
hidden_states,
|
|
58
|
+
encoder_hidden_states,
|
|
59
|
+
attention_mask,
|
|
60
|
+
temb,
|
|
61
|
+
original,
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
|
|
27
65
|
class SplitEinsumAttnProcessor:
|
|
28
66
|
def __call__(
|
|
29
67
|
self,
|
|
@@ -158,6 +196,29 @@ def _attention_forward(
|
|
|
158
196
|
return hidden_states
|
|
159
197
|
|
|
160
198
|
|
|
199
|
+
def original(q, k, v, mask, heads, dim_head):
|
|
200
|
+
"""Full multi-head attention with the QK^T scaling + softmax in fp32.
|
|
201
|
+
|
|
202
|
+
Same ``[B, C, 1, S]`` channel-major layout and mask convention as
|
|
203
|
+
``split_einsum`` (so it slots into ``_attention_forward`` unchanged), but
|
|
204
|
+
computes the whole score matrix per head in one batched einsum instead of the
|
|
205
|
+
per-head split. Upcasting the scores to fp32 keeps the softmax stable when the
|
|
206
|
+
converted model runs in fp16 (QK^T at 4096 tokens overflows fp16 otherwise).
|
|
207
|
+
"""
|
|
208
|
+
batch = q.size(0)
|
|
209
|
+
mh_q = q.view(batch, heads, dim_head, -1).float()
|
|
210
|
+
mh_k = k.view(batch, heads, dim_head, -1).float()
|
|
211
|
+
mh_v = v.view(batch, heads, dim_head, -1)
|
|
212
|
+
|
|
213
|
+
weights = torch.einsum("becq,beck->bkeq", mh_q, mh_k) * (dim_head**-0.5)
|
|
214
|
+
if mask is not None:
|
|
215
|
+
weights = weights + mask
|
|
216
|
+
weights = weights.softmax(dim=1).to(mh_v.dtype)
|
|
217
|
+
|
|
218
|
+
outputs = torch.einsum("bkeq,beck->becq", weights, mh_v)
|
|
219
|
+
return outputs.reshape(batch, heads * dim_head, 1, -1)
|
|
220
|
+
|
|
221
|
+
|
|
161
222
|
def split_einsum(q, k, v, mask, heads, dim_head):
|
|
162
223
|
q_heads = _split_heads(q, heads, dim_head)
|
|
163
224
|
k = k.transpose(1, 3)
|
|
@@ -36,7 +36,11 @@ def get_unet(model_version: ModelVersion, ref_unet, attention_implementation):
|
|
|
36
36
|
ref_unet.eval(),
|
|
37
37
|
attention_implementation,
|
|
38
38
|
)
|
|
39
|
-
|
|
39
|
+
# The freshly built wrapper defaults to training mode; the inner UNet is
|
|
40
|
+
# already eval, but coremltools inspects the top-level traced module and warns
|
|
41
|
+
# ("Model is not in eval mode"). eval() on the wrapper silences it and makes
|
|
42
|
+
# the eval-mode trace explicit (output is unchanged — UNet dropout p=0).
|
|
43
|
+
return CoreMLUNetWrapper(unet, model_version).eval()
|
|
40
44
|
|
|
41
45
|
|
|
42
46
|
def get_encoder_hidden_states_shape(ref_unet, batch_size):
|