coreml-diffusion 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. coreml_diffusion-0.1.2/.github/workflows/release-please.yml +27 -0
  2. coreml_diffusion-0.1.2/.github/workflows/tier2.yml +74 -0
  3. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/.gitignore +1 -0
  4. coreml_diffusion-0.1.2/.release-please-manifest.json +3 -0
  5. coreml_diffusion-0.1.2/CHANGELOG.md +8 -0
  6. coreml_diffusion-0.1.2/PKG-INFO +135 -0
  7. coreml_diffusion-0.1.2/README.md +106 -0
  8. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/__init__.py +6 -0
  9. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/cli.py +59 -2
  10. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/attention.py +61 -0
  11. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/convert.py +5 -1
  12. coreml_diffusion-0.1.2/coreml_diffusion/inference.py +176 -0
  13. coreml_diffusion-0.1.2/coreml_diffusion/sources.py +170 -0
  14. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/pyproject.toml +12 -5
  15. coreml_diffusion-0.1.2/release-please-config.json +22 -0
  16. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/conftest.py +0 -2
  17. coreml_diffusion-0.1.2/tests/m2/goldens/sd15_astronaut.png +0 -0
  18. coreml_diffusion-0.1.2/tests/m2/goldens/sd15_astronaut.sha256 +1 -0
  19. coreml_diffusion-0.1.2/tests/m2/test_inference_golden.py +111 -0
  20. coreml_diffusion-0.1.2/tests/m2/test_original_gpu.py +106 -0
  21. coreml_diffusion-0.1.2/tests/smoke/test_original_attention.py +95 -0
  22. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/unit/test_cli.py +8 -0
  23. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/unit/test_conversion_helpers.py +24 -3
  24. coreml_diffusion-0.1.2/tests/unit/test_sources.py +103 -0
  25. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/uv.lock +13 -93
  26. coreml_diffusion-0.1.0/PKG-INFO +0 -98
  27. coreml_diffusion-0.1.0/README.md +0 -69
  28. coreml_diffusion-0.1.0/tests/inference/test_pipeline_inference.py +0 -26
  29. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/.github/workflows/publish-pypi.yml +0 -0
  30. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/.github/workflows/tier0.yml +0 -0
  31. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/.github/workflows/tier1.yml +0 -0
  32. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/LICENSE +0 -0
  33. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/attention.py +0 -0
  34. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/__init__.py +0 -0
  35. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/shapes.py +0 -0
  36. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/trace.py +0 -0
  37. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/unet.py +0 -0
  38. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/logger.py +0 -0
  39. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/model_version.py +0 -0
  40. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/coreml_diffusion/naming.py +0 -0
  41. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/smoke/test_split_einsum_attention.py +0 -0
  42. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/smoke/test_synthetic_unet.py +0 -0
  43. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/unit/test_characterization_out_name.py +0 -0
  44. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/unit/test_discovery_api.py +0 -0
  45. {coreml_diffusion-0.1.0 → coreml_diffusion-0.1.2}/tests/unit/test_tier0_purity.py +0 -0
@@ -0,0 +1,27 @@
1
+ name: Release Please
2
+
3
+ # Manages the release cycle: maintains a Release PR that bumps the version in
4
+ # pyproject.toml and curates CHANGELOG.md from Conventional Commits (only the
5
+ # user-facing types in release-please-config.json's changelog-sections are
6
+ # surfaced). Merging that PR tags + publishes a GitHub Release.
7
+ #
8
+ # Runs with GH_CI_PAT (not the default GITHUB_TOKEN) so the Release it creates
9
+ # triggers publish-pypi.yml — events made with GITHUB_TOKEN do not start other
10
+ # workflows.
11
+
12
+ on:
13
+ push:
14
+ branches:
15
+ - main
16
+
17
+ permissions:
18
+ contents: write
19
+ pull-requests: write
20
+
21
+ jobs:
22
+ release-please:
23
+ runs-on: ubuntu-latest
24
+ steps:
25
+ - uses: googleapis/release-please-action@v4
26
+ with:
27
+ token: ${{ secrets.GH_CI_PAT }}
@@ -0,0 +1,74 @@
1
+ name: Tier 2 — M2 / ANE (self-hosted)
2
+
3
+ on:
4
+ pull_request:
5
+ # `labeled` fires when run-m2 is first added; `synchronize`/`reopened`
6
+ # re-run on every subsequent push while the label is present, so the result
7
+ # tracks the PR head instead of going stale. The `if` below keeps the run
8
+ # gated on the run-m2 label for all pull_request events.
9
+ types: [labeled, synchronize, reopened]
10
+ schedule:
11
+ # Nightly at 04:00 UTC (~05/06 in PL). Keeps the ANE path honest without
12
+ # burning the runner on every PR.
13
+ - cron: "0 4 * * *"
14
+ workflow_dispatch:
15
+
16
+ jobs:
17
+ m2:
18
+ if: |
19
+ github.event_name == 'schedule' ||
20
+ github.event_name == 'workflow_dispatch' ||
21
+ (github.event_name == 'pull_request' &&
22
+ contains(github.event.pull_request.labels.*.name, 'run-m2'))
23
+ # Self-hosted Apple Silicon runner (shared with ComfyUI-CoreMLSuite's Tier 2,
24
+ # same `coreml` label). The runner's environment MUST export
25
+ # COREML_DIFFUSION_TEST_CKPT: an absolute path to a cached single-file SD1.5
26
+ # checkpoint. The gate converts it fresh and runs the comfy-free inference
27
+ # golden — no ComfyUI involved.
28
+ runs-on: [self-hosted, macOS, ARM64, coreml]
29
+ timeout-minutes: 90
30
+ steps:
31
+ - uses: actions/checkout@v4
32
+
33
+ - uses: astral-sh/setup-uv@v7
34
+ with:
35
+ enable-cache: true
36
+
37
+ - name: uv sync
38
+ run: uv sync
39
+
40
+ - name: Check cached checkpoint
41
+ # The runner's .env must export COREML_DIFFUSION_TEST_CKPT — an absolute
42
+ # path to a cached single-file SD1.5 checkpoint.
43
+ run: |
44
+ set -euo pipefail
45
+ if [ -z "${COREML_DIFFUSION_TEST_CKPT:-}" ]; then
46
+ echo "COREML_DIFFUSION_TEST_CKPT unset — add it to the runner's .env."
47
+ exit 1
48
+ fi
49
+ test -f "$COREML_DIFFUSION_TEST_CKPT" || {
50
+ echo "checkpoint not found: $COREML_DIFFUSION_TEST_CKPT"; exit 1; }
51
+ echo "Tier 2: checkpoint \`$COREML_DIFFUSION_TEST_CKPT\`" >> "$GITHUB_STEP_SUMMARY"
52
+
53
+ - name: Convert UNet fresh (batch=2 for CFG)
54
+ # Convert on every run. The .mlpackage cache key is conversion
55
+ # *parameters* only, not the conversion code or toolchain — a stale model
56
+ # would let a conversion regression pass. batch=2 because guided CFG feeds
57
+ # uncond+cond in a single forward pass, and ANE input shapes are fixed at
58
+ # convert time.
59
+ run: |
60
+ set -euo pipefail
61
+ MLPKG="$RUNNER_TEMP/sd15_b2.mlpackage"
62
+ rm -rf "$MLPKG"
63
+ uv run coreml-diffusion convert \
64
+ --ckpt "$COREML_DIFFUSION_TEST_CKPT" \
65
+ --model-version SD15 \
66
+ --out "$MLPKG" \
67
+ --batch-size 2 --height 512 --width 512 --attn-impl SPLIT_EINSUM
68
+ echo "COREML_DIFFUSION_TEST_MLPACKAGE=$MLPKG" >> "$GITHUB_ENV"
69
+
70
+ - name: Run Tier 2 (m2 marker)
71
+ # Builds a stock diffusers pipeline around the converted UNet and asserts
72
+ # the generated image against the committed golden (exact match, else
73
+ # PSNR >= GOLDEN_PSNR_MIN_DB). VAE/text encoder on torch, UNet on the ANE.
74
+ run: uv run --no-sync pytest -m m2 tests/ -v
@@ -6,3 +6,4 @@ __pycache__/
6
6
  dist/
7
7
  build/
8
8
  *.egg-info/
9
+ CLAUDE.md
@@ -0,0 +1,3 @@
1
+ {
2
+ ".": "0.1.2"
3
+ }
@@ -0,0 +1,8 @@
1
+ # Changelog
2
+
3
+ ## [0.1.2](https://github.com/aszc-dev/coreml-diffusion/compare/v0.1.1...v0.1.2) (2026-05-27)
4
+
5
+
6
+ ### 🐛 Bug Fixes
7
+
8
+ * **attention:** convertible fp32 ORIGINAL attention for the Core ML GPU path ([#2](https://github.com/aszc-dev/coreml-diffusion/issues/2)) ([28e56fc](https://github.com/aszc-dev/coreml-diffusion/commit/28e56fcf8c2242ebbe4c05abd05f7e796069d7d1))
@@ -0,0 +1,135 @@
1
+ Metadata-Version: 2.4
2
+ Name: coreml-diffusion
3
+ Version: 0.1.2
4
+ Summary: Convert diffusion-model checkpoints (SD1.5/SDXL) to Core ML for Apple Neural Engine — framework-free, ComfyUI-independent.
5
+ Project-URL: Homepage, https://github.com/aszc-dev/coreml-diffusion
6
+ Project-URL: Repository, https://github.com/aszc-dev/coreml-diffusion
7
+ Project-URL: Issues, https://github.com/aszc-dev/coreml-diffusion/issues
8
+ Author-email: Adrian Szczepański <hi@aszc.dev>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: ane,apple-neural-engine,comfyui,core-ml,coreml,diffusers,diffusion,sdxl,stable-diffusion
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Operating System :: MacOS
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Multimedia :: Graphics :: Graphics Conversion
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Typing :: Typed
20
+ Requires-Python: <3.13,>=3.12
21
+ Requires-Dist: coremltools<10,>=9
22
+ Requires-Dist: diffusers>=0.30
23
+ Requires-Dist: numpy<3,>=2
24
+ Requires-Dist: omegaconf>=2.3
25
+ Requires-Dist: peft>=0.13
26
+ Requires-Dist: torch>=2.7
27
+ Requires-Dist: transformers<5,>=4.44
28
+ Description-Content-Type: text/markdown
29
+
30
+ # coreml-diffusion
31
+
32
+ Convert diffusion-model checkpoints into Core ML `.mlpackage` artifacts for the
33
+ Apple Neural Engine (ANE) — framework-free and standalone.
34
+
35
+ `coreml-diffusion` takes a single-file Stable Diffusion checkpoint and produces a
36
+ Core ML UNet you can run on-device (macOS/iOS) via Core ML, in a Python pipeline,
37
+ or load into any host that consumes the artifact.
38
+
39
+ ## What this is
40
+
41
+ A standalone toolkit and knowledge base for running diffusion models on the Apple
42
+ Neural Engine via Core ML. The niche is **diffusion on the ANE**: low-power,
43
+ GPU-free, embeddable in a Swift/iOS app. ANE is the differentiator — this is about
44
+ feasibility and power efficiency for SD1.5/SDXL on ANE, not a raw-throughput claim
45
+ against desktop GPUs.
46
+
47
+ The scope is diffusion architectures generally, not Stable Diffusion specifically.
48
+ The project aims to gather, in one place: the conversion path, a reproducible
49
+ benchmarking suite for objective comparison, a per-model catalogue documenting the
50
+ quirks of each architecture on the ANE, and the sources behind it all.
51
+
52
+ Supported today: SD1.5 and SDXL (verified). SDXL refiner and LCM convert but are
53
+ not yet golden-verified (experimental).
54
+
55
+ ## Install
56
+
57
+ ```sh
58
+ uv pip install coreml-diffusion # from PyPI
59
+ uv pip install -e . # from a checkout
60
+ ```
61
+
62
+ Requires Python 3.12 and (for conversion) `coremltools` 9 — conversion runs on
63
+ macOS; the package can be imported and its CLI can parse arguments on any platform.
64
+
65
+ ## CLI
66
+
67
+ ```sh
68
+ coreml-diffusion convert \
69
+ --ckpt path/to/model.safetensors \
70
+ --model-version SD15 \
71
+ --out unet.mlpackage \
72
+ --height 512 --width 512 \
73
+ --attn-impl SPLIT_EINSUM \
74
+ --quantize none
75
+ ```
76
+
77
+ Options: `--batch-size`, `--controlnet`, `--lora PATH[:STRENGTH]` (repeatable),
78
+ `--config` (original-config YAML). `--quantize {none,8,6,4}` applies k-means
79
+ weight palettization. Run `coreml-diffusion convert --help` for the full list.
80
+
81
+ The output `.mlpackage` is the deliverable: load it natively in Swift/Core ML, run
82
+ it through the Python inference pipeline below, or hand it to any consuming host.
83
+
84
+ ### Model sources
85
+
86
+ Register directories so `--ckpt` accepts a bare name instead of a full path:
87
+
88
+ ```sh
89
+ coreml-diffusion sources add comfy /path/to/ComfyUI/models # --kind comfy|flat
90
+ coreml-diffusion sources list # sources + checkpoints
91
+ coreml-diffusion convert --ckpt v1-5-pruned-emaonly --model-version SD15 --out unet.mlpackage
92
+ ```
93
+
94
+ Sources are recorded in `~/.config/coreml-diffusion/sources.toml`. `comfy` knows the
95
+ `models/{checkpoints,loras,vae,...}` layout; `flat` is a plain checkpoint directory.
96
+
97
+ ## Library
98
+
99
+ ```python
100
+ import coreml_diffusion
101
+ from coreml_diffusion import ModelVersion
102
+
103
+ coreml_diffusion.convert(
104
+ "model.safetensors", ModelVersion.SD15, "unet.mlpackage",
105
+ height=512, width=512, attn_impl="SPLIT_EINSUM",
106
+ )
107
+ ```
108
+
109
+ ## Inference (in progress)
110
+
111
+ A framework-free inference path lets a converted `.mlpackage` generate images with
112
+ no host framework: a `diffusers` pipeline runs the stock VAE / text encoder on
113
+ torch while the UNet is served from Core ML on the ANE. This doubles as the
114
+ package's own regression anchor — the Tier 2 (`m2`) golden image, asserted on an
115
+ Apple Silicon runner — and as the reference for the on-device write-up. See
116
+ `tests/m2/`.
117
+
118
+ ## Discovery API
119
+
120
+ `list_model_versions`, `list_attention_impls`, `list_quant_modes`, and
121
+ `CONTRACT_VERSION` report what this build can convert. The identifiers are an
122
+ additive-only contract: removing or renaming one is a major version bump, because
123
+ downstream consumers reference these strings verbatim.
124
+
125
+ ## ComfyUI
126
+
127
+ [ComfyUI-CoreMLSuite](https://github.com/aszc-dev/ComfyUI-CoreMLSuite) consumes
128
+ this package for its conversion path and drives its node dropdowns from the
129
+ discovery API above — installing a newer `coreml-diffusion` surfaces new
130
+ conversion types in the node with no Suite change. The Suite is one consumer;
131
+ this package neither depends on nor requires ComfyUI.
132
+
133
+ ## License
134
+
135
+ MIT
@@ -0,0 +1,106 @@
1
+ # coreml-diffusion
2
+
3
+ Convert diffusion-model checkpoints into Core ML `.mlpackage` artifacts for the
4
+ Apple Neural Engine (ANE) — framework-free and standalone.
5
+
6
+ `coreml-diffusion` takes a single-file Stable Diffusion checkpoint and produces a
7
+ Core ML UNet you can run on-device (macOS/iOS) via Core ML, in a Python pipeline,
8
+ or load into any host that consumes the artifact.
9
+
10
+ ## What this is
11
+
12
+ A standalone toolkit and knowledge base for running diffusion models on the Apple
13
+ Neural Engine via Core ML. The niche is **diffusion on the ANE**: low-power,
14
+ GPU-free, embeddable in a Swift/iOS app. ANE is the differentiator — this is about
15
+ feasibility and power efficiency for SD1.5/SDXL on ANE, not a raw-throughput claim
16
+ against desktop GPUs.
17
+
18
+ The scope is diffusion architectures generally, not Stable Diffusion specifically.
19
+ The project aims to gather, in one place: the conversion path, a reproducible
20
+ benchmarking suite for objective comparison, a per-model catalogue documenting the
21
+ quirks of each architecture on the ANE, and the sources behind it all.
22
+
23
+ Supported today: SD1.5 and SDXL (verified). SDXL refiner and LCM convert but are
24
+ not yet golden-verified (experimental).
25
+
26
+ ## Install
27
+
28
+ ```sh
29
+ uv pip install coreml-diffusion # from PyPI
30
+ uv pip install -e . # from a checkout
31
+ ```
32
+
33
+ Requires Python 3.12 and (for conversion) `coremltools` 9 — conversion runs on
34
+ macOS; the package can be imported and its CLI can parse arguments on any platform.
35
+
36
+ ## CLI
37
+
38
+ ```sh
39
+ coreml-diffusion convert \
40
+ --ckpt path/to/model.safetensors \
41
+ --model-version SD15 \
42
+ --out unet.mlpackage \
43
+ --height 512 --width 512 \
44
+ --attn-impl SPLIT_EINSUM \
45
+ --quantize none
46
+ ```
47
+
48
+ Options: `--batch-size`, `--controlnet`, `--lora PATH[:STRENGTH]` (repeatable),
49
+ `--config` (original-config YAML). `--quantize {none,8,6,4}` applies k-means
50
+ weight palettization. Run `coreml-diffusion convert --help` for the full list.
51
+
52
+ The output `.mlpackage` is the deliverable: load it natively in Swift/Core ML, run
53
+ it through the Python inference pipeline below, or hand it to any consuming host.
54
+
55
+ ### Model sources
56
+
57
+ Register directories so `--ckpt` accepts a bare name instead of a full path:
58
+
59
+ ```sh
60
+ coreml-diffusion sources add comfy /path/to/ComfyUI/models # --kind comfy|flat
61
+ coreml-diffusion sources list # sources + checkpoints
62
+ coreml-diffusion convert --ckpt v1-5-pruned-emaonly --model-version SD15 --out unet.mlpackage
63
+ ```
64
+
65
+ Sources are recorded in `~/.config/coreml-diffusion/sources.toml`. `comfy` knows the
66
+ `models/{checkpoints,loras,vae,...}` layout; `flat` is a plain checkpoint directory.
67
+
68
+ ## Library
69
+
70
+ ```python
71
+ import coreml_diffusion
72
+ from coreml_diffusion import ModelVersion
73
+
74
+ coreml_diffusion.convert(
75
+ "model.safetensors", ModelVersion.SD15, "unet.mlpackage",
76
+ height=512, width=512, attn_impl="SPLIT_EINSUM",
77
+ )
78
+ ```
79
+
80
+ ## Inference (in progress)
81
+
82
+ A framework-free inference path lets a converted `.mlpackage` generate images with
83
+ no host framework: a `diffusers` pipeline runs the stock VAE / text encoder on
84
+ torch while the UNet is served from Core ML on the ANE. This doubles as the
85
+ package's own regression anchor — the Tier 2 (`m2`) golden image, asserted on an
86
+ Apple Silicon runner — and as the reference for the on-device write-up. See
87
+ `tests/m2/`.
88
+
89
+ ## Discovery API
90
+
91
+ `list_model_versions`, `list_attention_impls`, `list_quant_modes`, and
92
+ `CONTRACT_VERSION` report what this build can convert. The identifiers are an
93
+ additive-only contract: removing or renaming one is a major version bump, because
94
+ downstream consumers reference these strings verbatim.
95
+
96
+ ## ComfyUI
97
+
98
+ [ComfyUI-CoreMLSuite](https://github.com/aszc-dev/ComfyUI-CoreMLSuite) consumes
99
+ this package for its conversion path and drives its node dropdowns from the
100
+ discovery API above — installing a newer `coreml-diffusion` surfaces new
101
+ conversion types in the node with no Suite change. The Suite is one consumer;
102
+ this package neither depends on nor requires ComfyUI.
103
+
104
+ ## License
105
+
106
+ MIT
@@ -40,6 +40,8 @@ __all__ = [
40
40
  "compose_out_name",
41
41
  "lora_names_from_params",
42
42
  "convert",
43
+ "build_pipeline",
44
+ "CoreMLUNet",
43
45
  ]
44
46
 
45
47
 
@@ -105,4 +107,8 @@ def __getattr__(name):
105
107
  from coreml_diffusion.convert import convert as _convert
106
108
 
107
109
  return _convert
110
+ if name in ("build_pipeline", "CoreMLUNet"):
111
+ from coreml_diffusion import inference
112
+
113
+ return getattr(inference, name)
108
114
  raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -14,6 +14,7 @@ Example:
14
14
  import argparse
15
15
 
16
16
  import coreml_diffusion
17
+ from coreml_diffusion import sources
17
18
 
18
19
 
19
20
  def _parse_lora(spec):
@@ -31,8 +32,9 @@ def _parse_lora(spec):
31
32
  def _convert_cmd(args):
32
33
  sample_size = (args.height // 8, args.width // 8)
33
34
  lora_weights = [_parse_lora(spec) for spec in (args.lora or [])]
35
+ ckpt = sources.resolve_checkpoint(args.ckpt, args.source)
34
36
  coreml_diffusion.convert(
35
- args.ckpt,
37
+ ckpt,
36
38
  coreml_diffusion.ModelVersion[args.model_version],
37
39
  args.out,
38
40
  batch_size=args.batch_size,
@@ -45,6 +47,31 @@ def _convert_cmd(args):
45
47
  )
46
48
 
47
49
 
50
def _sources_add_cmd(args):
    """Handle ``sources add``: register a source and confirm it on stdout.

    Passes ``args.name``, ``args.path`` and ``args.kind`` to
    ``sources.add_source`` and prints the ``kind`` and ``path`` of the entry
    it returns.
    """
    registered = sources.add_source(args.name, args.path, args.kind)
    kind = registered["kind"]
    location = registered["path"]
    print(f"Added source {args.name!r} ({kind}): {location}")
53
+
54
+
55
def _sources_list_cmd(args):
    """Handle ``sources list``: print each registered source and its checkpoints.

    Sources are printed in name order. When nothing is registered, the path
    reported by ``sources.config_path()`` is printed instead so the user can
    see which config file was consulted.
    """
    known = sources.load_sources()
    if not known:
        print(f"No sources registered. Config: {sources.config_path()}")
        return

    # Names are dict keys, hence unique: sorting the keys gives the same order
    # as sorting the (name, entry) pairs.
    for name in sorted(known):
        entry = known[name]
        stems = sources.iter_checkpoints(entry)
        print(f"{name} ({entry['kind']}): {entry['path']}")
        if not stems:
            print(" (no checkpoints found)")
            continue
        for stem in stems:
            print(f" - {stem}")
68
+
69
+
70
def _sources_remove_cmd(args):
    """Handle ``sources remove``: unregister ``args.name`` and confirm on stdout."""
    target = args.name
    sources.remove_source(target)
    print(f"Removed source {target!r}")
73
+
74
+
48
75
  def build_parser():
49
76
  parser = argparse.ArgumentParser(
50
77
  prog="coreml-diffusion",
@@ -54,7 +81,15 @@ def build_parser():
54
81
 
55
82
  conv = sub.add_parser("convert", help="Convert a checkpoint's UNet to a .mlpackage")
56
83
  conv.add_argument(
57
- "--ckpt", required=True, help="Path to the source .safetensors checkpoint"
84
+ "--ckpt",
85
+ required=True,
86
+ help="Checkpoint path, or a name resolved against registered sources "
87
+ "(see 'coreml-diffusion sources')",
88
+ )
89
+ conv.add_argument(
90
+ "--source",
91
+ default=None,
92
+ help="Restrict --ckpt name resolution to this registered source",
58
93
  )
59
94
  conv.add_argument(
60
95
  "--model-version",
@@ -101,6 +136,28 @@ def build_parser():
101
136
  help="K-means weight palettization bits (default none = unquantized)",
102
137
  )
103
138
  conv.set_defaults(func=_convert_cmd)
139
+
140
+ src = sub.add_parser("sources", help="Manage model source directories")
141
+ src_sub = src.add_subparsers(dest="sources_command", required=True)
142
+
143
+ s_add = src_sub.add_parser("add", help="Register (or overwrite) a model source")
144
+ s_add.add_argument("name", help="Short name for the source, e.g. 'comfy'")
145
+ s_add.add_argument("path", help="Base directory of the source")
146
+ s_add.add_argument(
147
+ "--kind",
148
+ choices=sources.SOURCE_KINDS,
149
+ default="comfy",
150
+ help="Directory layout (default comfy: models/{checkpoints,loras,vae,...})",
151
+ )
152
+ s_add.set_defaults(func=_sources_add_cmd)
153
+
154
+ s_list = src_sub.add_parser("list", help="List sources and their checkpoints")
155
+ s_list.set_defaults(func=_sources_list_cmd)
156
+
157
+ s_rm = src_sub.add_parser("remove", help="Unregister a source")
158
+ s_rm.add_argument("name", help="Source name to remove")
159
+ s_rm.set_defaults(func=_sources_remove_cmd)
160
+
104
161
  return parser
105
162
 
106
163
 
@@ -9,6 +9,7 @@ CHUNK_SIZE = 512
9
9
 
10
10
  def apply_attention_implementation(unet, attention_implementation):
11
11
  if attention_implementation == "ORIGINAL":
12
+ unet.set_attn_processor(OriginalAttnProcessor())
12
13
  return unet
13
14
 
14
15
  if attention_implementation == "SPLIT_EINSUM":
@@ -24,6 +25,43 @@ def apply_attention_implementation(unet, attention_implementation):
24
25
  )
25
26
 
26
27
 
28
class OriginalAttnProcessor:
    """Attention processor for the ORIGINAL implementation (Core ML GPU path).

    Performs full, non-split multi-head attention and keeps the score path
    (QK^T and softmax) in fp32. SPLIT_EINSUM* remain the ANE-friendly default;
    this processor exists for the GPU path.

    Why not reuse what diffusers ships:

    * Stock attention goes through ``F.scaled_dot_product_attention`` and
      ``view(B, -1, heads, d)`` reshapes, which fail to convert under
      coremltools 9 (the einsum graph used by SPLIT_EINSUM converts cleanly).
    * The legacy ``AttnProcessor`` relies on ``get_attention_scores``, which
      allocates the score buffer via ``torch.empty(query.shape[0], ...)``; that
      dynamic int shape also fails under coremltools 9.

    The processor therefore shares the conversion-safe ``_attention_forward``
    boilerplate with SPLIT_EINSUM and plugs in the ``original`` kernel. The
    fp32 upcast matters: in fp16, self-attention at 64x64 latents (4096 query
    tokens) overflows to inf and becomes NaN after softmax.
    """

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
        *args,
        **kwargs,
    ):
        # Extra positional/keyword arguments are accepted but not forwarded.
        forwarded = (
            attn,
            hidden_states,
            encoder_hidden_states,
            attention_mask,
            temb,
        )
        return _attention_forward(*forwarded, original)
63
+
64
+
27
65
  class SplitEinsumAttnProcessor:
28
66
  def __call__(
29
67
  self,
@@ -158,6 +196,29 @@ def _attention_forward(
158
196
  return hidden_states
159
197
 
160
198
 
199
def original(q, k, v, mask, heads, dim_head):
    """Full multi-head attention whose scores and softmax are computed in fp32.

    Uses the same ``[B, C, 1, S]`` channel-major layout and mask convention as
    ``split_einsum`` so it can be passed to ``_attention_forward`` as a drop-in
    kernel. Instead of splitting per head, the whole score matrix is produced
    by one batched einsum. The fp32 upcast keeps the softmax stable when the
    converted model runs in fp16 (QK^T at 4096 tokens overflows fp16 otherwise).

    Args:
        q, k, v: projected query/key/value tensors; each is viewed as
            ``(batch, heads, dim_head, tokens)``.
        mask: optional additive mask, added to the scores before softmax.
        heads: number of attention heads.
        dim_head: channels per head.

    Returns:
        Tensor of shape ``(batch, heads * dim_head, 1, query_tokens)`` in the
        dtype of ``v``.
    """
    batch = q.size(0)
    per_head = (batch, heads, dim_head, -1)
    scale = dim_head**-0.5

    # Only the score operands are upcast; values keep their incoming dtype.
    queries = q.view(*per_head).float()
    keys = k.view(*per_head).float()
    values = v.view(*per_head)

    # Scores are laid out as (batch, key, head, query).
    scores = torch.einsum("becq,beck->bkeq", queries, keys) * scale
    if mask is not None:
        scores = scores + mask
    # Normalise over the key axis, then return to the value dtype.
    probs = scores.softmax(dim=1).to(values.dtype)

    attended = torch.einsum("bkeq,beck->becq", probs, values)
    return attended.reshape(batch, heads * dim_head, 1, -1)
220
+
221
+
161
222
  def split_einsum(q, k, v, mask, heads, dim_head):
162
223
  q_heads = _split_heads(q, heads, dim_head)
163
224
  k = k.transpose(1, 3)
@@ -36,7 +36,11 @@ def get_unet(model_version: ModelVersion, ref_unet, attention_implementation):
36
36
  ref_unet.eval(),
37
37
  attention_implementation,
38
38
  )
39
- return CoreMLUNetWrapper(unet, model_version)
39
+ # The freshly built wrapper defaults to training mode; the inner UNet is
40
+ # already eval, but coremltools inspects the top-level traced module and warns
41
+ # ("Model is not in eval mode"). eval() on the wrapper silences it and makes
42
+ # the eval-mode trace explicit (output is unchanged — UNet dropout p=0).
43
+ return CoreMLUNetWrapper(unet, model_version).eval()
40
44
 
41
45
 
42
46
  def get_encoder_hidden_states_shape(ref_unet, batch_size):