PyPI - coreml-diffusion - Versions diffs - 0.1.1__tar.gz → 0.1.2__tar.gz - Mend

coreml-diffusion 0.1.1 (tar.gz) → 0.1.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

coreml_diffusion-0.1.2/.github/workflows/release-please.yml ADDED Viewed

@@ -0,0 +1,27 @@
+name: Release Please
+# Manages the release cycle: maintains a Release PR that bumps the version in
+# pyproject.toml and curates CHANGELOG.md from Conventional Commits (only the
+# user-facing types in release-please-config.json's changelog-sections are
+# surfaced). Merging that PR tags + publishes a GitHub Release.
+#
+# Runs with GH_CI_PAT (not the default GITHUB_TOKEN) so the Release it creates
+# triggers publish-pypi.yml — events made with GITHUB_TOKEN do not start other
+# workflows.
+on:
+  push:
+    branches:
+      - main
+permissions:
+  contents: write
+  pull-requests: write
+jobs:
+  release-please:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: googleapis/release-please-action@v4
+        with:
+          token: ${{ secrets.GH_CI_PAT }}

coreml_diffusion-0.1.2/.release-please-manifest.json ADDED Viewed

@@ -0,0 +1,3 @@
+{
+  ".": "0.1.2"
+}

coreml_diffusion-0.1.2/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,8 @@
+# Changelog
+## [0.1.2](https://github.com/aszc-dev/coreml-diffusion/compare/v0.1.1...v0.1.2) (2026-05-27)
+### 🐛 Bug Fixes
+* **attention:** convertible fp32 ORIGINAL attention for the Core ML GPU path ([#2](https://github.com/aszc-dev/coreml-diffusion/issues/2)) ([28e56fc](https://github.com/aszc-dev/coreml-diffusion/commit/28e56fcf8c2242ebbe4c05abd05f7e796069d7d1))

{coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: coreml-diffusion
-Version: 0.1.1
+Version: 0.1.2
 Summary: Convert diffusion-model checkpoints (SD1.5/SDXL) to Core ML for Apple Neural Engine — framework-free, ComfyUI-independent.
 Project-URL: Homepage, https://github.com/aszc-dev/coreml-diffusion
 Project-URL: Repository, https://github.com/aszc-dev/coreml-diffusion

{coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/attention.py RENAMED Viewed

@@ -9,6 +9,7 @@ CHUNK_SIZE = 512
 def apply_attention_implementation(unet, attention_implementation):
     if attention_implementation == "ORIGINAL":
+        unet.set_attn_processor(OriginalAttnProcessor())
         return unet
     if attention_implementation == "SPLIT_EINSUM":
@@ -24,6 +25,43 @@ def apply_attention_implementation(unet, attention_implementation):
     )
+class OriginalAttnProcessor:
+    """Full (non-split) multi-head attention with an fp32 score path.
+    The ORIGINAL implementation targets the Core ML GPU path (SPLIT_EINSUM* are
+    the ANE-friendly default). It is *not* diffusers' stock attention: that path
+    routes through ``F.scaled_dot_product_attention`` plus ``view(B, -1, heads,
+    d)`` reshapes that fail to convert under coremltools 9 (the same einsum graph
+    SPLIT_EINSUM uses converts cleanly). Nor is it diffusers' legacy
+    ``AttnProcessor`` — its ``get_attention_scores`` builds the score buffer with
+    ``torch.empty(query.shape[0], ...)``, whose dynamic int shape also fails ct9.
+    So this reuses the SPLIT_EINSUM conversion-safe boilerplate and supplies a
+    plain full-attention kernel that upcasts QK^T + softmax to fp32. Without the
+    upcast, fp16 self-attention at 64x64 latents (4096 query tokens) overflows ->
+    inf -> NaN after softmax.
+    """
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        return _attention_forward(
+            attn,
+            hidden_states,
+            encoder_hidden_states,
+            attention_mask,
+            temb,
+            original,
+        )
 class SplitEinsumAttnProcessor:
     def __call__(
         self,
@@ -158,6 +196,29 @@ def _attention_forward(
     return hidden_states
+def original(q, k, v, mask, heads, dim_head):
+    """Full multi-head attention with the QK^T scaling + softmax in fp32.
+    Same ``[B, C, 1, S]`` channel-major layout and mask convention as
+    ``split_einsum`` (so it slots into ``_attention_forward`` unchanged), but
+    computes the whole score matrix per head in one batched einsum instead of the
+    per-head split. Upcasting the scores to fp32 keeps the softmax stable when the
+    converted model runs in fp16 (QK^T at 4096 tokens overflows fp16 otherwise).
+    """
+    batch = q.size(0)
+    mh_q = q.view(batch, heads, dim_head, -1).float()
+    mh_k = k.view(batch, heads, dim_head, -1).float()
+    mh_v = v.view(batch, heads, dim_head, -1)
+    weights = torch.einsum("becq,beck->bkeq", mh_q, mh_k) * (dim_head**-0.5)
+    if mask is not None:
+        weights = weights + mask
+    weights = weights.softmax(dim=1).to(mh_v.dtype)
+    outputs = torch.einsum("bkeq,beck->becq", weights, mh_v)
+    return outputs.reshape(batch, heads * dim_head, 1, -1)
 def split_einsum(q, k, v, mask, heads, dim_head):
     q_heads = _split_heads(q, heads, dim_head)
     k = k.transpose(1, 3)

{coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/convert.py RENAMED Viewed

@@ -36,7 +36,11 @@ def get_unet(model_version: ModelVersion, ref_unet, attention_implementation):
         ref_unet.eval(),
         attention_implementation,
     )
-    return CoreMLUNetWrapper(unet, model_version)
+    # The freshly built wrapper defaults to training mode; the inner UNet is
+    # already eval, but coremltools inspects the top-level traced module and warns
+    # ("Model is not in eval mode"). eval() on the wrapper silences it and makes
+    # the eval-mode trace explicit (output is unchanged — UNet dropout p=0).
+    return CoreMLUNetWrapper(unet, model_version).eval()
 def get_encoder_hidden_states_shape(ref_unet, batch_size):

{coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [project]
 name = "coreml-diffusion"
 description = "Convert diffusion-model checkpoints (SD1.5/SDXL) to Core ML for Apple Neural Engine — framework-free, ComfyUI-independent."
-version = "0.1.1"
+version = "0.1.2"
 license = "MIT"
 license-files = ["LICENSE"]
 requires-python = ">=3.12,<3.13"

coreml_diffusion-0.1.2/release-please-config.json ADDED Viewed

@@ -0,0 +1,22 @@
+{
+  "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json",
+  "packages": {
+    ".": {
+      "release-type": "python",
+      "bump-minor-pre-major": true,
+      "bump-patch-for-minor-pre-major": true,
+      "changelog-sections": [
+        { "type": "feat", "section": "✨ Features", "hidden": false },
+        { "type": "fix", "section": "🐛 Bug Fixes", "hidden": false },
+        { "type": "perf", "section": "⚡ Performance", "hidden": false },
+        { "type": "docs", "section": "📚 Documentation", "hidden": false },
+        { "type": "chore", "hidden": true },
+        { "type": "build", "hidden": true },
+        { "type": "ci", "hidden": true },
+        { "type": "refactor", "hidden": true },
+        { "type": "test", "hidden": true },
+        { "type": "style", "hidden": true }
+      ]
+    }
+  }
+}

coreml_diffusion-0.1.2/tests/m2/test_original_gpu.py ADDED Viewed

@@ -0,0 +1,106 @@
+"""[M2-GPU] ORIGINAL attention correctness gate on the Core ML GPU path.
+ORIGINAL is the GPU-targeted attention implementation (SPLIT_EINSUM is the
+ANE-friendly default and is what the image golden in test_inference_golden.py
+exercises on the ANE). ORIGINAL is retained for broader coremltools coverage and
+benchmarks where Core ML on the GPU is a valid target.
+This is the real-hardware half of the fp16-overflow fix: the CPU smoke test
+(tests/smoke/test_original_attention.py) proves the torch graph upcasts QK^T /
+softmax to fp32, but only a converted model run through coremltools' FLOAT16
+precision proves that upcast *survives* to the GPU. If it were downcast away,
+SD1.5 self-attention at 64x64 (4096 query tokens) would overflow fp16 -> NaN.
+Convert a real SD1.5 UNet with ORIGINAL, run a single forward on the GPU, and
+gate on: output is finite (the overflow guard) AND cosine similarity to a torch
+fp32 reference UNet stays high (the model still computes the right thing).
+Requires Apple Silicon and a single-file SD1.5 checkpoint:
+  COREML_DIFFUSION_TEST_CKPT   absolute path to the checkpoint
+  ORIGINAL_GPU_COSINE_MIN      optional cosine floor (default 0.99)
+Skips otherwise, so Tier 0 on Linux is unaffected.
+"""
+import os
+import platform
+import numpy as np
+import pytest
+CKPT = os.environ.get("COREML_DIFFUSION_TEST_CKPT")
+COSINE_MIN = float(os.environ.get("ORIGINAL_GPU_COSINE_MIN", "0.99"))
+SAMPLE_SHAPE = (1, 4, 64, 64)  # 512x512 latent — the fp16-overflow case
+TIMESTEP = 999.0
+TOKENS = 77
+SEED = 0
+def _cosine(a: np.ndarray, b: np.ndarray) -> float:
+    a = a.astype(np.float64).ravel()
+    b = b.astype(np.float64).ravel()
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+@pytest.fixture
+def prerequisites():
+    if platform.machine() != "arm64":
+        pytest.skip("requires Apple Silicon")
+    if not CKPT:
+        pytest.skip(
+            "set COREML_DIFFUSION_TEST_CKPT to a single-file SD1.5 checkpoint "
+            "to run the Tier 2 ORIGINAL/GPU correctness gate"
+        )
+def test_original_attention_survives_coreml_to_gpu(prerequisites, tmp_path):
+    import coremltools as ct
+    import torch
+    from coreml_diffusion import ModelVersion
+    from coreml_diffusion.convert import convert, load_unet
+    out_path = tmp_path / "sd15_original_b1.mlpackage"
+    convert(
+        CKPT,
+        ModelVersion.SD15,
+        str(out_path),
+        batch_size=1,
+        sample_size=(64, 64),
+        attn_impl="ORIGINAL",
+    )
+    # Fixed input shared by both the converted model and the torch reference.
+    torch.manual_seed(SEED)
+    sample = torch.randn(*SAMPLE_SHAPE)
+    timestep = torch.tensor([TIMESTEP])
+    cross_dim = load_unet(CKPT, None).config.cross_attention_dim
+    encoder_hidden_states = torch.randn(1, TOKENS, cross_dim)
+    # Ground truth: the canonical UNet math in fp32 on torch (stock diffusers).
+    ref_unet = load_unet(CKPT, None).eval()
+    with torch.no_grad():
+        reference = ref_unet(
+            sample,
+            timestep,
+            encoder_hidden_states=encoder_hidden_states,
+            return_dict=False,
+        )[0].numpy()
+    # Converted ORIGINAL UNet on the Core ML GPU path (fp16 weights/activations,
+    # attention upcast to fp32 inside the graph).
+    model = ct.models.MLModel(str(out_path), compute_units=ct.ComputeUnit.CPU_AND_GPU)
+    predicted = model.predict(
+        {
+            "sample": sample.numpy().astype(np.float16),
+            "timestep": timestep.numpy().astype(np.float16),
+            "encoder_hidden_states": encoder_hidden_states.numpy().astype(np.float16),
+        }
+    )["noise_pred"]
+    assert np.isfinite(predicted).all(), "fp16 attention overflowed to NaN/inf"
+    cosine = _cosine(predicted, reference)
+    assert cosine >= COSINE_MIN, (
+        f"cosine {cosine:.5f} < {COSINE_MIN}: ORIGINAL/GPU output diverged from "
+        f"the torch fp32 reference"
+    )

coreml_diffusion-0.1.2/tests/smoke/test_original_attention.py ADDED Viewed

@@ -0,0 +1,95 @@
+"""Tier 1 smoke: the ORIGINAL attention path is convertible and fp16-safe.
+Two regressions are guarded here:
+1. ORIGINAL must not be a no-op. It used to ``return unet`` unchanged, leaving
+   diffusers' stock ``AttnProcessor2_0`` (SDPA), which does not even convert under
+   coremltools 9 (its ``view(B, -1, heads, d)`` reshapes fail the torch frontend).
+   ``apply_attention_implementation(unet, "ORIGINAL")`` must install our
+   conversion-safe ``OriginalAttnProcessor`` instead.
+2. The ORIGINAL kernel upcasts QK^T / softmax to fp32. Run in fp16, full
+   self-attention at SD1.5's highest-resolution block overflows fp16 in the score
+   matmul -> inf -> NaN; the fp32 score path keeps it finite.
+Needs only torch + diffusers (no coremltools), but lives in Tier 1 because
+tests/unit/ must stay free of heavy imports.
+"""
+import platform
+import pytest
+import torch
+from diffusers import UNet2DConditionModel
+from diffusers.models.attention_processor import Attention, AttnProcessor2_0
+from coreml_diffusion.conversion.attention import (
+    OriginalAttnProcessor,
+    apply_attention_implementation,
+    original,
+)
+pytestmark = pytest.mark.skipif(
+    platform.system() != "Darwin" or platform.machine() != "arm64",
+    reason="Tier 1 requires macOS on Apple Silicon",
+)
+def _tiny_unet():
+    """Minimal UNet2DConditionModel with cross-attention blocks (so it owns a few
+    ``Attention`` modules) — small enough to construct in milliseconds."""
+    return UNet2DConditionModel(
+        sample_size=8,
+        in_channels=4,
+        out_channels=4,
+        layers_per_block=1,
+        block_out_channels=(32, 64),
+        down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+        up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
+        cross_attention_dim=32,
+        attention_head_dim=8,
+        norm_num_groups=8,
+    )
+def test_original_installs_convertible_processor():
+    """The bug was that ORIGINAL returned the UNet unchanged, leaving the stock
+    SDPA processor (which does not convert under ct9). Lock in: the default is
+    SDPA, and ORIGINAL swaps every Attention to OriginalAttnProcessor."""
+    torch.manual_seed(0)
+    unet = _tiny_unet()
+    attn_modules = [m for m in unet.modules() if isinstance(m, Attention)]
+    assert attn_modules, "expected the tiny UNet to own Attention modules"
+    # Pre-condition: diffusers' default is SDPA.
+    assert all(isinstance(m.processor, AttnProcessor2_0) for m in attn_modules)
+    apply_attention_implementation(unet, "ORIGINAL")
+    # Post-condition: our conversion-safe full-attention processor everywhere.
+    assert all(isinstance(m.processor, OriginalAttnProcessor) for m in attn_modules)
+def test_original_upcast_keeps_fp16_attention_finite():
+    """Reproduce the failure mode: a large QK^T overflows fp16 to NaN, but the
+    ORIGINAL kernel's fp32 score path keeps the output finite."""
+    torch.manual_seed(0)
+    heads, dim_head, seq = 8, 40, 64
+    channels = heads * dim_head
+    # [B, C, 1, S] layout (what _attention_forward hands the kernel). Large q/k so
+    # the score matmul exceeds fp16's 65504 ceiling.
+    q = (torch.randn(1, channels, 1, seq) * 60).half()
+    k = (torch.randn(1, channels, 1, seq) * 60).half()
+    v = torch.randn(1, channels, 1, seq).half()
+    # Naive fp16 scores (no upcast) overflow -> inf -> NaN after softmax.
+    mh_q = q.view(1, heads, dim_head, -1)
+    mh_k = k.view(1, heads, dim_head, -1)
+    naive_scores = torch.einsum("becq,beck->bkeq", mh_q, mh_k) * (dim_head**-0.5)
+    assert torch.isnan(naive_scores.softmax(dim=1)).any(), "expected fp16 overflow"
+    # original() upcasts the score matmul + softmax to fp32 -> finite output.
+    out = original(q, k, v, None, heads, dim_head)
+    assert torch.isfinite(out).all()
+    assert out.shape == (1, channels, 1, seq)

{coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/unit/test_conversion_helpers.py RENAMED Viewed

@@ -6,6 +6,7 @@ from coreml_diffusion.conversion.attention import (
     SplitEinsumAttnProcessor,
     SplitEinsumV2AttnProcessor,
     apply_attention_implementation,
+    original,
     split_einsum,
     split_einsum_v2,
 )
@@ -106,6 +107,24 @@ def test_unet_wrapper_routes_sdxl_added_conditioning():
     assert unet.call["added_cond_kwargs"]["text_embeds"] is text_embeds
+def test_original_matches_reference_attention_math():
+    torch.manual_seed(0)
+    batch = 2
+    heads = 3
+    dim_head = 4
+    sequence = 16
+    channels = heads * dim_head
+    q = torch.randn(batch, channels, 1, sequence)
+    k = torch.randn(batch, channels, 1, sequence)
+    v = torch.randn(batch, channels, 1, sequence)
+    expected = _original_attention(q, k, v, None, heads, dim_head)
+    # original() upcasts to fp32 internally, so on fp32 inputs it reproduces the
+    # reference math; the bkeq score layout differs but the semantics do not.
+    assert torch.allclose(original(q, k, v, None, heads, dim_head), expected, atol=1e-6)
 def test_split_einsum_matches_original_attention_math():
     torch.manual_seed(0)
     batch = 2
@@ -151,11 +170,13 @@ def test_split_einsum_v2_chunked_path_matches_original_attention_math():
 def test_apply_attention_implementation_sets_split_processors():
+    # ORIGINAL is intentionally absent here: asserting it installs
+    # OriginalAttnProcessor on real Attention modules needs diffusers (heavy) and
+    # would break the Tier-0 framework-free promise. That path is covered by
+    # tests/smoke/test_original_attention.py. (The kernel itself, original(), is
+    # framework-free and is exercised by test_original_matches_reference_*.)
     unet = RecordingProcessorUNet()
-    assert apply_attention_implementation(unet, "ORIGINAL") is unet
-    assert unet.processor is None
     apply_attention_implementation(unet, "SPLIT_EINSUM")
     assert isinstance(unet.processor, SplitEinsumAttnProcessor)