coreml-diffusion 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. coreml_diffusion-0.1.2/.github/workflows/release-please.yml +27 -0
  2. coreml_diffusion-0.1.2/.release-please-manifest.json +3 -0
  3. coreml_diffusion-0.1.2/CHANGELOG.md +8 -0
  4. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/PKG-INFO +1 -1
  5. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/attention.py +61 -0
  6. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/convert.py +5 -1
  7. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/pyproject.toml +1 -1
  8. coreml_diffusion-0.1.2/release-please-config.json +22 -0
  9. coreml_diffusion-0.1.2/tests/m2/test_original_gpu.py +106 -0
  10. coreml_diffusion-0.1.2/tests/smoke/test_original_attention.py +95 -0
  11. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/unit/test_conversion_helpers.py +24 -3
  12. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/.github/workflows/publish-pypi.yml +0 -0
  13. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/.github/workflows/tier0.yml +0 -0
  14. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/.github/workflows/tier1.yml +0 -0
  15. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/.github/workflows/tier2.yml +0 -0
  16. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/.gitignore +0 -0
  17. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/LICENSE +0 -0
  18. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/README.md +0 -0
  19. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/__init__.py +0 -0
  20. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/attention.py +0 -0
  21. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/cli.py +0 -0
  22. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/__init__.py +0 -0
  23. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/shapes.py +0 -0
  24. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/trace.py +0 -0
  25. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/conversion/unet.py +0 -0
  26. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/inference.py +0 -0
  27. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/logger.py +0 -0
  28. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/model_version.py +0 -0
  29. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/naming.py +0 -0
  30. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/coreml_diffusion/sources.py +0 -0
  31. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/conftest.py +0 -0
  32. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/m2/goldens/sd15_astronaut.png +0 -0
  33. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/m2/goldens/sd15_astronaut.sha256 +0 -0
  34. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/m2/test_inference_golden.py +0 -0
  35. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/smoke/test_split_einsum_attention.py +0 -0
  36. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/smoke/test_synthetic_unet.py +0 -0
  37. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/unit/test_characterization_out_name.py +0 -0
  38. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/unit/test_cli.py +0 -0
  39. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/unit/test_discovery_api.py +0 -0
  40. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/unit/test_sources.py +0 -0
  41. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/tests/unit/test_tier0_purity.py +0 -0
  42. {coreml_diffusion-0.1.1 → coreml_diffusion-0.1.2}/uv.lock +0 -0
@@ -0,0 +1,27 @@
1
+ name: Release Please
2
+
3
+ # Manages the release cycle: maintains a Release PR that bumps the version in
4
+ # pyproject.toml and curates CHANGELOG.md from Conventional Commits (only the
5
+ # user-facing types in release-please-config.json's changelog-sections are
6
+ # surfaced). Merging that PR tags + publishes a GitHub Release.
7
+ #
8
+ # Runs with GH_CI_PAT (not the default GITHUB_TOKEN) so the Release it creates
9
+ # triggers publish-pypi.yml — events made with GITHUB_TOKEN do not start other
10
+ # workflows.
11
+
12
+ on:
13
+ push:
14
+ branches:
15
+ - main
16
+
17
+ permissions:
18
+ contents: write
19
+ pull-requests: write
20
+
21
+ jobs:
22
+ release-please:
23
+ runs-on: ubuntu-latest
24
+ steps:
25
+ - uses: googleapis/release-please-action@v4
26
+ with:
27
+ token: ${{ secrets.GH_CI_PAT }}
@@ -0,0 +1,3 @@
1
+ {
2
+ ".": "0.1.2"
3
+ }
@@ -0,0 +1,8 @@
1
+ # Changelog
2
+
3
+ ## [0.1.2](https://github.com/aszc-dev/coreml-diffusion/compare/v0.1.1...v0.1.2) (2026-05-27)
4
+
5
+
6
+ ### 🐛 Bug Fixes
7
+
8
+ * **attention:** convertible fp32 ORIGINAL attention for the Core ML GPU path ([#2](https://github.com/aszc-dev/coreml-diffusion/issues/2)) ([28e56fc](https://github.com/aszc-dev/coreml-diffusion/commit/28e56fcf8c2242ebbe4c05abd05f7e796069d7d1))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coreml-diffusion
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: Convert diffusion-model checkpoints (SD1.5/SDXL) to Core ML for Apple Neural Engine — framework-free, ComfyUI-independent.
5
5
  Project-URL: Homepage, https://github.com/aszc-dev/coreml-diffusion
6
6
  Project-URL: Repository, https://github.com/aszc-dev/coreml-diffusion
@@ -9,6 +9,7 @@ CHUNK_SIZE = 512
9
9
 
10
10
  def apply_attention_implementation(unet, attention_implementation):
11
11
  if attention_implementation == "ORIGINAL":
12
+ unet.set_attn_processor(OriginalAttnProcessor())
12
13
  return unet
13
14
 
14
15
  if attention_implementation == "SPLIT_EINSUM":
@@ -24,6 +25,43 @@ def apply_attention_implementation(unet, attention_implementation):
24
25
  )
25
26
 
26
27
 
28
+ class OriginalAttnProcessor:
29
+ """Full (non-split) multi-head attention with an fp32 score path.
30
+
31
+ The ORIGINAL implementation targets the Core ML GPU path (SPLIT_EINSUM* are
32
+ the ANE-friendly default). It is *not* diffusers' stock attention: that path
33
+ routes through ``F.scaled_dot_product_attention`` plus ``view(B, -1, heads,
34
+ d)`` reshapes that fail to convert under coremltools 9 (the same einsum graph
35
+ SPLIT_EINSUM uses converts cleanly). Nor is it diffusers' legacy
36
+ ``AttnProcessor`` — its ``get_attention_scores`` builds the score buffer with
37
+ ``torch.empty(query.shape[0], ...)``, whose dynamic int shape also fails ct9.
38
+
39
+ So this reuses the SPLIT_EINSUM conversion-safe boilerplate and supplies a
40
+ plain full-attention kernel that upcasts QK^T + softmax to fp32. Without the
41
+ upcast, fp16 self-attention at 64x64 latents (4096 query tokens) overflows ->
42
+ inf -> NaN after softmax.
43
+ """
44
+
45
+ def __call__(
46
+ self,
47
+ attn,
48
+ hidden_states,
49
+ encoder_hidden_states=None,
50
+ attention_mask=None,
51
+ temb=None,
52
+ *args,
53
+ **kwargs,
54
+ ):
55
+ return _attention_forward(
56
+ attn,
57
+ hidden_states,
58
+ encoder_hidden_states,
59
+ attention_mask,
60
+ temb,
61
+ original,
62
+ )
63
+
64
+
27
65
  class SplitEinsumAttnProcessor:
28
66
  def __call__(
29
67
  self,
@@ -158,6 +196,29 @@ def _attention_forward(
158
196
  return hidden_states
159
197
 
160
198
 
199
+ def original(q, k, v, mask, heads, dim_head):
200
+ """Full multi-head attention with the QK^T scaling + softmax in fp32.
201
+
202
+ Same ``[B, C, 1, S]`` channel-major layout and mask convention as
203
+ ``split_einsum`` (so it slots into ``_attention_forward`` unchanged), but
204
+ computes the whole score matrix per head in one batched einsum instead of the
205
+ per-head split. Upcasting the scores to fp32 keeps the softmax stable when the
206
+ converted model runs in fp16 (QK^T at 4096 tokens overflows fp16 otherwise).
207
+ """
208
+ batch = q.size(0)
209
+ mh_q = q.view(batch, heads, dim_head, -1).float()
210
+ mh_k = k.view(batch, heads, dim_head, -1).float()
211
+ mh_v = v.view(batch, heads, dim_head, -1)
212
+
213
+ weights = torch.einsum("becq,beck->bkeq", mh_q, mh_k) * (dim_head**-0.5)
214
+ if mask is not None:
215
+ weights = weights + mask
216
+ weights = weights.softmax(dim=1).to(mh_v.dtype)
217
+
218
+ outputs = torch.einsum("bkeq,beck->becq", weights, mh_v)
219
+ return outputs.reshape(batch, heads * dim_head, 1, -1)
220
+
221
+
161
222
  def split_einsum(q, k, v, mask, heads, dim_head):
162
223
  q_heads = _split_heads(q, heads, dim_head)
163
224
  k = k.transpose(1, 3)
@@ -36,7 +36,11 @@ def get_unet(model_version: ModelVersion, ref_unet, attention_implementation):
36
36
  ref_unet.eval(),
37
37
  attention_implementation,
38
38
  )
39
- return CoreMLUNetWrapper(unet, model_version)
39
+ # The freshly built wrapper defaults to training mode; the inner UNet is
40
+ # already eval, but coremltools inspects the top-level traced module and warns
41
+ # ("Model is not in eval mode"). eval() on the wrapper silences it and makes
42
+ # the eval-mode trace explicit (output is unchanged — UNet dropout p=0).
43
+ return CoreMLUNetWrapper(unet, model_version).eval()
40
44
 
41
45
 
42
46
  def get_encoder_hidden_states_shape(ref_unet, batch_size):
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "coreml-diffusion"
3
3
  description = "Convert diffusion-model checkpoints (SD1.5/SDXL) to Core ML for Apple Neural Engine — framework-free, ComfyUI-independent."
4
- version = "0.1.1"
4
+ version = "0.1.2"
5
5
  license = "MIT"
6
6
  license-files = ["LICENSE"]
7
7
  requires-python = ">=3.12,<3.13"
@@ -0,0 +1,22 @@
1
+ {
2
+ "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json",
3
+ "packages": {
4
+ ".": {
5
+ "release-type": "python",
6
+ "bump-minor-pre-major": true,
7
+ "bump-patch-for-minor-pre-major": true,
8
+ "changelog-sections": [
9
+ { "type": "feat", "section": "✨ Features", "hidden": false },
10
+ { "type": "fix", "section": "🐛 Bug Fixes", "hidden": false },
11
+ { "type": "perf", "section": "⚡ Performance", "hidden": false },
12
+ { "type": "docs", "section": "📚 Documentation", "hidden": false },
13
+ { "type": "chore", "hidden": true },
14
+ { "type": "build", "hidden": true },
15
+ { "type": "ci", "hidden": true },
16
+ { "type": "refactor", "hidden": true },
17
+ { "type": "test", "hidden": true },
18
+ { "type": "style", "hidden": true }
19
+ ]
20
+ }
21
+ }
22
+ }
@@ -0,0 +1,106 @@
1
+ """[M2-GPU] ORIGINAL attention correctness gate on the Core ML GPU path.
2
+
3
+ ORIGINAL is the GPU-targeted attention implementation (SPLIT_EINSUM is the
4
+ ANE-friendly default and is what the image golden in test_inference_golden.py
5
+ exercises on the ANE). ORIGINAL is retained for broader coremltools coverage and
6
+ benchmarks where Core ML on the GPU is a valid target.
7
+
8
+ This is the real-hardware half of the fp16-overflow fix: the CPU smoke test
9
+ (tests/smoke/test_original_attention.py) proves the torch graph upcasts QK^T /
10
+ softmax to fp32, but only a converted model run through coremltools' FLOAT16
11
+ precision proves that upcast *survives* to the GPU. If it were downcast away,
12
+ SD1.5 self-attention at 64x64 (4096 query tokens) would overflow fp16 -> NaN.
13
+
14
+ Convert a real SD1.5 UNet with ORIGINAL, run a single forward on the GPU, and
15
+ gate on: output is finite (the overflow guard) AND cosine similarity to a torch
16
+ fp32 reference UNet stays high (the model still computes the right thing).
17
+
18
+ Requires Apple Silicon and a single-file SD1.5 checkpoint:
19
+ COREML_DIFFUSION_TEST_CKPT absolute path to the checkpoint
20
+ ORIGINAL_GPU_COSINE_MIN optional cosine floor (default 0.99)
21
+ Skips otherwise, so Tier 0 on Linux is unaffected.
22
+ """
23
+
24
+ import os
25
+ import platform
26
+
27
+ import numpy as np
28
+ import pytest
29
+
30
+ CKPT = os.environ.get("COREML_DIFFUSION_TEST_CKPT")
31
+ COSINE_MIN = float(os.environ.get("ORIGINAL_GPU_COSINE_MIN", "0.99"))
32
+
33
+ SAMPLE_SHAPE = (1, 4, 64, 64) # 512x512 latent — the fp16-overflow case
34
+ TIMESTEP = 999.0
35
+ TOKENS = 77
36
+ SEED = 0
37
+
38
+
39
+ def _cosine(a: np.ndarray, b: np.ndarray) -> float:
40
+ a = a.astype(np.float64).ravel()
41
+ b = b.astype(np.float64).ravel()
42
+ return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
43
+
44
+
45
+ @pytest.fixture
46
+ def prerequisites():
47
+ if platform.machine() != "arm64":
48
+ pytest.skip("requires Apple Silicon")
49
+ if not CKPT:
50
+ pytest.skip(
51
+ "set COREML_DIFFUSION_TEST_CKPT to a single-file SD1.5 checkpoint "
52
+ "to run the Tier 2 ORIGINAL/GPU correctness gate"
53
+ )
54
+
55
+
56
+ def test_original_attention_survives_coreml_to_gpu(prerequisites, tmp_path):
57
+ import coremltools as ct
58
+ import torch
59
+
60
+ from coreml_diffusion import ModelVersion
61
+ from coreml_diffusion.convert import convert, load_unet
62
+
63
+ out_path = tmp_path / "sd15_original_b1.mlpackage"
64
+ convert(
65
+ CKPT,
66
+ ModelVersion.SD15,
67
+ str(out_path),
68
+ batch_size=1,
69
+ sample_size=(64, 64),
70
+ attn_impl="ORIGINAL",
71
+ )
72
+
73
+ # Fixed input shared by both the converted model and the torch reference.
74
+ torch.manual_seed(SEED)
75
+ sample = torch.randn(*SAMPLE_SHAPE)
76
+ timestep = torch.tensor([TIMESTEP])
77
+ cross_dim = load_unet(CKPT, None).config.cross_attention_dim
78
+ encoder_hidden_states = torch.randn(1, TOKENS, cross_dim)
79
+
80
+ # Ground truth: the canonical UNet math in fp32 on torch (stock diffusers).
81
+ ref_unet = load_unet(CKPT, None).eval()
82
+ with torch.no_grad():
83
+ reference = ref_unet(
84
+ sample,
85
+ timestep,
86
+ encoder_hidden_states=encoder_hidden_states,
87
+ return_dict=False,
88
+ )[0].numpy()
89
+
90
+ # Converted ORIGINAL UNet on the Core ML GPU path (fp16 weights/activations,
91
+ # attention upcast to fp32 inside the graph).
92
+ model = ct.models.MLModel(str(out_path), compute_units=ct.ComputeUnit.CPU_AND_GPU)
93
+ predicted = model.predict(
94
+ {
95
+ "sample": sample.numpy().astype(np.float16),
96
+ "timestep": timestep.numpy().astype(np.float16),
97
+ "encoder_hidden_states": encoder_hidden_states.numpy().astype(np.float16),
98
+ }
99
+ )["noise_pred"]
100
+
101
+ assert np.isfinite(predicted).all(), "fp16 attention overflowed to NaN/inf"
102
+ cosine = _cosine(predicted, reference)
103
+ assert cosine >= COSINE_MIN, (
104
+ f"cosine {cosine:.5f} < {COSINE_MIN}: ORIGINAL/GPU output diverged from "
105
+ f"the torch fp32 reference"
106
+ )
@@ -0,0 +1,95 @@
1
+ """Tier 1 smoke: the ORIGINAL attention path is convertible and fp16-safe.
2
+
3
+ Two regressions are guarded here:
4
+
5
+ 1. ORIGINAL must not be a no-op. It used to ``return unet`` unchanged, leaving
6
+ diffusers' stock ``AttnProcessor2_0`` (SDPA), which does not even convert under
7
+ coremltools 9 (its ``view(B, -1, heads, d)`` reshapes fail the torch frontend).
8
+ ``apply_attention_implementation(unet, "ORIGINAL")`` must install our
9
+ conversion-safe ``OriginalAttnProcessor`` instead.
10
+
11
+ 2. The ORIGINAL kernel upcasts QK^T / softmax to fp32. Run in fp16, full
12
+ self-attention at SD1.5's highest-resolution block overflows fp16 in the score
13
+ matmul -> inf -> NaN; the fp32 score path keeps it finite.
14
+
15
+ Needs only torch + diffusers (no coremltools), but lives in Tier 1 because
16
+ tests/unit/ must stay free of heavy imports.
17
+ """
18
+
19
+ import platform
20
+
21
+ import pytest
22
+ import torch
23
+ from diffusers import UNet2DConditionModel
24
+ from diffusers.models.attention_processor import Attention, AttnProcessor2_0
25
+
26
+ from coreml_diffusion.conversion.attention import (
27
+ OriginalAttnProcessor,
28
+ apply_attention_implementation,
29
+ original,
30
+ )
31
+
32
+ pytestmark = pytest.mark.skipif(
33
+ platform.system() != "Darwin" or platform.machine() != "arm64",
34
+ reason="Tier 1 requires macOS on Apple Silicon",
35
+ )
36
+
37
+
38
+ def _tiny_unet():
39
+ """Minimal UNet2DConditionModel with cross-attention blocks (so it owns a few
40
+ ``Attention`` modules) — small enough to construct in milliseconds."""
41
+ return UNet2DConditionModel(
42
+ sample_size=8,
43
+ in_channels=4,
44
+ out_channels=4,
45
+ layers_per_block=1,
46
+ block_out_channels=(32, 64),
47
+ down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
48
+ up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
49
+ cross_attention_dim=32,
50
+ attention_head_dim=8,
51
+ norm_num_groups=8,
52
+ )
53
+
54
+
55
+ def test_original_installs_convertible_processor():
56
+ """The bug was that ORIGINAL returned the UNet unchanged, leaving the stock
57
+ SDPA processor (which does not convert under ct9). Lock in: the default is
58
+ SDPA, and ORIGINAL swaps every Attention to OriginalAttnProcessor."""
59
+ torch.manual_seed(0)
60
+ unet = _tiny_unet()
61
+ attn_modules = [m for m in unet.modules() if isinstance(m, Attention)]
62
+ assert attn_modules, "expected the tiny UNet to own Attention modules"
63
+
64
+ # Pre-condition: diffusers' default is SDPA.
65
+ assert all(isinstance(m.processor, AttnProcessor2_0) for m in attn_modules)
66
+
67
+ apply_attention_implementation(unet, "ORIGINAL")
68
+
69
+ # Post-condition: our conversion-safe full-attention processor everywhere.
70
+ assert all(isinstance(m.processor, OriginalAttnProcessor) for m in attn_modules)
71
+
72
+
73
+ def test_original_upcast_keeps_fp16_attention_finite():
74
+ """Reproduce the failure mode: a large QK^T overflows fp16 to NaN, but the
75
+ ORIGINAL kernel's fp32 score path keeps the output finite."""
76
+ torch.manual_seed(0)
77
+ heads, dim_head, seq = 8, 40, 64
78
+ channels = heads * dim_head
79
+
80
+ # [B, C, 1, S] layout (what _attention_forward hands the kernel). Large q/k so
81
+ # the score matmul exceeds fp16's 65504 ceiling.
82
+ q = (torch.randn(1, channels, 1, seq) * 60).half()
83
+ k = (torch.randn(1, channels, 1, seq) * 60).half()
84
+ v = torch.randn(1, channels, 1, seq).half()
85
+
86
+ # Naive fp16 scores (no upcast) overflow -> inf -> NaN after softmax.
87
+ mh_q = q.view(1, heads, dim_head, -1)
88
+ mh_k = k.view(1, heads, dim_head, -1)
89
+ naive_scores = torch.einsum("becq,beck->bkeq", mh_q, mh_k) * (dim_head**-0.5)
90
+ assert torch.isnan(naive_scores.softmax(dim=1)).any(), "expected fp16 overflow"
91
+
92
+ # original() upcasts the score matmul + softmax to fp32 -> finite output.
93
+ out = original(q, k, v, None, heads, dim_head)
94
+ assert torch.isfinite(out).all()
95
+ assert out.shape == (1, channels, 1, seq)
@@ -6,6 +6,7 @@ from coreml_diffusion.conversion.attention import (
6
6
  SplitEinsumAttnProcessor,
7
7
  SplitEinsumV2AttnProcessor,
8
8
  apply_attention_implementation,
9
+ original,
9
10
  split_einsum,
10
11
  split_einsum_v2,
11
12
  )
@@ -106,6 +107,24 @@ def test_unet_wrapper_routes_sdxl_added_conditioning():
106
107
  assert unet.call["added_cond_kwargs"]["text_embeds"] is text_embeds
107
108
 
108
109
 
110
+ def test_original_matches_reference_attention_math():
111
+ torch.manual_seed(0)
112
+ batch = 2
113
+ heads = 3
114
+ dim_head = 4
115
+ sequence = 16
116
+ channels = heads * dim_head
117
+ q = torch.randn(batch, channels, 1, sequence)
118
+ k = torch.randn(batch, channels, 1, sequence)
119
+ v = torch.randn(batch, channels, 1, sequence)
120
+
121
+ expected = _original_attention(q, k, v, None, heads, dim_head)
122
+
123
+ # original() upcasts to fp32 internally, so on fp32 inputs it reproduces the
124
+ # reference math; the bkeq score layout differs but the semantics do not.
125
+ assert torch.allclose(original(q, k, v, None, heads, dim_head), expected, atol=1e-6)
126
+
127
+
109
128
  def test_split_einsum_matches_original_attention_math():
110
129
  torch.manual_seed(0)
111
130
  batch = 2
@@ -151,11 +170,13 @@ def test_split_einsum_v2_chunked_path_matches_original_attention_math():
151
170
 
152
171
 
153
172
  def test_apply_attention_implementation_sets_split_processors():
173
+ # ORIGINAL is intentionally absent here: asserting it installs
174
+ # OriginalAttnProcessor on real Attention modules needs diffusers (heavy) and
175
+ # would break the Tier-0 framework-free promise. That path is covered by
176
+ # tests/smoke/test_original_attention.py. (The kernel itself, original(), is
177
+ # framework-free and is exercised by test_original_matches_reference_*.)
154
178
  unet = RecordingProcessorUNet()
155
179
 
156
- assert apply_attention_implementation(unet, "ORIGINAL") is unet
157
- assert unet.processor is None
158
-
159
180
  apply_attention_implementation(unet, "SPLIT_EINSUM")
160
181
  assert isinstance(unet.processor, SplitEinsumAttnProcessor)
161
182