PyPI - n4ax - Versions diffs - 0.1.0__tar.gz - Mend

n4ax 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

n4ax-0.1.0/LICENSE +21 -0
n4ax-0.1.0/PKG-INFO +132 -0
n4ax-0.1.0/README.md +95 -0
n4ax-0.1.0/n4ax/__init__.py +6 -0
n4ax-0.1.0/n4ax/core.py +256 -0
n4ax-0.1.0/n4ax.egg-info/PKG-INFO +132 -0
n4ax-0.1.0/n4ax.egg-info/SOURCES.txt +12 -0
n4ax-0.1.0/n4ax.egg-info/dependency_links.txt +1 -0
n4ax-0.1.0/n4ax.egg-info/requires.txt +20 -0
n4ax-0.1.0/n4ax.egg-info/top_level.txt +1 -0
n4ax-0.1.0/pyproject.toml +78 -0
n4ax-0.1.0/setup.cfg +4 -0
n4ax-0.1.0/tests/test_basic.py +56 -0
n4ax-0.1.0/tests/test_vs_itk.py +49 -0

n4ax-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Gragas
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

n4ax-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,132 @@
+Metadata-Version: 2.4
+Name: n4ax
+Version: 0.1.0
+Summary: JAX/GPU N4 bias field correction — a fast drop-in match for ITK N4
+Author: Geoffroy Oudoumanessah, Jacopo Iollo
+Author-email: Gragas <contact@gragas.ai>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/GragasLab/n4ax
+Project-URL: Repository, https://github.com/GragasLab/n4ax
+Keywords: MRI,bias field,N4,N3,JAX,GPU
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: jax>=0.8.0
+Requires-Dist: numpy>=1.24.0
+Provides-Extra: cpu
+Requires-Dist: jax[cpu]>=0.8.0; extra == "cpu"
+Provides-Extra: cuda12
+Requires-Dist: jax[cuda12]>=0.8.0; extra == "cuda12"
+Provides-Extra: compare
+Requires-Dist: SimpleITK>=2.3.0; extra == "compare"
+Requires-Dist: matplotlib>=3.7; extra == "compare"
+Requires-Dist: nibabel>=5.0; extra == "compare"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: ruff>=0.4.0; extra == "dev"
+Requires-Dist: pre-commit>=3.0; extra == "dev"
+Requires-Dist: SimpleITK>=2.3.0; extra == "dev"
+Dynamic: license-file
+# n4ax
+**N4 bias field correction in pure JAX** — a fast, GPU-friendly, *drop-in* match for
+ITK / SimpleITK's `N4BiasFieldCorrectionImageFilter`.
+n4ax reimplements the N4 algorithm (Tustison et al., 2010 — N3 histogram sharpening
++ multi-resolution B-spline) faithfully enough to **match SimpleITK to ~1%** on real
+MRI, while running **~1500× faster on a GPU** and **~20× faster on the same CPU**.
+![NKI raw vs n4ax-corrected vs ITK-corrected](assets/nki_sub-0002_gpu.png)
+*Raw NKI T1w (with B1 shading) → n4ax-corrected → ITK-corrected (visually identical) → estimated bias field.*
+## Why
+N4 is the de-facto standard bias correction, but ITK's implementation is CPU-only and
+slow (minutes per volume). In a GPU MRI pipeline it becomes the bottleneck. n4ax gives
+**N4-quality output on the GPU in tens of milliseconds**, with no custom CUDA — just JAX.
+## Install
+```bash
+uv sync --extra cuda12        # GPU (CUDA 12)
+uv sync --extra cpu           # CPU
+uv sync --extra cuda12 --extra dev      # + tests/linting
+uv sync --extra cuda12 --extra compare  # + SimpleITK/matplotlib for benchmarks
+```
+## Usage
+```python
+import nibabel as nib
+import n4ax
+vol = nib.load("t1w.nii.gz").get_fdata()      # 3D (or 2D) array, intensities >= 0
+corrected = n4ax.n4(vol)                        # Otsu mask computed automatically
+# or pass your own mask, and/or get the log bias field:
+corrected, log_bias = n4ax.n4(vol, mask=mask, return_bias=True)
+```
+`corrected == vol / exp(log_bias)` inside the mask (voxels outside it are returned unchanged).
+The default config (`iters=(8,12,12,8)`, `over_relax=1.8`) is tuned for speed; for the tightest ITK
+match use the robust fallback `n4ax.n4(vol, iters=(50,50,30,20), over_relax=1.0, conv_threshold=1.5e-3)`.
+## Benchmark
+Real NKI T1w volumes (256×176×256, ~2 M brain voxels), N4 `[50,50,30,20]`, same Otsu mask.
+ITK on an 8-core CPU; n4ax CPU on the same node; n4ax GPU on an NVIDIA A100.
+| Method | Time / volume | Speedup vs ITK |
+|---|--:|--:|
+| ITK N4 (CPU, 8 cores) | **146 s** | 1× |
+| n4ax (CPU, 8 cores) | **7.7 s** | **~19×** |
+| n4ax (A100 GPU) | **93 ms** | **~1571×** |
+**Accuracy vs ITK** (corrected image, global scale removed — pipelines intensity-normalise anyway):
+mean **1.15 %**, per-subject 0.79–1.59 % over 6 NKI scans. On a single fitting level n4ax
+matches ITK to **0.4 %**, and a single N4 iteration to **0.1 %** — the building blocks are exact;
+the residual is N4's own iterative crawl (ITK itself only converges after ~30 iters/level).
+Multiple subjects, raw (top) vs n4ax-corrected (bottom):
+![NKI grid](assets/nki_grid_gpu.png)
+Reproduce: `python scripts/bench_nki.py` (GPU) and `JAX_PLATFORMS=cpu python scripts/bench_nki.py --skip-itk --skip-fig --tag cpu`.
+## How it's fast (no custom kernels)
+- **Separable B-spline fit.** N4's per-iteration B-spline least-squares (Lee MBA) is a
+  94 M-way scatter into a tiny control lattice — brutal atomic contention (~30 ms/iter).
+  Because the cubic weights depend only on the per-axis index and the Lee denominator
+  factorises, this becomes **3 small dense matmuls per axis** (cuBLAS) — *identical math*,
+  0.1 ms/iter.
+- **Privatised histogram.** The N3 sharpening histogram (1.5 M → 200 bins) is privatised
+  over 256 lanes to avoid atomic serialisation.
+- **Over-relaxation.** N4's fixed point is invariant to `B += α·S` (S = 0 there), so
+  `α ≈ 1.8` reaches ITK's result in far fewer iterations.
+- The whole solve is one fused, jitted program with a device-side convergence loop.
+Two things that mattered for *correctness*: zero-padding the sharpening FFT (circular
+wraparound otherwise breaks convergence), and that float32 == float64 here (verified).
+## Tests
+```bash
+uv run pytest          # basic correctness + ground-truth match vs SimpleITK
+```
+`tests/test_vs_itk.py` asserts n4ax matches SimpleITK's N4 (the reference) within tolerance
+on a phantom; `tests/test_basic.py` covers shapes, 2D/3D, the `image/exp(bias)` identity,
+bias flattening, and the Otsu mask.
+## Status
+Alpha. The fast defaults are tuned on NKI/phantom data; validate on your own data before
+production (the `iters=(50,50,30,20), over_relax=1.0` fallback is the conservative choice).

n4ax-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,95 @@
+# n4ax
+**N4 bias field correction in pure JAX** — a fast, GPU-friendly, *drop-in* match for
+ITK / SimpleITK's `N4BiasFieldCorrectionImageFilter`.
+n4ax reimplements the N4 algorithm (Tustison et al., 2010 — N3 histogram sharpening
++ multi-resolution B-spline) faithfully enough to **match SimpleITK to ~1%** on real
+MRI, while running **~1500× faster on a GPU** and **~20× faster on the same CPU**.
+![NKI raw vs n4ax-corrected vs ITK-corrected](assets/nki_sub-0002_gpu.png)
+*Raw NKI T1w (with B1 shading) → n4ax-corrected → ITK-corrected (visually identical) → estimated bias field.*
+## Why
+N4 is the de-facto standard bias correction, but ITK's implementation is CPU-only and
+slow (minutes per volume). In a GPU MRI pipeline it becomes the bottleneck. n4ax gives
+**N4-quality output on the GPU in tens of milliseconds**, with no custom CUDA — just JAX.
+## Install
+```bash
+uv sync --extra cuda12        # GPU (CUDA 12)
+uv sync --extra cpu           # CPU
+uv sync --extra cuda12 --extra dev      # + tests/linting
+uv sync --extra cuda12 --extra compare  # + SimpleITK/matplotlib for benchmarks
+```
+## Usage
+```python
+import nibabel as nib
+import n4ax
+vol = nib.load("t1w.nii.gz").get_fdata()      # 3D (or 2D) array, intensities >= 0
+corrected = n4ax.n4(vol)                        # Otsu mask computed automatically
+# or pass your own mask, and/or get the log bias field:
+corrected, log_bias = n4ax.n4(vol, mask=mask, return_bias=True)
+```
+`corrected == vol / exp(log_bias)` inside the mask (voxels outside it are returned unchanged).
+The default config (`iters=(8,12,12,8)`, `over_relax=1.8`) is tuned for speed; for the tightest ITK
+match use the robust fallback `n4ax.n4(vol, iters=(50,50,30,20), over_relax=1.0, conv_threshold=1.5e-3)`.
+## Benchmark
+Real NKI T1w volumes (256×176×256, ~2 M brain voxels), N4 `[50,50,30,20]`, same Otsu mask.
+ITK on an 8-core CPU; n4ax CPU on the same node; n4ax GPU on an NVIDIA A100.
+| Method | Time / volume | Speedup vs ITK |
+|---|--:|--:|
+| ITK N4 (CPU, 8 cores) | **146 s** | 1× |
+| n4ax (CPU, 8 cores) | **7.7 s** | **~19×** |
+| n4ax (A100 GPU) | **93 ms** | **~1571×** |
+**Accuracy vs ITK** (corrected image, global scale removed — pipelines intensity-normalise anyway):
+mean **1.15 %**, per-subject 0.79–1.59 % over 6 NKI scans. On a single fitting level n4ax
+matches ITK to **0.4 %**, and a single N4 iteration to **0.1 %** — the building blocks are exact;
+the residual is N4's own iterative crawl (ITK itself only converges after ~30 iters/level).
+Multiple subjects, raw (top) vs n4ax-corrected (bottom):
+![NKI grid](assets/nki_grid_gpu.png)
+Reproduce: `python scripts/bench_nki.py` (GPU) and `JAX_PLATFORMS=cpu python scripts/bench_nki.py --skip-itk --skip-fig --tag cpu`.
+## How it's fast (no custom kernels)
+- **Separable B-spline fit.** N4's per-iteration B-spline least-squares (Lee MBA) is a
+  94 M-way scatter into a tiny control lattice — brutal atomic contention (~30 ms/iter).
+  Because the cubic weights depend only on the per-axis index and the Lee denominator
+  factorises, this becomes **3 small dense matmuls per axis** (cuBLAS) — *identical math*,
+  0.1 ms/iter.
+- **Privatised histogram.** The N3 sharpening histogram (1.5 M → 200 bins) is privatised
+  over 256 lanes to avoid atomic serialisation.
+- **Over-relaxation.** N4's fixed point is invariant to `B += α·S` (S = 0 there), so
+  `α ≈ 1.8` reaches ITK's result in far fewer iterations.
+- The whole solve is one fused, jitted program with a device-side convergence loop.
+Two things that mattered for *correctness*: zero-padding the sharpening FFT (circular
+wraparound otherwise breaks convergence), and that float32 == float64 here (verified).
+## Tests
+```bash
+uv run pytest          # basic correctness + ground-truth match vs SimpleITK
+```
+`tests/test_vs_itk.py` asserts n4ax matches SimpleITK's N4 (the reference) within tolerance
+on a phantom; `tests/test_basic.py` covers shapes, 2D/3D, the `image/exp(bias)` identity,
+bias flattening, and the Otsu mask.
+## Status
+Alpha. The fast defaults are tuned on NKI/phantom data; validate on your own data before
+production (the `iters=(50,50,30,20), over_relax=1.0` fallback is the conservative choice).

n4ax-0.1.0/n4ax/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""n4ax — JAX/GPU N4 bias field correction (a fast drop-in match for ITK N4)."""
+from .core import n4, otsu_mask
+__all__ = ["n4", "otsu_mask"]

n4ax-0.1.0/n4ax/core.py ADDED Viewed

@@ -0,0 +1,256 @@
+"""N4 bias field correction in pure JAX — a fast, GPU-friendly drop-in match for
+ITK / SimpleITK's ``N4BiasFieldCorrectionImageFilter``.
+Algorithm (Tustison 2010 N4 = N3 histogram sharpening + multi-resolution B-spline):
+    u = log(v) over the mask;  B = 0  (log bias field)
+    for each fitting level (mesh = 1, 2, 4, 8):
+        repeat:
+            uc = u - B
+            E  = sharpen(uc)               # N3 histogram deconvolution (Wiener, FFT)
+            S  = bspline_fit(uc - E)        # cubic B-spline least-squares (Lee MBA)
+            B  = B + over_relax * S
+    corrected = exp(u - B)                  # == v / exp(B)
+Every building block matches ITK's implementation (parametric coords, cubic
+B-spline weights, Lee-MBA accumulation, N3 Wiener deconvolution). Two ideas make
+it fast on a GPU without any custom kernel:
+* the B-spline fit is **separable** (weights depend only on the per-axis index and
+  the Lee denominator factorises), so the per-voxel scatter into the control
+  lattice becomes three small dense matmuls per axis — no atomic contention;
+* the sharpening histogram is **privatised** over K lanes so the value-scatter
+  doesn't serialise on atomics.
+``over_relax > 1`` accelerates N4's (slow, monotone) crawl to the *same* fixed
+point (S = 0 there, so ``B += a*S`` is fixed-point invariant), reaching ITK's
+result in far fewer iterations.
+"""
+from __future__ import annotations
+import functools
+import jax
+import jax.numpy as jnp
+# Degree of the B-spline basis (cubic). The driver sizes each control lattice
+# axis as ``mesh + SPLINE_ORDER``.
+SPLINE_ORDER = 3
+# Working dtype used for every array created in this module.
+REAL = jnp.float32  # the iteration is float-precision-insensitive (verified vs float64)
+# ----------------------------- Otsu mask ------------------------------------
+def otsu_mask(volume, nbins: int = 200):
+    """Return a 0/1 foreground mask (dtype ``REAL``) from Otsu's threshold.
+    The intensity range of ``volume`` is cut into ``nbins`` equal-width bins; the
+    threshold is the centre of the bin that maximises the between-class variance
+    and voxels strictly above it are foreground (intended to match ITK
+    ``OtsuThreshold`` with insideValue=0/outsideValue=1)."""
+    vol = jnp.asarray(volume, REAL)
+    bin_edges = jnp.linspace(jnp.min(vol), jnp.max(vol), nbins + 1)
+    counts = jnp.histogram(vol, bins=bin_edges)[0].astype(REAL)
+    bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
+    # Cumulative voxel count / intensity mass of the lower class (bins 0..k).
+    w_back = jnp.cumsum(counts)
+    mass_back = jnp.cumsum(counts * bin_centers)
+    # The upper class is whatever remains above bin k.
+    w_fore = w_back[-1] - w_back
+    mass_fore = mass_back[-1] - mass_back
+    has_back = w_back > 0
+    has_fore = w_fore > 0
+    # Guarded divisions: an empty class gets mean 0 instead of dividing by zero.
+    mean_back = jnp.where(has_back, mass_back / jnp.where(has_back, w_back, 1), 0.0)
+    mean_fore = jnp.where(has_fore, mass_fore / jnp.where(has_fore, w_fore, 1), 0.0)
+    between_var = w_back * w_fore * (mean_back - mean_fore) ** 2
+    threshold = bin_centers[jnp.argmax(between_var)]
+    return (vol > threshold).astype(REAL)
+# ----------------------------- N3 sharpening --------------------------------
+@functools.partial(jax.jit, static_argnums=(2,))
+def _sharpen(uc, mask, nbins=200, fwhm=0.15, wiener=0.01):
+    """N3 histogram-deconvolution sharpening of the masked log image ``uc``.
+    Args:
+        uc: current bias-corrected log image.
+        mask: foreground mask; values > 0.5 are treated as inside.
+        nbins: number of histogram bins (static under ``jax.jit``: it drives the
+            Python-level padding loop and the array shapes below).
+        fwhm: full width at half maximum of the Gaussian kernel, in log-intensity units.
+        wiener: regularisation constant added to the Wiener filter denominator.
+    Returns:
+        Array shaped like ``uc`` holding the sharpened value for every voxel inside
+        the mask and 0 outside it.
+    """
+    m = mask > 0.5
+    # Intensity range of the masked voxels only (the +/-inf fill excludes the rest).
+    vmin = jnp.min(jnp.where(m, uc, jnp.inf))
+    vmax = jnp.max(jnp.where(m, uc, -jnp.inf))
+    # Bin width such that bin 0 sits at vmin and bin nbins-1 at vmax.
+    # NOTE(review): if all masked values are equal, slope is 0 and the divisions
+    # by slope below divide by zero — confirm callers never hit this case.
+    slope = (vmax - vmin) / (nbins - 1)
+    # parzen (2-bin linear) histogram, privatised over K lanes to avoid atomic
+    # contention (the value-scatter was ~100% of sharpen's cost otherwise).
+    cidx = (uc - vmin) / slope  # fractional bin coordinate of every voxel
+    idx = jnp.floor(cidx).astype(jnp.int32)
+    off = cidx - idx
+    w = m.astype(REAL).reshape(-1)  # voxels outside the mask contribute weight 0
+    idxf = idx.reshape(-1)
+    offf = off.reshape(-1)
+    K = 256
+    # Each flattened voxel accumulates into row (flat index mod K); rows are summed afterwards.
+    lane = jnp.arange(idxf.shape[0], dtype=jnp.int32) % K
+    Hp = jnp.zeros((K, nbins), REAL)
+    Hp = Hp.at[lane, jnp.clip(idxf, 0, nbins - 1)].add(w * (1.0 - offf))
+    Hp = Hp.at[lane, jnp.clip(idxf + 1, 0, nbins - 1)].add(w * offf)
+    H = jnp.sum(Hp, axis=0)
+    # zero-padded FFT (npad >= 2*nbins) so the Gaussian deconvolution doesn't
+    # suffer circular wraparound (that wraparound otherwise breaks convergence).
+    npad = 1
+    while npad < 2 * nbins:  # smallest power of two >= 2 * nbins
+        npad *= 2
+    Hp_ = jnp.zeros((npad,), REAL).at[:nbins].set(H)
+    k = jnp.arange(npad).astype(REAL)
+    ln2 = jnp.log(2.0)
+    # Gaussian kernel expressed in bin units, centred on index 0 with the negative
+    # half wrapped to the end of the padded array (see ``d``).
+    scaled_fwhm = fwhm / slope
+    exp_factor = 4.0 * ln2 / (scaled_fwhm**2)
+    scale_factor = 2.0 * jnp.sqrt(ln2 / jnp.pi) / scaled_fwhm
+    d = jnp.where(k > npad / 2, k - npad, k)
+    F = scale_factor * jnp.exp(-(d**2) * exp_factor)
+    Ff = jnp.fft.fft(F)
+    Gf = jnp.conj(Ff) / (jnp.abs(Ff) ** 2 + wiener)  # Wiener filter
+    # Deconvolved histogram, with negative values clipped to zero.
+    Uhat = jnp.clip(jnp.real(jnp.fft.ifft(jnp.fft.fft(Hp_) * Gf)), 0.0, None)
+    centers = vmin + k * slope
+    # Per-bin ratio of the blurred (Uhat * centers) to the blurred Uhat; the
+    # denominator is floored at 1e-10 and only the first nbins entries are kept.
+    num = jnp.real(jnp.fft.ifft(jnp.fft.fft(Uhat * centers) * Ff))
+    den = jnp.real(jnp.fft.ifft(jnp.fft.fft(Uhat) * Ff))
+    E = (num / jnp.where(jnp.abs(den) > 1e-10, den, 1e-10))[:nbins]
+    # Map every voxel through E by linear interpolation at its bin coordinate.
+    ci = jnp.clip(cidx, 0.0, nbins - 1.0)
+    lo = jnp.floor(ci).astype(jnp.int32)
+    fr = ci - lo
+    hi = jnp.clip(lo + 1, 0, nbins - 1)
+    return jnp.where(m, E[lo] * (1.0 - fr) + E[hi] * fr, 0.0)
+# ----------------------------- B-spline fit ---------------------------------
+def _bspline_w(frac):
+    """Cubic uniform B-spline basis values at fractional offset ``frac``.
+    Returns the weights of the 4 controls span..span+3, stacked on a new last axis."""
+    t = frac
+    w0 = (1.0 - t) ** 3 / 6.0
+    w1 = (3.0 * t**3 - 6.0 * t**2 + 4.0) / 6.0
+    w2 = (-3.0 * t**3 + 3.0 * t**2 + 3.0 * t + 1.0) / 6.0
+    w3 = t**3 / 6.0
+    return jnp.stack([w0, w1, w2, w3], axis=-1)
+def _axis_mats(n, ncp, mesh):
+    """Build the per-axis (ncp x n) weight matrices for one image axis.
+    Column j holds the four cubic B-spline weights of voxel j, placed on the rows of
+    the controls it touches. Three matrices are returned, with the weights raised to
+    powers 1, 2 and 3; the separable Lee fit uses them to replace the scatter by matmuls."""
+    denom = max(n - 1, 1)  # max(): handle singleton axes (2D)
+    voxel = jnp.arange(n).astype(REAL)
+    # Parametric coordinate of every voxel along the mesh, in [0, mesh].
+    param = jnp.clip(voxel / denom * mesh, 0.0, float(mesh))
+    first_cp = jnp.clip(jnp.floor(param).astype(jnp.int32), 0, mesh - 1)
+    weights = _bspline_w(param - first_cp)
+    # Flattened (control row, voxel column) pairs: 4 consecutive controls per voxel.
+    cp_index = (first_cp[:, None] + jnp.arange(4)[None, :]).reshape(-1)
+    voxel_index = jnp.repeat(jnp.arange(n), 4)
+    empty = jnp.zeros((ncp, n), REAL)
+    return tuple(
+        empty.at[cp_index, voxel_index].add((weights**power).reshape(-1))
+        for power in (1, 2, 3)
+    )
+@functools.partial(jax.jit, static_argnums=(2, 3))
+def _bspline_fit(r, mask, ncp_shape, mesh):
+    """Cubic B-spline Lee-MBA fit of residual ``r`` over the mask, evaluated densely.
+    Separable formulation: identical math to the per-voxel scatter, as matmuls.
+    Args:
+        r: 3D residual array; only voxels inside the mask contribute.
+        mask: foreground mask with the same shape; values > 0.5 are inside.
+        ncp_shape: number of control points per axis (static under ``jax.jit``).
+        mesh: number of mesh elements per axis (static under ``jax.jit``).
+    Returns:
+        The fitted spline evaluated at every voxel, with the shape of ``r``.
+    """
+    s, h, w = r.shape
+    ncpz, ncpy, ncpx = ncp_shape
+    # Per-axis weight matrices at powers 1, 2 and 3 of the cubic weights.
+    Wz1, Wz2, Wz3 = _axis_mats(s, ncpz, mesh)
+    Wy1, Wy2, Wy3 = _axis_mats(h, ncpy, mesh)
+    Wx1, Wx2, Wx3 = _axis_mats(w, ncpx, mesh)
+    mvox = (mask > 0.5).astype(REAL)
+    # Per-voxel sum of squared weights along each axis; their outer product is the
+    # per-voxel Lee denominator.
+    sz, sy, sx = jnp.sum(Wz2, 0), jnp.sum(Wy2, 0), jnp.sum(Wx2, 0)  # Lee denom factorises
+    g = (r * mvox) / (sz[:, None, None] * sy[None, :, None] * sx[None, None, :])
+    # Accumulate onto the control lattice one axis at a time (z, then y, then x):
+    # cubed weights for the numerator, squared weights over the mask for the denominator.
+    num = jnp.einsum("cx,abx->abc", Wx3, jnp.einsum("by,ayx->abx", Wy3, jnp.einsum("az,zyx->ayx", Wz3, g)))
+    den = jnp.einsum("cx,abx->abc", Wx2, jnp.einsum("by,ayx->abx", Wy2, jnp.einsum("az,zyx->ayx", Wz2, mvox)))
+    # Control values; the denominator is floored at 1e-12 so controls that receive
+    # no masked voxel do not divide by zero.
+    phi = num / jnp.where(den > 1e-12, den, 1e-12)
+    # Evaluate the spline back on the voxel grid with the first-power weights.
+    p1 = jnp.einsum("az,abc->zbc", Wz1, phi)
+    p2 = jnp.einsum("by,zbc->zyc", Wy1, p1)
+    return jnp.einsum("cx,zyc->zyx", Wx1, p2)
+def _conv_cv(b_prev, b, maskb, cnt):
+    """ITK-style convergence measure between two successive log bias fields.
+    Computes the coefficient of variation (sample std / mean) of
+    ``exp(b_prev - b)`` over the voxels selected by the boolean ``maskb``;
+    ``cnt`` is the number of selected voxels."""
+    ratio = jnp.exp(jnp.where(maskb, b_prev - b, 0.0))
+    mean = jnp.sum(jnp.where(maskb, ratio, 0.0)) / cnt
+    sq_dev = jnp.where(maskb, (ratio - mean) ** 2, 0.0)
+    # Unbiased (n - 1) variance estimate.
+    std = jnp.sqrt(jnp.sum(sq_dev) / (cnt - 1.0))
+    return std / mean
+# ----------------------------- driver ---------------------------------------
+@functools.cache
+def _compiled(iters, nbins):
+    """Build (and memoise per ``(iters, nbins)``) the jitted N4 solver.
+    ``iters`` must be hashable (a tuple of per-level iteration caps). The levels are
+    iterated in a Python ``for`` loop inside the jitted function; each level runs
+    its own ``jax.lax.while_loop``."""
+    @jax.jit
+    def core(v, mask, fwhm, wiener, threshold, tiny, over_relax):
+        """Run all fitting levels; returns (corrected, log bias, iterations used per level)."""
+        maskb = mask > 0.5
+        cnt = jnp.sum(maskb.astype(REAL))
+        # Log intensities; values below ``tiny`` are clipped up so log() stays finite.
+        u = jnp.log(jnp.clip(v, tiny, None))
+        b = jnp.zeros_like(u)
+        used = []
+        for lvl, nit in enumerate(iters):
+            # The mesh doubles at every level: 1, 2, 4, ...
+            mesh = 2**lvl
+            ncp = (mesh + SPLINE_ORDER,) * 3
+            # Default arguments bind the current level's values into the closures.
+            def cond(c, _nit=nit):
+                _, i, cv = c
+                # Continue while under the iteration cap and not yet converged.
+                return (i < _nit) & (cv > threshold)
+            def body(c, _ncp=ncp, _mesh=mesh):
+                bp, i, _ = c
+                uc = u - bp
+                e = _sharpen(uc, mask, nbins, fwhm, wiener)
+                s = _bspline_fit(jnp.where(maskb, uc - e, 0.0), mask, _ncp, _mesh)
+                bn = bp + over_relax * s
+                return (bn, i + 1, _conv_cv(bp, bn, maskb, cnt))
+            # cv starts at +inf, so the convergence test cannot end a level before
+            # its first iteration; the bias field carries over between levels.
+            b, i, _ = jax.lax.while_loop(cond, body, (b, 0, jnp.array(jnp.inf, REAL)))
+            used.append(i)
+        # Inside the mask return exp(u - b); outside it the input is passed through.
+        corrected = jnp.where(maskb, jnp.exp(u - b), v)
+        return corrected, b, jnp.stack(used)
+    return core
+def n4(
+    image,
+    mask=None,
+    *,
+    iters=(8, 12, 12, 8),
+    over_relax: float = 1.8,
+    conv_threshold: float = 0.0,
+    nbins: int = 200,
+    fwhm: float = 0.15,
+    wiener: float = 0.01,
+    tiny: float = 1e-6,
+    return_bias: bool = False,
+):
+    """N4 bias field correction.
+    Args:
+        image: 2D or 3D array of intensities (>= 0). A 2D image is processed as a
+            single-slice 3D volume and the outputs are returned as 2D.
+        mask: optional foreground mask (values > 0.5 are inside), reshaped to the
+            image shape; if ``None``, an Otsu mask is used.
+        iters: max iterations per fitting level; the mesh doubles at each level
+            (1, 2, 4, 8 for the default four entries).
+        over_relax: relaxation factor (>= 1) accelerating the crawl to the fixed point.
+        conv_threshold: ITK-style CV threshold; a level stops early once the CV of
+            successive bias fields is no longer above it. The default 0.0 relies on
+            the fixed iteration counts with over-relaxation for speed.
+        nbins/fwhm/wiener: N3 sharpening parameters (ITK defaults).
+        tiny: lower clip applied to the intensities before taking the log.
+        return_bias: if true, also return the log bias field.
+    Returns:
+        The corrected image, equal to ``image / exp(bias)`` inside the mask and to the
+        input outside it; ``(corrected, log_bias)`` if ``return_bias``.
+    Raises:
+        ValueError: if ``image`` is neither 2D nor 3D.
+    """
+    image = jnp.asarray(image, REAL)
+    # Fail early with a clear message; otherwise an unsupported rank only surfaces
+    # as a tuple-unpacking error while the B-spline fit is being traced.
+    if image.ndim not in (2, 3):
+        raise ValueError(f"n4 expects a 2D or 3D image, got {image.ndim}D input with shape {image.shape}")
+    in3d = image.ndim == 3
+    v = image if in3d else image[None]
+    mask = otsu_mask(v) if mask is None else jnp.asarray(mask, REAL).reshape(v.shape)
+    # tuple(): the compiled-solver cache needs a hashable key.
+    core = _compiled(tuple(iters), nbins)
+    corrected, bias, _ = core(
+        v,
+        mask,
+        jnp.asarray(fwhm, REAL),
+        jnp.asarray(wiener, REAL),
+        jnp.asarray(conv_threshold, REAL),
+        jnp.asarray(tiny, REAL),
+        jnp.asarray(over_relax, REAL),
+    )
+    if not in3d:
+        # Drop the singleton axis that was added for 2D input.
+        corrected, bias = corrected[0], bias[0]
+    return (corrected, bias) if return_bias else corrected

n4ax-0.1.0/n4ax.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,132 @@
+Metadata-Version: 2.4
+Name: n4ax
+Version: 0.1.0
+Summary: JAX/GPU N4 bias field correction — a fast drop-in match for ITK N4
+Author: Geoffroy Oudoumanessah, Jacopo Iollo
+Author-email: Gragas <contact@gragas.ai>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/GragasLab/n4ax
+Project-URL: Repository, https://github.com/GragasLab/n4ax
+Keywords: MRI,bias field,N4,N3,JAX,GPU
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: jax>=0.8.0
+Requires-Dist: numpy>=1.24.0
+Provides-Extra: cpu
+Requires-Dist: jax[cpu]>=0.8.0; extra == "cpu"
+Provides-Extra: cuda12
+Requires-Dist: jax[cuda12]>=0.8.0; extra == "cuda12"
+Provides-Extra: compare
+Requires-Dist: SimpleITK>=2.3.0; extra == "compare"
+Requires-Dist: matplotlib>=3.7; extra == "compare"
+Requires-Dist: nibabel>=5.0; extra == "compare"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: ruff>=0.4.0; extra == "dev"
+Requires-Dist: pre-commit>=3.0; extra == "dev"
+Requires-Dist: SimpleITK>=2.3.0; extra == "dev"
+Dynamic: license-file
+# n4ax
+**N4 bias field correction in pure JAX** — a fast, GPU-friendly, *drop-in* match for
+ITK / SimpleITK's `N4BiasFieldCorrectionImageFilter`.
+n4ax reimplements the N4 algorithm (Tustison et al., 2010 — N3 histogram sharpening
++ multi-resolution B-spline) faithfully enough to **match SimpleITK to ~1%** on real
+MRI, while running **~1500× faster on a GPU** and **~20× faster on the same CPU**.
+![NKI raw vs n4ax-corrected vs ITK-corrected](assets/nki_sub-0002_gpu.png)
+*Raw NKI T1w (with B1 shading) → n4ax-corrected → ITK-corrected (visually identical) → estimated bias field.*
+## Why
+N4 is the de-facto standard bias correction, but ITK's implementation is CPU-only and
+slow (minutes per volume). In a GPU MRI pipeline it becomes the bottleneck. n4ax gives
+**N4-quality output on the GPU in tens of milliseconds**, with no custom CUDA — just JAX.
+## Install
+```bash
+uv sync --extra cuda12        # GPU (CUDA 12)
+uv sync --extra cpu           # CPU
+uv sync --extra cuda12 --extra dev      # + tests/linting
+uv sync --extra cuda12 --extra compare  # + SimpleITK/matplotlib for benchmarks
+```
+## Usage
+```python
+import nibabel as nib
+import n4ax
+vol = nib.load("t1w.nii.gz").get_fdata()      # 3D (or 2D) array, intensities >= 0
+corrected = n4ax.n4(vol)                        # Otsu mask computed automatically
+# or pass your own mask, and/or get the log bias field:
+corrected, log_bias = n4ax.n4(vol, mask=mask, return_bias=True)
+```
+`corrected == vol / exp(log_bias)` inside the mask (voxels outside it are returned unchanged).
+The default config (`iters=(8,12,12,8)`, `over_relax=1.8`) is tuned for speed; for the tightest ITK
+match use the robust fallback `n4ax.n4(vol, iters=(50,50,30,20), over_relax=1.0, conv_threshold=1.5e-3)`.
+## Benchmark
+Real NKI T1w volumes (256×176×256, ~2 M brain voxels), N4 `[50,50,30,20]`, same Otsu mask.
+ITK on an 8-core CPU; n4ax CPU on the same node; n4ax GPU on an NVIDIA A100.
+| Method | Time / volume | Speedup vs ITK |
+|---|--:|--:|
+| ITK N4 (CPU, 8 cores) | **146 s** | 1× |
+| n4ax (CPU, 8 cores) | **7.7 s** | **~19×** |
+| n4ax (A100 GPU) | **93 ms** | **~1571×** |
+**Accuracy vs ITK** (corrected image, global scale removed — pipelines intensity-normalise anyway):
+mean **1.15 %**, per-subject 0.79–1.59 % over 6 NKI scans. On a single fitting level n4ax
+matches ITK to **0.4 %**, and a single N4 iteration to **0.1 %** — the building blocks are exact;
+the residual is N4's own iterative crawl (ITK itself only converges after ~30 iters/level).
+Multiple subjects, raw (top) vs n4ax-corrected (bottom):
+![NKI grid](assets/nki_grid_gpu.png)
+Reproduce: `python scripts/bench_nki.py` (GPU) and `JAX_PLATFORMS=cpu python scripts/bench_nki.py --skip-itk --skip-fig --tag cpu`.
+## How it's fast (no custom kernels)
+- **Separable B-spline fit.** N4's per-iteration B-spline least-squares (Lee MBA) is a
+  94 M-way scatter into a tiny control lattice — brutal atomic contention (~30 ms/iter).
+  Because the cubic weights depend only on the per-axis index and the Lee denominator
+  factorises, this becomes **3 small dense matmuls per axis** (cuBLAS) — *identical math*,
+  0.1 ms/iter.
+- **Privatised histogram.** The N3 sharpening histogram (1.5 M → 200 bins) is privatised
+  over 256 lanes to avoid atomic serialisation.
+- **Over-relaxation.** N4's fixed point is invariant to `B += α·S` (S = 0 there), so
+  `α ≈ 1.8` reaches ITK's result in far fewer iterations.
+- The whole solve is one fused, jitted program with a device-side convergence loop.
+Two things that mattered for *correctness*: zero-padding the sharpening FFT (circular
+wraparound otherwise breaks convergence), and that float32 == float64 here (verified).
+## Tests
+```bash
+uv run pytest          # basic correctness + ground-truth match vs SimpleITK
+```
+`tests/test_vs_itk.py` asserts n4ax matches SimpleITK's N4 (the reference) within tolerance
+on a phantom; `tests/test_basic.py` covers shapes, 2D/3D, the `image/exp(bias)` identity,
+bias flattening, and the Otsu mask.
+## Status
+Alpha. The fast defaults are tuned on NKI/phantom data; validate on your own data before
+production (the `iters=(50,50,30,20), over_relax=1.0` fallback is the conservative choice).

n4ax-0.1.0/n4ax.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,12 @@
+LICENSE
+README.md
+pyproject.toml
+n4ax/__init__.py
+n4ax/core.py
+n4ax.egg-info/PKG-INFO
+n4ax.egg-info/SOURCES.txt
+n4ax.egg-info/dependency_links.txt
+n4ax.egg-info/requires.txt
+n4ax.egg-info/top_level.txt
+tests/test_basic.py
+tests/test_vs_itk.py

n4ax-0.1.0/n4ax.egg-info/dependency_links.txt ADDED Viewed

@@ -0,0 +1 @@
+

n4ax-0.1.0/n4ax.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,20 @@
+jax>=0.8.0
+numpy>=1.24.0
+[compare]
+SimpleITK>=2.3.0
+matplotlib>=3.7
+nibabel>=5.0
+[cpu]
+jax[cpu]>=0.8.0
+[cuda12]
+jax[cuda12]>=0.8.0
+[dev]
+pytest>=7.0
+pytest-cov
+ruff>=0.4.0
+pre-commit>=3.0
+SimpleITK>=2.3.0

n4ax-0.1.0/n4ax.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+n4ax

n4ax-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,78 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "n4ax"
+version = "0.1.0"
+authors = [
+    {name = "Gragas", email = "contact@gragas.ai"},
+    {name = "Geoffroy Oudoumanessah"},
+    {name = "Jacopo Iollo"},
+]
+description = "JAX/GPU N4 bias field correction — a fast drop-in match for ITK N4"
+readme = "README.md"
+license = "MIT"
+requires-python = ">=3.12"
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Science/Research",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Scientific/Engineering :: Medical Science Apps.",
+]
+keywords = ["MRI", "bias field", "N4", "N3", "JAX", "GPU"]
+dependencies = [
+    "jax>=0.8.0",
+    "numpy>=1.24.0",
+]
+[project.optional-dependencies]
+cpu = ["jax[cpu]>=0.8.0"]
+cuda12 = ["jax[cuda12]>=0.8.0"]
+# `compare` pulls SimpleITK (the reference N4) + plotting/IO for tests & benchmarks.
+compare = ["SimpleITK>=2.3.0", "matplotlib>=3.7", "nibabel>=5.0"]
+dev = [
+    "pytest>=7.0",
+    "pytest-cov",
+    "ruff>=0.4.0",
+    "pre-commit>=3.0",
+    "SimpleITK>=2.3.0",
+]
+[project.urls]
+Homepage = "https://github.com/GragasLab/n4ax"
+Repository = "https://github.com/GragasLab/n4ax"
+[tool.setuptools.packages.find]
+include = ["n4ax*"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+addopts = "-v --tb=short"
+pythonpath = [".", "tests"]
+[tool.ruff]
+target-version = "py312"
+line-length = 119
+[tool.ruff.lint]
+select = ["E", "F", "I", "W", "UP", "FURB", "SIM", "S110", "C4", "RUF013", "PERF102", "PLC1802", "PLC0208", "PIE794"]
+ignore = ["E501", "E741", "SIM1", "SIM905", "UP015", "UP031"]
+extend-safe-fixes = ["UP006"]
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401", "F403", "F811"]
+"*.ipynb" = ["E402", "E731", "B007", "N816"]
+[tool.ruff.lint.isort]
+lines-after-imports = 2
+known-first-party = ["n4ax"]
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"

n4ax-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

n4ax-0.1.0/tests/test_basic.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""Basic correctness: import, shapes, finiteness, 2D/3D, near-identity on no bias."""
+import numpy as np
+import n4ax
+def test_import():
+    """The public entry points are exposed as callables."""
+    for name in ("n4", "otsu_mask"):
+        assert callable(getattr(n4ax, name))
+def test_3d_shape_and_finite(phantom):
+    """Correcting a 3D volume keeps its shape and yields finite, non-negative values."""
+    observed, _, _ = phantom
+    corrected = np.asarray(n4ax.n4(observed))
+    assert corrected.shape == observed.shape
+    assert np.all(np.isfinite(corrected))
+    assert np.all(corrected >= 0)
+def test_2d_runs():
+    """A 2D image goes through n4 and comes back 2D and finite."""
+    img = np.random.default_rng(0).uniform(0.5, 1.5, size=(64, 64)).astype(np.float32)
+    img[:5, :] = 0.0  # background
+    corrected = np.asarray(n4ax.n4(img))
+    assert corrected.shape == img.shape
+    assert np.all(np.isfinite(corrected))
+def test_return_bias(phantom):
+    """With ``return_bias`` the outputs satisfy corrected == image / exp(bias) inside the mask."""
+    observed, _, brain = phantom
+    outputs = n4ax.n4(observed, mask=brain.astype(np.float32), return_bias=True)
+    corrected, log_bias = (np.asarray(out) for out in outputs)
+    expected = observed / np.exp(log_bias)
+    worst = np.max(np.abs(corrected[brain] - expected[brain]))
+    assert worst < 1e-3
+def test_recovers_bias(phantom):
+    """N4 should flatten a known smooth bias: inside the mask, the corrected image
+    has a lower coefficient of variation than the observed (biased) one."""
+    observed, bias, brain = phantom
+    corrected = np.asarray(n4ax.n4(observed, mask=brain.astype(np.float32)))
+    def coeff_of_variation(img):
+        inside = img[brain]
+        return inside.std() / inside.mean()
+    assert coeff_of_variation(corrected) < coeff_of_variation(observed)
+def test_otsu_mask(phantom):
+    """The Otsu foreground agrees with the phantom's true mask on more than 95% of voxels."""
+    observed, _, brain = phantom
+    predicted = np.asarray(n4ax.otsu_mask(observed)) > 0.5
+    agreement = np.mean(predicted == brain)
+    assert agreement > 0.95

n4ax-0.1.0/tests/test_vs_itk.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Ground-truth test: n4ax must match SimpleITK's N4 (the reference implementation).
+Uses the SAME mask for both (so we compare the N4 solve, not the masking), and
+compares the corrected image with its global scale removed (N4's bias field is
+defined up to a constant; downstream pipelines intensity-normalise anyway)."""
+import numpy as np
+import pytest
+import n4ax
+sitk = pytest.importorskip("SimpleITK")
+from _phantom import make_phantom  # noqa: E402
+def _itk_n4(obs, mask, iters=(50, 50, 30, 20)):
+    """Run SimpleITK's N4 filter on ``obs`` restricted to ``mask``.
+    ``iters`` gives the maximum iteration count per fitting level; the corrected
+    image is returned as a float64 array."""
+    corrector = sitk.N4BiasFieldCorrectionImageFilter()
+    corrector.SetMaximumNumberOfIterations(list(map(int, iters)))
+    image = sitk.GetImageFromArray(obs.astype(np.float32))
+    mask_image = sitk.GetImageFromArray(mask.astype(np.uint8))
+    corrected = corrector.Execute(image, mask_image)
+    return sitk.GetArrayFromImage(corrected).astype(np.float64)
+def _scaled_relerr(a, b, m):
+    """Per-voxel relative error of ``a`` against ``b`` inside mask ``m``.
+    ``a`` is first divided by the median of a/b over the mask, so a global scale
+    difference does not count as error; denominators are floored at 1e-6."""
+    a = np.asarray(a, np.float64)
+    b = np.asarray(b, np.float64)
+    a_in, b_in = a[m], b[m]
+    scale = np.median(a_in / np.clip(b_in, 1e-6, None))
+    return np.abs(a_in / scale - b_in) / np.clip(np.abs(b_in), 1e-6, None)
+def test_matches_simpleitk():
+    """n4ax agrees with SimpleITK's N4 on the phantom: mean scaled relative error
+    below 1.5% and 95th percentile below 5%, using the same mask for both."""
+    observed, _, brain = make_phantom()
+    reference = _itk_n4(observed, brain)
+    ours = np.asarray(n4ax.n4(observed, mask=brain.astype(np.float32)))
+    rel = _scaled_relerr(ours, reference, brain)
+    assert rel.mean() < 0.015, f"mean rel-err {rel.mean() * 100:.2f}% too high"
+    assert np.percentile(rel, 95) < 0.05, f"p95 rel-err {np.percentile(rel, 95) * 100:.2f}% too high"
+def test_closer_to_itk_than_uncorrected():
+    """n4ax's correction must be much closer to ITK's than doing nothing: its mean
+    error against ITK is under a quarter of the uncorrected image's error."""
+    observed, _, brain = make_phantom(seed=1)
+    reference = _itk_n4(observed, brain)
+    ours = np.asarray(n4ax.n4(observed, mask=brain.astype(np.float32)))
+    def mean_err(candidate):
+        return _scaled_relerr(candidate, reference, brain).mean()
+    err_ours = mean_err(ours)
+    err_raw = mean_err(observed)
+    assert err_ours < 0.25 * err_raw