PyPI - mlx-nufft - Versions diffs - 0.1.1__py3-none-any.whl - Mend

mlx-nufft 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

mlx_nufft/__init__.py +48 -0
mlx_nufft/api.py +388 -0
mlx_nufft/dfmath.py +134 -0
mlx_nufft/gpu_t3.py +1364 -0
mlx_nufft/nd.py +1119 -0
mlx_nufft/ref_t3.py +228 -0
mlx_nufft/sizing.py +111 -0
mlx_nufft/types12.py +290 -0
mlx_nufft/vkfft_backend.py +94 -0
mlx_nufft-0.1.1.dist-info/METADATA +202 -0
mlx_nufft-0.1.1.dist-info/RECORD +15 -0
mlx_nufft-0.1.1.dist-info/WHEEL +5 -0
mlx_nufft-0.1.1.dist-info/licenses/LICENSE +216 -0
mlx_nufft-0.1.1.dist-info/licenses/NOTICE +28 -0
mlx_nufft-0.1.1.dist-info/top_level.txt +1 -0

mlx_nufft/__init__.py ADDED Viewed

@@ -0,0 +1,48 @@
+"""mlx-nufft: non-uniform FFTs on Apple GPUs (Metal, via MLX).
+Drop-in mirror of the `finufft` Python package's interface:
+    import mlx_nufft as finufft
+    fk = finufft.nufft2d1(x, y, c, (N1, N2), eps=1e-6)   # types 1/2/3, dims 1/2/3
+    plan = finufft.Plan(1, (N1, N2), eps=1e-6)
+    plan.setpts(x, y)
+    fk = plan.execute(c)
+plus the native plan classes:
+    from mlx_nufft import Type3Plan
+    plan = Type3Plan((x1, x2, x3), (s1, s2, s3), eps=1e-5, isign=+1)
+    f = plan.execute(c)          # f[k] = sum_j c[j] exp(i*isign*s_k.x_j)
+Precision model (see REPORT.md): fp32 GPU pipeline with the precision-
+critical setup (coordinate rescale, pre/post phases) in fp64 at plan time
+('crit64', the default). Plans cache all geometry-dependent state, so
+fixed-geometry workloads pay setup once and execute() per call.
+"""
+from .gpu_t3 import GpuT3Plan as Type3Plan
+from .types12 import Type1Plan, Type2Plan
+from .nd import Type1PlanND, Type2PlanND
+from .dfmath import expi, EXPI_MAX_PHASE
+from .sizing import kernel_params, next235even
+from .api import (Plan,
+                  nufft1d1, nufft1d2, nufft1d3,
+                  nufft2d1, nufft2d2, nufft2d3,
+                  nufft3d1, nufft3d2, nufft3d3)
+def vkfft_available():
+    """Report whether the optional VkFFT-Metal FFT backend can be used.
+    Returns whatever ``vkfft_backend.available()`` reports; this is the
+    backend selected with fft_backend='vkfft' (see vkfft_bridge/build.sh).
+    The backend module is imported inside the function, not at package
+    import time."""
+    from .vkfft_backend import available as _backend_available
+    return _backend_available()
+# Package version string (the wheel shown here is mlx_nufft-0.1.1).
+__version__ = "0.1.1"
+# Names exported by `from mlx_nufft import *`: the plan classes, the
+# finufft-style one-shot functions, and the sizing / df64 helpers.
+__all__ = ["Type3Plan", "Type1Plan", "Type2Plan",
+           "Type1PlanND", "Type2PlanND", "Plan",
+           "nufft1d1", "nufft1d2", "nufft1d3",
+           "nufft2d1", "nufft2d2", "nufft2d3",
+           "nufft3d1", "nufft3d2", "nufft3d3",
+           "kernel_params", "next235even", "vkfft_available",
+           "expi", "EXPI_MAX_PHASE"]

mlx_nufft/api.py ADDED Viewed

@@ -0,0 +1,388 @@
+"""finufft-compatible API for mlx_nufft.
+Drop-in mirror of the `finufft` Python package's interface, so existing
+callers switch with one import line:
+    import mlx_nufft as finufft
+    fk = finufft.nufft2d1(x, y, c, (N1, N2), eps=1e-6)
+    plan = finufft.Plan(1, (N1, N2), n_trans=8, eps=1e-6)
+    plan.setpts(x, y)
+    fk = plan.execute(c)
+Semantics mirrored from finufft 2.x:
+  - mode boxes are modeord=0: k_d integer in [-(N_d//2), (N_d-1)//2],
+    even or odd N_d;
+  - isign: non-negative means +i in the exponential (type-1/3 default +1,
+    type-2 default -1);
+  - multi-vector inputs: leading n_trans axis on strengths/mode arrays;
+  - out= arrays are filled in place when supplied;
+  - x (etc.) in [-pi, pi), folded otherwise.
+Differences (documented, not silent):
+  - computation is the validated fp32 GPU pipeline with fp64-critical
+    setup ('crit64'); requesting eps below 1e-6 clamps to 1e-6 with a
+    warning (the fp32 accuracy envelope: see ACCEPTANCE.md);
+  - complex128 inputs are accepted and returned as complex128, but the
+    transform itself is fp32-grade;
+  - modeord=1 (FFT ordering) is not implemented (raises);
+  - 1D/2D type 3 currently run as degenerate slices of the validated 3D
+    type-3 kernel (functional; not speed-tuned).
+"""
+import warnings
+import numpy as np
+from .nd import Type1PlanND, Type2PlanND
+from .gpu_t3 import GpuT3Plan
+# Smallest tolerance the API accepts; _norm_eps clamps smaller requests to it.
+_EPS_FLOOR = 1e-6
+# Option names that _check_opts drops silently (no warning, no effect).
+_IGNORED_OPTS = {
+    "nthreads", "debug", "spread_debug", "showwarn", "fftw", "spread_sort",
+    "spread_kerevalmeth", "spread_kerpad", "chkbnds", "maxbatchsize",
+    "spread_thread", "spread_nthr_atomic", "spread_max_sp_size",
+}
+def _check_opts(kwargs):
+    """Screen finufft-style keyword options and pull out the honoured ones.
+    Returns (upsampfac, prec, fft_backend). upsampfac is None when absent or
+    given as finufft's 0 / 0.0 "auto" sentinel; prec defaults to 'crit64' and
+    fft_backend to 'mlx'. Names in _IGNORED_OPTS are dropped silently, any
+    other leftover name triggers a warning, and a modeord other than 0
+    raises NotImplementedError. The caller's dict is not modified."""
+    remaining = dict(kwargs)                  # work on a copy
+    modeord = remaining.pop("modeord", 0)
+    if modeord not in (0,):
+        raise NotImplementedError("modeord=1 (FFT ordering) not implemented")
+    upsampfac = remaining.pop("upsampfac", None)
+    if upsampfac in (0, 0.0):                 # finufft auto sentinel
+        upsampfac = None
+    prec = remaining.pop("prec", "crit64")
+    fft_backend = remaining.pop("fft_backend", "mlx")   # type-3 slab only
+    unknown = sorted(name for name in remaining if name not in _IGNORED_OPTS)
+    if unknown:
+        warnings.warn(f"mlx-nufft: ignoring unknown options {unknown}")
+    return upsampfac, prec, fft_backend
+def _norm_eps(eps):
+    """Coerce eps to float; a value below _EPS_FLOOR is replaced by
+    _EPS_FLOOR after emitting a warning."""
+    tol = float(eps)
+    if not tol < _EPS_FLOOR:
+        return tol
+    warnings.warn(
+        f"mlx-nufft: eps={tol:g} is below the fp32 pipeline floor; "
+        f"clamping to {_EPS_FLOOR:g} (see ACCEPTANCE.md accuracy notes)")
+    return _EPS_FLOOR
+def _norm_isign(isign, default):
+    """Reduce isign to +1 (non-negative) or -1 (negative); None selects
+    `default`, returned unchanged."""
+    if isign is not None:
+        return +1 if isign >= 0 else -1
+    return default
+def _out_dtype(*arrays):
+    """Pick the result dtype: complex128 if any input is double precision
+    (complex128 or float64), otherwise complex64."""
+    wide = (np.complex128, np.float64)
+    if any(np.asarray(arr).dtype in wide for arr in arrays):
+        return np.complex128
+    return np.complex64
+def _vec_shape(data, inner_ndim, inner_shape=None):
+    """Normalise `data` to a stacked (n_trans, ...) array.
+    Returns (n_trans, stacked): an input with inner_ndim dims counts as one
+    vector and gains a leading length-1 axis; one with inner_ndim + 1 dims is
+    passed through, with n_trans taken from its first axis. Any other rank
+    raises ValueError, as does a trailing shape different from inner_shape
+    when that is given (strict size check, no silent truncation)."""
+    arr = np.asarray(data)
+    extra = arr.ndim - inner_ndim
+    if extra == 0:
+        n_trans, stacked = 1, arr[None, ...]
+    elif extra == 1:
+        n_trans, stacked = arr.shape[0], arr
+    else:
+        raise ValueError(f"data must have {inner_ndim} or {inner_ndim + 1} "
+                         f"dims, got shape {arr.shape}")
+    trailing = stacked.shape[1:]
+    if inner_shape is not None and trailing != tuple(inner_shape):
+        raise ValueError(f"data inner shape {trailing} must be "
+                         f"{tuple(inner_shape)}")
+    return n_trans, stacked
+def _fill_out(out, res, dtype):
+    """Cast `res` to `dtype` and deliver it, through `out` when one is given.
+    Without `out` the cast result is returned. With `out`, the data is copied
+    into it and `out` itself is returned; its shape must equal the result's
+    or differ from it only by one leading length-1 axis on either side (the
+    stacked n_trans == 1 form), otherwise ValueError is raised."""
+    cast = res.astype(dtype, copy=False)
+    if out is None:
+        return cast
+    same = out.shape == cast.shape
+    out_stacked = out.shape == (1,) + cast.shape
+    res_stacked = (1,) + out.shape == cast.shape
+    if not (same or out_stacked or res_stacked):
+        raise ValueError(f"out.shape {out.shape} does not match result "
+                         f"shape {cast.shape}")
+    np.copyto(out, cast.reshape(out.shape))
+    return out
+def _embed3(arrs, dim):
+    """Flatten each coordinate array to 1-D float64 and append all-zero
+    arrays (sized like the first one) until there are 3 components, the
+    form handed to GpuT3Plan."""
+    flat = tuple(np.asarray(a, dtype=np.float64).ravel() for a in arrs)
+    pad = np.zeros(flat[0].size)
+    return flat + (pad,) * (3 - dim)
+def _modes_tuple(n_modes, dim, out, out_offset=0):
+    """Resolve the mode counts as a tuple of ints.
+    An explicit n_modes wins: a scalar is repeated `dim` times, a sequence is
+    converted element by element. With n_modes None the counts are read from
+    out.shape[out_offset:], which must have exactly `dim` entries; ValueError
+    if `out` is None as well or that length is wrong."""
+    if n_modes is not None:
+        if np.isscalar(n_modes):
+            return (int(n_modes),) * dim
+        return tuple(int(n) for n in n_modes)
+    if out is None:
+        raise ValueError("either n_modes or out must be supplied")
+    shape = out.shape[out_offset:]
+    if len(shape) != dim:
+        raise ValueError(f"out shape {out.shape} does not match dim {dim}")
+    return tuple(map(int, shape))
+# ---------------------------------------------------------------------------
+# type 1: nonuniform -> uniform
+def _warn_no_vkfft(fft_backend, what):
+    """Warn that a non-'mlx' fft_backend has no effect for transform `what`;
+    silent when fft_backend is 'mlx'."""
+    if fft_backend == "mlx":
+        return
+    warnings.warn(f"mlx-nufft: fft_backend={fft_backend!r} applies only "
+                  f"to 3D type-3 (slab); ignored for {what}")
+def _nufft_t1(dim, coords, c, n_modes, out, eps, isign, kwargs):
+    """Shared driver for the one-shot type-1 (nonuniform -> uniform) calls.
+    dim      : number of dimensions; coords holds that many point arrays.
+    c        : strengths, shape (M,) or (n_trans, M), M = size of coords[0].
+    n_modes  : scalar or sequence of mode counts, or None to read them from
+               `out` (then `out` is required).
+    out      : optional array filled in place and returned.
+    eps      : tolerance, clamped to the floor by _norm_eps.
+    isign    : sign of the exponent, normalised by _norm_isign (default +1).
+    kwargs   : finufft-style options, screened by _check_opts.
+    Returns the mode array, with a leading n_trans axis when `c` was
+    stacked. Raises ValueError on inconsistent shapes."""
+    upsampfac, prec, fft_backend = _check_opts(kwargs)
+    _warn_no_vkfft(fft_backend, "type-1")
+    eps = _norm_eps(eps)
+    isign = _norm_isign(isign, +1)
+    dtype = _out_dtype(c)
+    M = np.asarray(coords[0]).size
+    n_tr, cv = _vec_shape(c, 1, inner_shape=(M,))
+    if n_modes is None and out is not None:
+        # Infer the mode box from `out`. A leading n_trans axis is present
+        # whenever several vectors are transformed, and also in the stacked
+        # (1, N...) form of a single transform -- a form _fill_out accepts,
+        # but which a test on n_tr alone would reject here.
+        out_offset = 1 if (n_tr > 1 or out.ndim == dim + 1) else 0
+        N = _modes_tuple(None, dim, out, out_offset=out_offset)
+    else:
+        N = _modes_tuple(n_modes, dim, out)
+    kw = {} if upsampfac is None else {"upsampfac": upsampfac}
+    plan = Type1PlanND(coords, N, eps=eps, isign=isign, prec=prec, **kw)
+    res = np.stack([plan.execute(cv[t]) for t in range(n_tr)])
+    if n_tr == 1 and (np.asarray(c).ndim == 1):
+        res = res[0]                          # un-stack a single 1-D input
+    return _fill_out(out, res, dtype)
+def nufft1d1(x, c, n_modes=None, out=None, eps=1e-6, isign=1, **kwargs):
+    """1D type-1 (nonuniform points -> uniform modes):
+    f[k] = sum_j c[j] exp(+/-i k x(j)).
+    Forwards to _nufft_t1 with dim 1; extra options travel in kwargs."""
+    coords = (x,)
+    return _nufft_t1(1, coords, c, n_modes, out, eps, isign, kwargs)
+def nufft2d1(x, y, c, n_modes=None, out=None, eps=1e-6, isign=1, **kwargs):
+    """2D type-1 (nonuniform points -> uniform modes):
+    f[k1,k2] = sum_j c[j] exp(+/-i (k1 x(j) + k2 y(j))).
+    Forwards to _nufft_t1 with dim 2; extra options travel in kwargs."""
+    coords = (x, y)
+    return _nufft_t1(2, coords, c, n_modes, out, eps, isign, kwargs)
+def nufft3d1(x, y, z, c, n_modes=None, out=None, eps=1e-6, isign=1, **kwargs):
+    """3D type-1 (nonuniform points -> uniform modes):
+    f[k1,k2,k3] = sum_j c[j] exp(+/-i k . x_j).
+    Forwards to _nufft_t1 with dim 3; extra options travel in kwargs."""
+    coords = (x, y, z)
+    return _nufft_t1(3, coords, c, n_modes, out, eps, isign, kwargs)
+# ---------------------------------------------------------------------------
+# type 2: uniform -> nonuniform
+def _nufft_t2(dim, coords, f, out, eps, isign, kwargs):
+    """Shared driver for the one-shot type-2 (uniform -> nonuniform) calls.
+    `f` holds the mode coefficients with `dim` dims, or dim + 1 with a
+    leading n_trans axis; the mode counts are taken from its trailing shape.
+    Builds one Type2PlanND for `coords` and runs every vector through it.
+    The result keeps the leading axis only when `f` was stacked, and is
+    written into `out` when that is supplied."""
+    upsampfac, prec, fft_backend = _check_opts(kwargs)
+    _warn_no_vkfft(fft_backend, "type-2")
+    tol = _norm_eps(eps)
+    sign = _norm_isign(isign, -1)
+    result_dtype = _out_dtype(f)
+    n_tr, stacked = _vec_shape(f, dim)
+    n_modes = stacked.shape[1:]
+    extra = {"upsampfac": upsampfac} if upsampfac is not None else {}
+    plan = Type2PlanND(coords, n_modes, eps=tol, isign=sign, prec=prec,
+                       **extra)
+    per_vector = [plan.execute(stacked[t]) for t in range(n_tr)]
+    res = np.stack(per_vector)
+    single_unstacked = n_tr == 1 and np.asarray(f).ndim == dim
+    if single_unstacked:
+        res = res[0]
+    return _fill_out(out, res, result_dtype)
+def nufft1d2(x, f, out=None, eps=1e-6, isign=-1, **kwargs):
+    """1D type-2 (uniform modes -> nonuniform points):
+    c[j] = sum_k f[k] exp(+/-i k x(j)).
+    Forwards to _nufft_t2 with dim 1; extra options travel in kwargs."""
+    coords = (x,)
+    return _nufft_t2(1, coords, f, out, eps, isign, kwargs)
+def nufft2d2(x, y, f, out=None, eps=1e-6, isign=-1, **kwargs):
+    """2D type-2 (uniform modes -> nonuniform points):
+    c[j] = sum_{k1,k2} f[k1,k2] exp(+/-i (k1 x(j) + k2 y(j))).
+    Forwards to _nufft_t2 with dim 2; extra options travel in kwargs."""
+    coords = (x, y)
+    return _nufft_t2(2, coords, f, out, eps, isign, kwargs)
+def nufft3d2(x, y, z, f, out=None, eps=1e-6, isign=-1, **kwargs):
+    """3D type-2 (uniform modes -> nonuniform points):
+    c[j] = sum_k f[k] exp(+/-i k . x_j).
+    Forwards to _nufft_t2 with dim 3; extra options travel in kwargs."""
+    coords = (x, y, z)
+    return _nufft_t2(3, coords, f, out, eps, isign, kwargs)
+# ---------------------------------------------------------------------------
+# type 3: nonuniform -> nonuniform
+def _nufft_t3(dim, src, c, trg, out, eps, isign, kwargs):
+    """Shared driver for the one-shot type-3 (nonuniform -> nonuniform) calls.
+    `src` and `trg` are tuples of `dim` coordinate arrays; both are padded to
+    3 components by _embed3 and handed to a single GpuT3Plan. `c` has shape
+    (M,) or (n_trans, M), with M the size of src[0]. An upsampfac other than
+    1.25 only produces a warning. The result keeps a leading axis only when
+    `c` was stacked, and is written into `out` when that is supplied."""
+    upsampfac, prec, fft_backend = _check_opts(kwargs)
+    if upsampfac is not None and upsampfac != 1.25:
+        warnings.warn("mlx-nufft: type-3 runs the validated sigma=1.25 "
+                      "pipeline; upsampfac ignored")
+    tol = _norm_eps(eps)
+    sign = _norm_isign(isign, +1)
+    result_dtype = _out_dtype(c)
+    n_sources = np.asarray(src[0]).size
+    n_tr, stacked = _vec_shape(c, 1, inner_shape=(n_sources,))
+    sources3 = _embed3(src, dim)
+    targets3 = _embed3(trg, dim)
+    plan = GpuT3Plan(sources3, targets3,
+                     eps=tol, isign=sign, prec=prec, fft_backend=fft_backend)
+    per_vector = [plan.execute(stacked[t]) for t in range(n_tr)]
+    res = np.stack(per_vector)
+    single_unstacked = n_tr == 1 and np.asarray(c).ndim == 1
+    if single_unstacked:
+        res = res[0]
+    return _fill_out(out, res, result_dtype)
+def nufft1d3(x, c, s, out=None, eps=1e-6, isign=1, **kwargs):
+    """1D type-3 (nonuniform sources -> nonuniform targets):
+    f[k] = sum_j c[j] exp(+/-i s[k] x[j]).
+    Forwards to _nufft_t3 with dim 1; extra options travel in kwargs."""
+    sources, targets = (x,), (s,)
+    return _nufft_t3(1, sources, c, targets, out, eps, isign, kwargs)
+def nufft2d3(x, y, c, s, t, out=None, eps=1e-6, isign=1, **kwargs):
+    """2D type-3 (nonuniform sources -> nonuniform targets):
+    f[k] = sum_j c[j] exp(+/-i (s[k] x[j] + t[k] y[j])).
+    Forwards to _nufft_t3 with dim 2; extra options travel in kwargs."""
+    sources, targets = (x, y), (s, t)
+    return _nufft_t3(2, sources, c, targets, out, eps, isign, kwargs)
+def nufft3d3(x, y, z, c, s, t, u, out=None, eps=1e-6, isign=1, **kwargs):
+    """3D type-3 (nonuniform sources -> nonuniform targets):
+    f[k] = sum_j c[j] exp(+/-i (s,t,u)_k . (x,y,z)_j).
+    Forwards to _nufft_t3 with dim 3; extra options travel in kwargs."""
+    sources, targets = (x, y, z), (s, t, u)
+    return _nufft_t3(3, sources, c, targets, out, eps, isign, kwargs)
+# ---------------------------------------------------------------------------
+# Plan interface
+class Plan:
+    """finufft.Plan-compatible plan/setpts/execute interface.
+    Plan(nufft_type, n_modes_or_dim, n_trans=1, eps=1e-6, isign=None,
+         dtype='complex128', **kwargs)
+    For types 1/2, n_modes_or_dim is the mode tuple (dim inferred from its
+    length; a bare scalar means one dimension). For type 3 it is the
+    dimension (1, 2 or 3). setpts() builds the GPU plan (points are part of
+    plan state, as in cu/FINUFFT); execute() runs each of n_trans vectors
+    through the cached plan; execute_adjoint() does the same through a
+    sibling plan that is built on first use and discarded by setpts().
+    """
+    def __init__(self, nufft_type, n_modes_or_dim, n_trans=1, eps=1e-6,
+                 isign=None, dtype="complex128", **kwargs):
+        """Record the transform parameters; no GPU plan exists until setpts().
+        Raises ValueError for a nufft_type outside 1/2/3, n_trans below 1, a
+        dtype other than complex64/complex128, or a dimension outside 1..3.
+        eps is normalised by _norm_eps; isign None means -1 for type 2 and
+        +1 otherwise; kwargs are screened by _check_opts."""
+        if nufft_type not in (1, 2, 3):
+            raise ValueError("nufft_type must be 1, 2 or 3")
+        self.type = int(nufft_type)
+        self.n_trans = int(n_trans)
+        if self.n_trans < 1:
+            # Fail here rather than with an opaque np.stack error on an empty
+            # list inside execute().
+            raise ValueError("n_trans must be a positive integer")
+        self.eps = _norm_eps(eps)
+        self.isign = _norm_isign(isign, -1 if self.type == 2 else +1)
+        self.dtype = np.dtype(dtype)
+        if self.dtype not in (np.complex64, np.complex128):
+            raise ValueError("dtype must be complex64 or complex128")
+        self._upsampfac, self._prec, self._fft_backend = _check_opts(kwargs)
+        if self._fft_backend != "mlx" and self.type != 3:
+            _warn_no_vkfft(self._fft_backend, f"type-{self.type}")
+            self._fft_backend = "mlx"
+        if self.type == 3:
+            self.dim = int(n_modes_or_dim)
+            self.n_modes = None
+        else:
+            if np.isscalar(n_modes_or_dim):
+                n_modes_or_dim = (n_modes_or_dim,)
+            self.n_modes = tuple(int(n) for n in n_modes_or_dim)
+            self.dim = len(self.n_modes)
+        if self.dim not in (1, 2, 3):
+            raise ValueError("dim must be 1, 2 or 3")
+        # State filled in by setpts(); declared here so every attribute
+        # exists from construction on.
+        self._plan = None
+        self._adjoint = None
+        self._n_targets = None
+        self._coords = None
+        self._targets = None
+    def setpts(self, x=None, y=None, z=None, s=None, t=None, u=None):
+        """Set the nonuniform points and (re)build the underlying plan.
+        Exactly `dim` of x, y, z must be given. For type 3, exactly `dim` of
+        the targets s, t, u must be given as well; for types 1 and 2 they are
+        not read. Raises ValueError when a count is wrong. Any cached adjoint
+        plan is dropped."""
+        coords = [v for v in (x, y, z) if v is not None]
+        if len(coords) != self.dim:
+            raise ValueError(f"expected {self.dim} coordinate arrays, "
+                             f"got {len(coords)}")
+        kw = {} if self._upsampfac is None else {"upsampfac": self._upsampfac}
+        self._adjoint = None
+        if self.type == 1:
+            self._plan = Type1PlanND(tuple(coords), self.n_modes,
+                                     eps=self.eps, isign=self.isign,
+                                     prec=self._prec, **kw)
+        elif self.type == 2:
+            self._plan = Type2PlanND(tuple(coords), self.n_modes,
+                                     eps=self.eps, isign=self.isign,
+                                     prec=self._prec, **kw)
+        else:
+            if self._upsampfac is not None and self._upsampfac != 1.25:
+                warnings.warn("mlx-nufft: type-3 runs the validated "
+                              "sigma=1.25 pipeline; upsampfac ignored")
+            trg = [v for v in (s, t, u) if v is not None]
+            if len(trg) != self.dim:
+                raise ValueError(f"expected {self.dim} target arrays, "
+                                 f"got {len(trg)}")
+            self._plan = GpuT3Plan(_embed3(coords, self.dim),
+                                   _embed3(trg, self.dim),
+                                   eps=self.eps, isign=self.isign,
+                                   prec=self._prec, fft_backend=self._fft_backend)
+            self._n_targets = np.asarray(trg[0]).size
+        # Kept so execute_adjoint() can build the sibling plan later.
+        self._coords = [np.asarray(v) for v in coords]
+        self._targets = None if self.type != 3 else \
+            [np.asarray(v) for v in (s, t, u) if v is not None]
+    def _run(self, plan, data, inner, ishape, out):
+        """Push the n_trans vectors in `data` through plan.execute.
+        `inner` / `ishape` are the rank and shape of one vector. The result
+        keeps a leading axis unless n_trans == 1 and `data` was unstacked,
+        and is written into `out` when that is supplied."""
+        n_tr, dv = _vec_shape(data, inner, inner_shape=ishape)
+        if n_tr != self.n_trans:
+            raise ValueError(f"data has {n_tr} vectors, plan has "
+                             f"n_trans={self.n_trans}")
+        res = np.stack([plan.execute(dv[k]) for k in range(self.n_trans)])
+        if self.n_trans == 1 and np.asarray(data).ndim == inner:
+            res = res[0]
+        return _fill_out(out, res, self.dtype)
+    def execute(self, data, out=None):
+        """Run the planned transform on `data` (one vector, or n_trans of them
+        stacked on a leading axis) and return the result, filling `out` when
+        given. Raises RuntimeError before setpts() and ValueError on a shape
+        or vector-count mismatch."""
+        if self._plan is None:
+            raise RuntimeError("setpts() must be called before execute()")
+        if self.type == 2:
+            inner, ishape = self.dim, self.n_modes
+        else:
+            inner, ishape = 1, (self._plan.P,)
+        return self._run(self._plan, data, inner, ishape, out)
+    def execute_adjoint(self, data, out=None):
+        """Apply the adjoint of the planned transform (finufft 2.5 API).
+        Type-1 plan adjoint maps modes -> points; type-2 adjoint maps
+        points -> modes; type-3 adjoint maps targets -> sources. Implemented
+        as the sibling transform with isign negated (the exact adjoint of
+        the NUFFT matrix; the validated type-3 adjoint identity). The sibling
+        plan is built on the first call after setpts() and then reused."""
+        if self._plan is None:
+            raise RuntimeError("setpts() must be called before "
+                               "execute_adjoint()")
+        if self._adjoint is None:
+            kw = {} if self._upsampfac is None \
+                else {"upsampfac": self._upsampfac}
+            if self.type == 1:
+                self._adjoint = Type2PlanND(tuple(self._coords), self.n_modes,
+                                            eps=self.eps, isign=-self.isign,
+                                            prec=self._prec, **kw)
+            elif self.type == 2:
+                self._adjoint = Type1PlanND(tuple(self._coords), self.n_modes,
+                                            eps=self.eps, isign=-self.isign,
+                                            prec=self._prec, **kw)
+            else:
+                # Sources and targets swap roles for the type-3 adjoint.
+                self._adjoint = GpuT3Plan(_embed3(self._targets, self.dim),
+                                          _embed3(self._coords, self.dim),
+                                          eps=self.eps, isign=-self.isign,
+                                          prec=self._prec,
+                                          fft_backend=self._fft_backend)
+        if self.type == 1:
+            inner, ishape = self.dim, self.n_modes
+        elif self.type == 2:
+            inner, ishape = 1, (self._plan.P,)
+        else:
+            inner, ishape = 1, (self._n_targets,)
+        return self._run(self._adjoint, data, inner, ishape, out)

mlx_nufft/dfmath.py ADDED Viewed

@@ -0,0 +1,134 @@
+"""General double-single (df64) GPU math primitives.
+These expose the extended-precision machinery the NUFFT plans use internally
+(df64 phase accumulation + reduction mod 2pi, then f32 transcendentals) as
+standalone tools for callers that hit the same f32-precision wall — most
+commonly a large-magnitude fp64 phase that cannot be reduced mod 2pi in f32
+but whose cos/sin you want to evaluate on the GPU.
+The reduction is identical to the prephase inside GpuT3Plan.set_sources: form
+the phase in df64, k = rint(phi/2pi), then phi - k*2pi in df64 (the product
+k*2pi is exact via two_prod), and f32 cos/sin of the O(1) residual. Valid while
+the integer quotient phi/2pi stays f32-exact, i.e. |phi| <~ 2^24 * 2pi ~ 1.05e8
+radians; beyond that the residual grows and accuracy degrades gracefully.
+"""
+import numpy as np
+import mlx.core as mx
+from .gpu_t3 import _DF64_HDR
+# Shorthand for numpy's double-precision pi.
+PI = np.pi
+# valid magnitude ceiling: |phi| where rint(phi/2pi) stays an exact f32 integer
+EXPI_MAX_PHASE = float(2 ** 24 * 2.0 * PI)      # ~1.054e8 radians
+# Compiled expi kernels, keyed by the number of phase components.
+_expi_cache = {}
+def _build_expi_kernel(ncomp):
+    """Build the Metal kernel evaluating e^{i*isign*phi}, where phi is the
+    sum of `ncomp` df64 phase components.
+    Per element the kernel accumulates the components in df64, reduces mod
+    2pi (k = rint(phi/2pi); phi - k*2pi in df64, k*2pi exact via two_prod),
+    then takes f32 cos/sin of the O(1) residual and stores them interleaved.
+    Inputs are ph_hi<c> / ph_lo<c> for each component, then
+    cst = [2pi_hi, 2pi_lo, 1/2pi, isign] and P0 = [element count]."""
+    terms = "".join(
+        f"    ph = df_add(ph, df_make(ph_hi{i}[j], ph_lo{i}[j]));\n"
+        for i in range(ncomp))
+    acc = "    df64 ph = df_make(0.0f, 0.0f);\n" + terms
+    src = f"""
+    uint j = thread_position_in_grid.x;
+    if (j >= (uint)P0[0]) return;
+{acc}    float k = metal::rint(ph.hi * cst[2]);                 // ph.hi / 2pi
+    df64 red = df_add(ph, df_mul(df_make(-k, 0.0f),
+                                 df_make(cst[0], cst[1])));    // ph - k*2pi (df64)
+    float ang = cst[3] * (red.hi + red.lo);
+    out[2*j]   = metal::precise::cos(ang);
+    out[2*j+1] = metal::precise::sin(ang);
+"""
+    innames = [name
+               for i in range(ncomp)
+               for name in (f"ph_hi{i}", f"ph_lo{i}")]
+    innames.extend(["cst", "P0"])
+    return mx.fast.metal_kernel(
+        name=f"expi_df64_n{ncomp}", input_names=innames,
+        output_names=["out"], header=_DF64_HDR, source=src)
+def expi(phases, isign=1, return_np=False):
+    """Compute e^{i*isign*phi} on the GPU for a large-magnitude fp64 phase via
+    a double-single (df64) reduction mod 2pi followed by f32 cos/sin — the
+    general form of the prephase machinery inside GpuT3Plan.set_sources.
+    Use this whenever an fp64 phase is too large to reduce in f32 but you
+    want the cos/sin on the GPU: an f32 phase is already quantised to about
+    6e-5 rad at |phi| ~ 1e3 and has no fractional bits left beyond
+    2^23 ~ 8.4e6.
+    Parameters
+    ----------
+    phases : fp64 array, or a sequence of fp64 arrays
+        Either the phase phi directly, or a small set of phase components of
+        identical shape that are summed *in df64* to form phi = sum_k phases[k]
+        (more accurate than an fp64 host sum, and keeps the sum on-GPU).
+    isign : int
+        +1 (default) or -1; computes e^{i*isign*phi}.
+    return_np : bool
+        If True, copy the result to a numpy complex64 array; otherwise return
+        the device mx.array (complex64).
+    Returns
+    -------
+    e^{i*isign*phi} as complex64, same shape as the input. An input with no
+    elements yields an empty result of that shape without a kernel launch.
+    Raises
+    ------
+    ValueError
+        For an empty sequence, components of differing shape, or a
+        multi-element sequence of scalars.
+    Accuracy: matches an fp64 host reference (np.exp) to ~1e-6 per element for
+    |phi| up to EXPI_MAX_PHASE (~1.05e8 rad) — approaching ~2e-6 right at the
+    ceiling, ~2e-7 for |phi| <~ 1e7. Beyond the ceiling the integer quotient
+    phi/2pi is no longer f32-exact and accuracy degrades gracefully (~linear in
+    |phi|, no cliff).
+    Notes
+    -----
+    - A sequence of arrays is treated as SUMMABLE phase components (summed in
+      df64). A flat python list/tuple of *scalars* is therefore rejected — it
+      would otherwise be silently summed into a single phase; pass
+      ``np.asarray(...)`` for a vector of per-element phases.
+    - ``isign`` is normalized by sign only (>=0 -> +1, else -1).
+    - NaN/inf phases propagate per element to NaN (no cross-element effect).
+    """
+    if isinstance(phases, (list, tuple)):
+        comps = [np.asarray(v, dtype=np.float64) for v in phases]
+    else:
+        comps = [np.asarray(phases, dtype=np.float64)]
+    if len(comps) < 1:
+        raise ValueError("expi: at least one phase component is required")
+    shape = comps[0].shape
+    if any(c.shape != shape for c in comps):
+        raise ValueError("expi: all phase components must share one shape")
+    if len(comps) > 1 and shape == ():
+        raise ValueError(
+            "expi: got a multi-element sequence of scalars, which would be "
+            "summed into a single phase. Pass np.asarray(...) for a vector of "
+            "per-element phases, or 1-D arrays as summable phase components.")
+    isign = +1 if isign >= 0 else -1
+    P = int(np.prod(shape)) if shape else 1
+    if P == 0:
+        # Nothing to evaluate: return an empty result of the right shape and
+        # dtype instead of launching the kernel over a zero-size grid.
+        if return_np:
+            return np.zeros(shape, dtype=np.complex64)
+        return mx.zeros(shape, dtype=mx.complex64)
+    ncomp = len(comps)
+    kern = _expi_cache.get(ncomp)
+    if kern is None:
+        kern = _expi_cache[ncomp] = _build_expi_kernel(ncomp)
+    # Kernel constants: 2pi split into f32 hi/lo parts, 1/2pi, and the sign.
+    cst = np.zeros(4, dtype=np.float32)
+    cst[0] = np.float32(2.0 * PI)
+    cst[1] = np.float32(2.0 * PI - np.float64(cst[0]))
+    cst[2] = np.float32(1.0 / (2.0 * PI))
+    cst[3] = np.float32(isign)
+    ins = []
+    with np.errstate(invalid="ignore"):       # inf-in -> nan-out, no warning
+        for c in comps:
+            # Split each fp64 component into an f32 head and an f32 remainder.
+            cf = c.ravel()
+            hi = cf.astype(np.float32)
+            lo = (cf - hi).astype(np.float32)
+            ins += [mx.array(hi), mx.array(lo)]
+    ins += [mx.array(cst), mx.array(np.array([P], dtype=np.int32))]
+    out = kern(inputs=ins, output_shapes=[(2 * P,)],
+               output_dtypes=[mx.float32],
+               grid=(P, 1, 1), threadgroup=(256, 1, 1))[0]
+    # Reinterpret the interleaved (cos, sin) floats as complex64.
+    res = mx.reshape(mx.view(out, dtype=mx.complex64), shape)
+    mx.eval(res)
+    return np.array(res) if return_np else res