PyPI - FastSIMUS - Versions diffs - 0.0.1__py3-none-any.whl - Mend

FastSIMUS 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

fast_simus/__init__.py +33 -0
fast_simus/_pfield_math.py +261 -0
fast_simus/_pfield_strategies.py +203 -0
fast_simus/_simus_strategies.py +210 -0
fast_simus/backends/__init__.py +1 -0
fast_simus/backends/mlx.py +101 -0
fast_simus/kernels/__init__.py +9 -0
fast_simus/kernels/cuda_simus.py +321 -0
fast_simus/kernels/metal_pfield.py +219 -0
fast_simus/kernels/metal_simus.py +377 -0
fast_simus/kernels/pfield.metal +97 -0
fast_simus/kernels/simus_fused.cu +332 -0
fast_simus/kernels/simus_rx_simd.metal +128 -0
fast_simus/kernels/simus_tx_tiled.metal +175 -0
fast_simus/medium_params.py +22 -0
fast_simus/pfield.py +475 -0
fast_simus/py.typed +0 -0
fast_simus/simus.py +567 -0
fast_simus/spectrum.py +107 -0
fast_simus/transducer_params.py +160 -0
fast_simus/transducer_presets.py +102 -0
fast_simus/tx_delay.py +276 -0
fast_simus/utils/__init__.py +5 -0
fast_simus/utils/_array_api.py +294 -0
fast_simus/utils/geometry.py +88 -0
fastsimus-0.0.1.dist-info/METADATA +594 -0
fastsimus-0.0.1.dist-info/RECORD +28 -0
fastsimus-0.0.1.dist-info/WHEEL +4 -0

fast_simus/_simus_strategies.py ADDED Viewed

@@ -0,0 +1,210 @@
+"""Loop drivers for simus frequency sweep (Layer 3).
+Each driver iterates _simus_freq_step_body() using a different mechanism:
+- _simus_freq_outer_python: Python for-loop (NumPy/CuPy, constant memory)
+- _simus_freq_outer_scan: JAX lax.scan for O(1) compilation cost
+The simus step body differs from pfield's: instead of accumulating |P_k|^2
+per grid point, it computes the full TX->scatter->RX chain and accumulates
+complex RF spectrum per element.
+"""
+from __future__ import annotations
+from math import pi
+import array_api_extra as xpx
+from jaxtyping import Bool, Complex, Float
+from fast_simus.utils._array_api import Array, _ArrayNamespace
+def _simus_freq_step_body(
+    phase: Complex[Array, "n_scat n_elem n_sub"],
+    phase_step: Complex[Array, "n_scat n_elem n_sub"],
+    delay_apod_phase: Complex[Array, " n_elem"],
+    delay_apod_step: Complex[Array, " n_elem"],
+    rc: Float[Array, " n_scat"],
+    pulse_probe_k: complex | Array,
+    probe_k: float | Array,
+    is_out: Bool[Array, " n_scat"],
+    xp: _ArrayNamespace,
+    *,
+    directivity_k: Float[Array, "n_scat n_elem n_sub"] | None = None,
+) -> tuple[
+    Complex[Array, "n_scat n_elem n_sub"],
+    Complex[Array, " n_elem"],
+    Complex[Array, " n_elem"],
+]:
+    """Run one frequency step: TX forward, scatter, RX backprop.
+    Computes the per-element spectrum contribution for the current frequency
+    from the *incoming* states, then advances both geometric-progression
+    states by one multiplication so they are ready for the next frequency.
+    Args:
+        phase: Geometric progression state (n_scat, n_elem, n_sub).
+        phase_step: Per-step multiplier for geometric progression.
+        delay_apod_phase: Current delay+apodization phase per element.
+        delay_apod_step: Per-step delay+apodization multiplier.
+        rc: Reflection coefficients per scatterer.
+        pulse_probe_k: Combined pulse*probe spectrum weight for this frequency.
+        probe_k: Probe-only spectrum weight for RX.
+        is_out: Boolean mask for out-of-field scatterers; their pressure is
+            forced to zero before the RX contraction.
+        xp: Array namespace.
+        directivity_k: Per-source directivity (optional, for full_frequency_directivity).
+            When given, it multiplies ``phase`` before the sub-element mean.
+    Returns:
+        Tuple of (updated_phase, updated_delay_apod, spect_k) where
+        spect_k is the complex RF spectrum contribution for this frequency,
+        shape (n_elements,).
+    """
+    # Collapse the sub-element axis (last axis) by averaging, optionally
+    # weighting each sub-element by its directivity first.
+    if directivity_k is not None:
+        rp_mono = xp.mean(phase * directivity_k, axis=-1)
+    else:
+        rp_mono = xp.mean(phase, axis=-1)
+    # TX: contract over elements -> pressure at each scatterer
+    p_k = pulse_probe_k * (rp_mono @ delay_apod_phase[..., None])[..., 0]
+    # Zero the pressure wherever is_out is set.
+    # NOTE(review): the zero is built without an explicit dtype; on backends
+    # where that yields the default complex dtype, xp.where may promote p_k
+    # (e.g. complex64 -> complex128) -- confirm for single-precision inputs.
+    p_k = xp.where(is_out, xp.asarray(0.0 + 0j), p_k)
+    # RX: contract over scatterers -> spectrum per element
+    # (rc * p_k)^T @ rp_mono = sum_i(rc_i * p_k_i * rp_mono[i, e])
+    weighted = rc * p_k
+    spect_k = weighted @ rp_mono
+    spect_k = probe_k * spect_k
+    # Advance both geometric progressions to the next frequency sample.
+    phase = phase * phase_step
+    delay_apod_phase = delay_apod_phase * delay_apod_step
+    return phase, delay_apod_phase, spect_k
+def _simus_freq_outer_python(
+    phase_init: Complex[Array, "n_scat n_elem n_sub"],
+    phase_step: Complex[Array, "n_scat n_elem n_sub"],
+    delay_apod_init: Complex[Array, " n_elem"],
+    delay_apod_step: Complex[Array, " n_elem"],
+    rc: Float[Array, " n_scat"],
+    is_out: Bool[Array, " n_scat"],
+    wavenumbers: Float[Array, " n_freq"],
+    pulse_spect: Complex[Array, " n_freq"],
+    probe_spect: Float[Array, " n_freq"],
+    seg_length: float,
+    sin_theta: Float[Array, "n_scat n_elem n_sub"],
+    full_frequency_directivity: bool,
+    xp: _ArrayNamespace,
+) -> Complex[Array, "n_freq n_elem"]:
+    """Python for-loop driver: iterates one frequency at a time.
+    Accumulates the complex RF spectrum (n_freq, n_elements).
+    Peak memory is O(n_scat * n_elem * n_sub), independent of n_freq.
+    """
+    spectra = pulse_spect * probe_spect
+    n_freq = int(wavenumbers.shape[0])
+    n_elem = phase_init.shape[1]
+    spect_accum = xp.zeros((n_freq, n_elem), dtype=phase_init.dtype)
+    phase = phase_init
+    delay_apod_phase = delay_apod_init
+    if full_frequency_directivity:
+        for k in range(n_freq):
+            sinc_arg = wavenumbers[k] * seg_length / 2.0 * sin_theta / pi
+            directivity_k = xpx.sinc(sinc_arg, xp=xp)
+            phase, delay_apod_phase, spect_k = _simus_freq_step_body(
+                phase,
+                phase_step,
+                delay_apod_phase,
+                delay_apod_step,
+                rc,
+                spectra[k],
+                probe_spect[k],
+                is_out,
+                xp,
+                directivity_k=directivity_k,
+            )
+            spect_accum = _set_row(spect_accum, k, spect_k)
+    else:
+        for k in range(n_freq):
+            phase, delay_apod_phase, spect_k = _simus_freq_step_body(
+                phase,
+                phase_step,
+                delay_apod_phase,
+                delay_apod_step,
+                rc,
+                spectra[k],
+                probe_spect[k],
+                is_out,
+                xp,
+            )
+            spect_accum = _set_row(spect_accum, k, spect_k)
+    return spect_accum
+def _set_row(
+    arr: Complex[Array, "n_freq n_elem"],
+    k: int,
+    row: Complex[Array, " n_elem"],
+) -> Complex[Array, "n_freq n_elem"]:
+    """Set row k of arr to row, Array API compatible."""
+    return xpx.at(arr)[k, :].set(row)  # type: ignore[attr-defined]
+def _simus_freq_outer_scan(
+    phase_init: Complex[Array, "n_scat n_elem n_sub"],
+    phase_step: Complex[Array, "n_scat n_elem n_sub"],
+    delay_apod_init: Complex[Array, " n_elem"],
+    delay_apod_step: Complex[Array, " n_elem"],
+    rc: Float[Array, " n_scat"],
+    is_out: Bool[Array, " n_scat"],
+    wavenumbers: Float[Array, " n_freq"],
+    pulse_spect: Complex[Array, " n_freq"],
+    probe_spect: Float[Array, " n_freq"],
+    seg_length: float,
+    sin_theta: Float[Array, "n_scat n_elem n_sub"],
+    full_frequency_directivity: bool,
+    xp: _ArrayNamespace,
+) -> Complex[Array, "n_freq n_elem"]:
+    """JAX lax.scan driver: scan over frequencies with full tensor carry.
+    The carry holds (phase, delay_apod_phase) with shapes
+    (n_scat, n_elem, n_sub) and (n_elem,). Each step outputs
+    spect_k with shape (n_elem,), stacked by scan into (n_freq, n_elem).
+    The per-step math is written inline in the two closures below rather
+    than delegated to _simus_freq_step_body(); one closure is defined per
+    value of ``full_frequency_directivity`` so the branch is resolved once,
+    outside the scan.
+    Args:
+        phase_init: Initial geometric-progression state (scan carry).
+        phase_step: Per-step multiplier for the phase state.
+        delay_apod_init: Initial delay+apodization phase (scan carry).
+        delay_apod_step: Per-step delay+apodization multiplier.
+        rc: Reflection coefficients per scatterer.
+        is_out: Boolean mask for out-of-field scatterers.
+        wavenumbers: Wavenumber per frequency; scanned only when
+            ``full_frequency_directivity`` is True.
+        pulse_spect: Pulse spectrum per frequency.
+        probe_spect: Probe spectrum per frequency.
+        seg_length: Segment length used in the sinc directivity argument.
+        sin_theta: Sine term used in the sinc directivity argument.
+        full_frequency_directivity: Selects the closure that recomputes the
+            sinc directivity at every frequency.
+        xp: Array namespace used inside the scanned closures.
+    Returns:
+        The stacked per-step outputs of ``jax.lax.scan``.
+    """
+    # Local import: jax is only needed when this driver is used.
+    import jax
+    # TX-leg weight per frequency: pulse spectrum times probe spectrum.
+    spectra = pulse_spect * probe_spect
+    if full_frequency_directivity:
+        def scan_fn(carry, xs):
+            phase, delay_apod = carry
+            spectrum_k, probe_k, wavenum_k = xs
+            sinc_arg = wavenum_k * seg_length / 2.0 * sin_theta / pi
+            directivity_k = xpx.sinc(sinc_arg, xp=xp)
+            rp_mono = xp.mean(phase * directivity_k, axis=-1)
+            p_k = spectrum_k * (rp_mono @ delay_apod[..., None])[..., 0]
+            p_k = xp.where(is_out, xp.asarray(0.0 + 0j), p_k)
+            # NOTE(review): `*` and `@` share precedence and associate left to
+            # right, so this is (probe_k * (rc * p_k)) @ rp_mono, whereas
+            # _simus_freq_step_body applies probe_k after the matmul. The two
+            # are mathematically equal but may round differently.
+            spect_k = probe_k * (rc * p_k) @ rp_mono
+            phase = phase * phase_step
+            delay_apod = delay_apod * delay_apod_step
+            return (phase, delay_apod), spect_k
+        init_carry = (phase_init, delay_apod_init)
+        _, spect_all = jax.lax.scan(scan_fn, init_carry, (spectra, probe_spect, wavenumbers))
+    else:
+        def scan_fn_no_dir(carry, xs):
+            phase, delay_apod = carry
+            spectrum_k, probe_k = xs
+            rp_mono = xp.mean(phase, axis=-1)
+            p_k = spectrum_k * (rp_mono @ delay_apod[..., None])[..., 0]
+            p_k = xp.where(is_out, xp.asarray(0.0 + 0j), p_k)
+            # Same left-to-right grouping as in scan_fn above.
+            spect_k = probe_k * (rc * p_k) @ rp_mono
+            phase = phase * phase_step
+            delay_apod = delay_apod * delay_apod_step
+            return (phase, delay_apod), spect_k
+        init_carry = (phase_init, delay_apod_init)
+        _, spect_all = jax.lax.scan(scan_fn_no_dir, init_carry, (spectra, probe_spect))
+    return spect_all

fast_simus/backends/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Backend-specific compatibility modules."""

fast_simus/backends/mlx.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""Array API compatibility shim for MLX.
+Temporary until array_api_compat gains native MLX support.
+Tracking: https://github.com/data-apis/array-api-compat/issues/162
+"""
+from __future__ import annotations
+import contextlib
+from typing import Any
+import array_api_compat
+import array_api_compat.common._helpers as _helpers
+# Array API standard name -> name of the attribute on the MLX namespace that
+# is installed under that standard name when the standard name is missing.
+_MLX_ARRAY_API_ALIASES: dict[str, str] = {
+    "asin": "arcsin",
+    "acos": "arccos",
+    "atan2": "arctan2",
+    "bool": "bool_",
+}
+# Array API ``isdtype`` kind string -> name of the dtype-category attribute
+# that is looked up on the namespace and passed to ``issubdtype``.
+_MLX_ISDTYPE_KIND_MAP: dict[str, str] = {
+    "bool": "bool_",
+    "signed integer": "signedinteger",
+    "unsigned integer": "unsignedinteger",
+    "integral": "integer",
+    "real floating": "floating",
+    "complex floating": "complexfloating",
+    "numeric": "number",
+}
+def _make_isdtype(xp: Any) -> Any:
+    def isdtype(dtype: Any, kind: Any) -> bool:
+        if isinstance(kind, str):
+            category = _MLX_ISDTYPE_KIND_MAP.get(kind)
+            if category is None:
+                msg = f"Unrecognized dtype kind: {kind!r}"
+                raise ValueError(msg)
+            return bool(xp.issubdtype(dtype, getattr(xp, category)))
+        if isinstance(kind, tuple):
+            return any(isdtype(dtype, k) for k in kind)
+        return dtype == kind
+    return isdtype
+def _patch_namespace(xp: Any) -> None:
+    """Add Array API aliases to mlx.core (idempotent)."""
+    for standard_name, mlx_name in _MLX_ARRAY_API_ALIASES.items():
+        if not hasattr(xp, standard_name) and hasattr(xp, mlx_name):
+            setattr(xp, standard_name, getattr(xp, mlx_name))
+    if not hasattr(xp, "isdtype") and hasattr(xp, "issubdtype"):
+        xp.isdtype = _make_isdtype(xp)
+    if not hasattr(xp, "astype"):
+        def _astype(x: Any, dtype: Any, /, *, copy: bool = False) -> Any:
+            return x.astype(dtype)
+        xp.astype = _astype
+    if not getattr(xp.asarray, "_fastsimus_wrapped", False):
+        _original = xp.asarray
+        def _asarray(a: Any, *, dtype: Any = None, **_kwargs: Any) -> Any:
+            if dtype is not None:
+                return _original(a, dtype=dtype)
+            return _original(a)
+        _asarray._fastsimus_wrapped = True  # type: ignore[attr-defined]
+        xp.asarray = _asarray
+def _patch_device(xp: Any) -> None:
+    """Patch array_api_compat device() for MLX unified memory."""
+    _original = _helpers.device
+    if getattr(_original, "_fastsimus_mlx", False):
+        return
+    def _device_with_mlx(x: Any, /) -> Any:
+        if type(x).__module__.startswith("mlx"):
+            return xp.default_device()
+        return _original(x)
+    _device_with_mlx._fastsimus_mlx = True  # type: ignore[attr-defined]
+    _helpers.device = _device_with_mlx  # type: ignore[assignment]
+    array_api_compat.device = _device_with_mlx  # type: ignore[assignment]
+    with contextlib.suppress(ImportError):
+        import array_api_extra._lib._utils._compat as _xpx_compat  # type: ignore[import-untyped]
+        _xpx_compat.device = _device_with_mlx
+def ensure_compat(xp: Any) -> None:
+    """Apply all MLX compatibility patches (idempotent)."""
+    _patch_namespace(xp)
+    _patch_device(xp)

fast_simus/kernels/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Backend-specific fused kernels for FastSIMUS.
+Custom kernels provide maximum performance by fusing the entire computation
+into a single GPU dispatch. Each kernel is a different algorithm from the
+Array API path (e.g., on-the-fly geometry instead of precomputed arrays).
+Available kernels:
+- metal_pfield: Apple Silicon Metal kernel for pfield (requires MLX)
+"""

fast_simus/kernels/cuda_simus.py ADDED Viewed

@@ -0,0 +1,321 @@
+"""CuPy CUDA backend for simus.
+Compiles the v25c register-resident TX kernel via NVRTC at runtime
+(``cupy.RawModule``) -- no nanobind, no setuptools build step. Pinned to
+``(B_SCAT=10, ELEM_TILE=2)`` for RTX 4090 / sm_89 / P4-2v; performance may
+regress on other probes / GPUs (see exp22 + the FastSIMUS-cuda-tune
+follow-up).
+Output layout matches ``metal_simus.simus_metal``: complex64
+``(n_freq, n_elements)``. The shipped kernel does its own per-scatterer
+Phase-1 geometry from a flat input set, so the
+``(n_scat, n_elem, n_sub)`` phase tensors that ``_simus_freq_outer_python``
+consumes are *not* fed in here.
+Requires: CuPy on a CUDA host. Use ``cupy-cuda12x`` for CUDA 12/Pascal
+hosts and ``cupy-cuda13x`` for CUDA 13/Turing-or-newer hosts.
+"""
+from __future__ import annotations
+from math import inf, pi
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, cast
+import cupy as cp
+from fast_simus._pfield_math import NEPER_TO_DB, _subelement_centroids
+from fast_simus.medium_params import MediumParams
+from fast_simus.transducer_params import TransducerParams
+from fast_simus.utils._array_api import _ArrayNamespace
+from fast_simus.utils.geometry import element_positions
+if TYPE_CHECKING:
+    from fast_simus.simus import SimusPlan
+# Directory containing this module; kernel source files are read from here.
+_KERNELS_DIR = Path(__file__).parent
+# File name of the CUDA source compiled by _get_kernel().
+_SOURCE_NAME = "simus_fused.cu"
+# Pinned tuning -- see docs/progress/experiments/exp22-svshmem-et2.md.
+# These constants are RTX 4090 / sm_89 / P4-2v optimal; not autotuned.
+# All but _GRID_BLOCKS are passed to the kernel as -D compile-time macros.
+_B_SCAT = 10  # scatterers per block; also the padding multiple for n_scat
+_ELEM_TILE = 2
+_TG_SIZE = 128  # threads per block at launch
+_TILE_SE = 16
+_GRID_BLOCKS = 256  # 2 * 128 SMs on RTX 4090
+# CuPy / NVRTC auto-derives ``--gpu-architecture`` from the current device,
+# so we don't pin it here. Tuning constants (B_SCAT, ELEM_TILE, TG_SIZE)
+# are still hardwired for sm_89 and may need adjustment for sm_80 / sm_90.
+# The default per-block dynamic-shmem cap (48 KB) is below what some probes
+# need (e.g. L11-5v with n_sub=2 hits ~64 KB). We raise the per-kernel
+# cap via cuFuncSetAttribute when required. Modern GPUs (sm_75+) support
+# up to ~96-100 KB dynamic shared memory per block.
+_DEFAULT_SHMEM_CAP_BYTES = 48 * 1024
+_MAX_DYNAMIC_SHMEM_BYTES = 96 * 1024
+# File name -> source text, filled lazily by _load_source().
+_source_cache: dict[str, str] = {}
+# (n_elem, n_sub, n_freq) -> compiled kernel, filled lazily by _get_kernel().
+_kernel_cache: dict[tuple[int, int, int], Any] = {}
+def _load_source(filename: str) -> str:
+    if filename not in _source_cache:
+        _source_cache[filename] = (_KERNELS_DIR / filename).read_text()
+    return _source_cache[filename]
+def _shmem_bytes(n_elem: int, n_sub: int) -> int:
+    """Bytes of dynamic shared memory required by the v25c kernel.
+    Layout (see ``simus_fused.cu``):
+        7 * B_SCAT * N_ES floats of TX/RX geometry + 3 * N_ELEM floats of
+        per-element broadcast (da_init_re, da_init_im, dps).
+    """
+    n_es = n_elem * n_sub
+    return (7 * _B_SCAT * n_es + 3 * n_elem) * 4
+def _get_kernel(n_elem: int, n_sub: int, n_freq: int) -> Any:
+    """Compile + cache simus_fused_kernel for the given problem shape.
+    The cache key is ``(n_elem, n_sub, n_freq)`` -- ``n_scat`` is not in
+    the key because the kernel grid-strides over scatterers (one fused
+    launch covers the whole sweep, unlike the Metal split-kernel path).
+    """
+    key = (n_elem, n_sub, n_freq)
+    if key in _kernel_cache:
+        return _kernel_cache[key]
+    n_es = n_elem * n_sub
+    max_fpt = (n_freq + _TG_SIZE - 1) // _TG_SIZE
+    options = (
+        "--std=c++17",
+        "--use_fast_math",
+        "--extra-device-vectorization",
+        f"-DN_ELEM={n_elem}",
+        f"-DN_SUB={n_sub}",
+        f"-DN_FREQ={n_freq}",
+        f"-DN_ES={n_es}",
+        f"-DTILE_SE={_TILE_SE}",
+        f"-DTG_SIZE={_TG_SIZE}",
+        f"-DMAX_FPT={max_fpt}",
+        f"-DB_SCAT={_B_SCAT}",
+        f"-DELEM_TILE={_ELEM_TILE}",
+    )
+    module = cp.RawModule(
+        code=_load_source(_SOURCE_NAME),
+        backend="nvrtc",
+        options=options,
+        name_expressions=("simus_fused_kernel",),
+    )
+    kernel = module.get_function("simus_fused_kernel")
+    _kernel_cache[key] = kernel
+    return kernel
+def _prepare_inputs(
+    scatterers: Any,
+    rc: Any,
+    delays_clean: Any,
+    tx_apodization: Any,
+    plan: SimusPlan,
+    params: TransducerParams,
+    medium: MediumParams,
+) -> dict[str, Any]:
+    """Pack the 15 input arrays and the scalar parameters the v25c kernel expects.
+    Mirrors ``metal_simus._prepare_common`` but without the
+    ``(n_scat, n_elem, n_sub)`` expansion: v25c does its own Phase-1
+    geometry from the flat per-element / per-sub-element inputs.
+    NOTE(review): the original summary said "12 scalars"; simus_cuda() passes
+    11 scalar arguments (n_scat plus ten floats), and the dict additionally
+    carries n_elem / n_sub / n_freq for the host side -- confirm the count
+    against the kernel signature.
+    Args:
+        scatterers: Scatterer positions; column 0 is read as x, column 1 as z.
+        rc: Reflection coefficient per scatterer.
+        delays_clean: Per-element transmit delays.
+        tx_apodization: Per-element transmit apodization.
+        plan: Frequency plan; ``selected_freqs``, ``n_sub``, ``seg_length``,
+            ``pulse_spectrum`` and ``probe_spectrum`` are read.
+        params: Transducer parameters; ``n_elements``, ``pitch``, ``radius``,
+            ``element_width`` and ``freq_center`` are read.
+        medium: Medium parameters; ``speed_of_sound`` and ``attenuation`` are read.
+    Returns:
+        Dict of float32 contiguous CuPy arrays and Python scalars keyed by
+        kernel-argument name. ``n_scat`` is the *padded* scatterer count.
+    """
+    c = medium.speed_of_sound
+    alpha = medium.attenuation
+    n_elem = params.n_elements
+    n_sub = plan.n_sub
+    n_freq = int(plan.selected_freqs.shape[0])
+    # element_positions with `xp=cp` returns CuPy arrays directly.
+    xp_cp = cast(_ArrayNamespace, cp)
+    elem_pos, theta_e, apex_offset = element_positions(n_elem, params.pitch, params.radius, xp_cp)
+    # No per-element angle was returned: use zero rotation for every element.
+    if theta_e is None:
+        theta_e = cp.zeros(n_elem, dtype=cp.float32)
+    # Sub-element offsets per (elem, sub) flattened to N_ES with se = elem*n_sub + sub
+    # (see kernel line `int elem = se / N_SUB;`).
+    offsets = cast(cp.ndarray, _subelement_centroids(params.element_width, n_sub, theta_e, xp_cp))
+    sub_dx = cp.ascontiguousarray(offsets[..., 0].reshape(-1).astype(cp.float32))
+    sub_dz = cp.ascontiguousarray(offsets[..., 1].reshape(-1).astype(cp.float32))
+    cos_te = cp.ascontiguousarray(cp.cos(theta_e).astype(cp.float32))
+    sin_neg_te = cp.ascontiguousarray(cp.sin(-theta_e).astype(cp.float32))
+    # Frequency-grid scalars
+    # Only the first sample and the first spacing are read here.
+    # NOTE(review): presumably selected_freqs is uniformly spaced and
+    # non-empty -- verify against simus_precompute.
+    freq_start = float(plan.selected_freqs[0])
+    freq_step = float(plan.selected_freqs[1] - plan.selected_freqs[0]) if n_freq > 1 else 0.0
+    # Delay+apodization as separate per-element arrays. The kernel folds
+    # tx_apodization into the initial value and steps phase by 2*pi*freq_step
+    # per outer-frequency iteration.
+    da_init_re = cp.ascontiguousarray((cp.cos(2 * pi * freq_start * delays_clean) * tx_apodization).astype(cp.float32))
+    da_init_im = cp.ascontiguousarray((cp.sin(2 * pi * freq_start * delays_clean) * tx_apodization).astype(cp.float32))
+    dps = cp.ascontiguousarray((2 * pi * freq_step * delays_clean).astype(cp.float32))
+    # Pulse * probe (complex), and probe magnitude separately for the RX leg.
+    pulse_probe = cast(cp.ndarray, plan.pulse_spectrum * plan.probe_spectrum).astype(cp.complex64)
+    pp_re = cp.ascontiguousarray(cp.real(pulse_probe).astype(cp.float32))
+    pp_im = cp.ascontiguousarray(cp.imag(pulse_probe).astype(cp.float32))
+    probe_real = cp.ascontiguousarray(cp.asarray(plan.probe_spectrum).astype(cp.float32))
+    # Convex array radius is float('inf') for linear arrays. Replace with
+    # 1e31 so the kernel's `radius * radius` stays finite in fp32.
+    # NOTE(review): 1e31 squared is ~1e62, which exceeds the fp32 maximum
+    # (~3.4e38), so `radius * radius` would still overflow to inf in fp32 --
+    # confirm how the kernel actually uses this value before relying on the
+    # comment above.
+    radius_v = params.radius if params.radius != inf else 1e31
+    # Pad scatterers to a multiple of B_SCAT. The kernel processes
+    # B_SCAT scatterers per block and, when actual_b < B_SCAT, leaves
+    # shmem GEO_* slots for si in [actual_b, B_SCAT) uninitialized.
+    # Phase 3's cmul(cv=0, garbage) then produces NaN if the garbage is
+    # NaN. Padding with valid positions and rc=0 makes Phase 1 populate
+    # all si slots while contributing zero to the spectrum (rc=0 zeros
+    # tk in Phase 2, and the GEO progression stays finite).
+    n_scat = int(scatterers.shape[0])
+    n_scat_padded = ((n_scat + _B_SCAT - 1) // _B_SCAT) * _B_SCAT
+    if n_scat_padded > n_scat:
+        pad = n_scat_padded - n_scat
+        # Padding rows repeat the first scatterer's position, with rc = 0.
+        scat_x = cp.concatenate(
+            [scatterers[:, 0].astype(cp.float32), cp.repeat(scatterers[:1, 0].astype(cp.float32), pad)],
+        )
+        scat_z = cp.concatenate(
+            [scatterers[:, 1].astype(cp.float32), cp.repeat(scatterers[:1, 1].astype(cp.float32), pad)],
+        )
+        rc_padded = cp.concatenate([rc.astype(cp.float32), cp.zeros(pad, dtype=cp.float32)])
+    else:
+        scat_x = scatterers[:, 0].astype(cp.float32)
+        scat_z = scatterers[:, 1].astype(cp.float32)
+        rc_padded = rc.astype(cp.float32)
+    return {
+        "scat_x": cp.ascontiguousarray(scat_x),
+        "scat_z": cp.ascontiguousarray(scat_z),
+        "rc": cp.ascontiguousarray(rc_padded),
+        "elem_x": cp.ascontiguousarray(elem_pos[:, 0].astype(cp.float32)),
+        "elem_z": cp.ascontiguousarray(elem_pos[:, 1].astype(cp.float32)),
+        "cos_te": cos_te,
+        "sin_neg_te": sin_neg_te,
+        "sub_dx": sub_dx,
+        "sub_dz": sub_dz,
+        "da_init_re": da_init_re,
+        "da_init_im": da_init_im,
+        "dps": dps,
+        "pp_re": pp_re,
+        "pp_im": pp_im,
+        "probe_real": probe_real,
+        "n_scat": n_scat_padded,
+        # Wavenumber and attenuation at the first frequency, plus their
+        # per-frequency-step increments.
+        # NOTE(review): the /1e6 * 1e2 factors look like a dB/cm/MHz -> per-metre
+        # conversion -- confirm the unit of medium.attenuation.
+        "kw_init": 2 * pi * freq_start / c,
+        "alpha_init": alpha / NEPER_TO_DB * freq_start / 1e6 * 1e2,
+        "kw_step": 2 * pi * freq_step / c,
+        "alpha_step": alpha / NEPER_TO_DB * freq_step / 1e6 * 1e2,
+        # Half a wavelength at the centre frequency.
+        "min_dist": c / params.freq_center / 2.0,
+        "seg_len": plan.seg_length,
+        "center_kw": 2 * pi * params.freq_center / c,
+        "inv_nsub": 1.0 / n_sub,
+        "radius_v": radius_v,
+        # NOTE(review): forwarded unchanged and later wrapped in cp.float32 by
+        # simus_cuda(); confirm element_positions never returns None here.
+        "apex_offset": apex_offset,
+        "n_elem": n_elem,
+        "n_sub": n_sub,
+        "n_freq": n_freq,
+    }
+def simus_cuda(
+    scatterers: Any,
+    rc: Any,
+    params: TransducerParams,
+    plan: SimusPlan,
+    medium: MediumParams,
+    delays_clean: Any,
+    tx_apodization: Any,
+) -> Any:
+    """Compute simus RF spectrum using the v25c CUDA kernel via CuPy/NVRTC.
+    Single fused TX+RX kernel that grid-strides over scatterers; no
+    chunking is needed since per-thread state is in registers.
+    Args:
+        scatterers: Scatterer positions (x, z) in meters. Shape ``(n_scat, 2)``.
+        rc: Reflection coefficients. Shape ``(n_scat,)``.
+        params: Transducer parameters.
+        plan: Precomputed frequency plan from ``simus_precompute``.
+        medium: Medium parameters.
+        delays_clean: NaN-cleaned delays. Shape ``(n_elements,)``.
+        tx_apodization: Per-element apodization (NaN-zeroed). Shape ``(n_elements,)``.
+    Returns:
+        Complex RF spectrum, shape ``(n_freq, n_elements)``, dtype ``complex64``.
+    Raises:
+        RuntimeError: If the required dynamic shared memory exceeds
+            ``_MAX_DYNAMIC_SHMEM_BYTES`` for this ``(n_elem, n_sub)``.
+    """
+    d = _prepare_inputs(scatterers, rc, delays_clean, tx_apodization, plan, params, medium)
+    n_elem, n_sub, n_freq = d["n_elem"], d["n_sub"], d["n_freq"]
+    # Check the shared-memory budget before compiling anything.
+    shmem = _shmem_bytes(n_elem, n_sub)
+    if shmem > _MAX_DYNAMIC_SHMEM_BYTES:
+        msg = (
+            f"v25c shmem {shmem} B exceeds the {_MAX_DYNAMIC_SHMEM_BYTES} B "
+            f"per-block cap for (n_elem={n_elem}, n_sub={n_sub}); needs a "
+            f"smaller B_SCAT or a different probe."
+        )
+        raise RuntimeError(msg)
+    kernel = _get_kernel(n_elem, n_sub, n_freq)
+    # Raise per-kernel dynamic-shmem cap when we exceed the 48 KB default.
+    # No-op when shmem fits under _DEFAULT_SHMEM_CAP_BYTES.
+    if shmem > _DEFAULT_SHMEM_CAP_BYTES:
+        kernel.max_dynamic_shared_size_bytes = shmem
+    # Output buffers; kernel uses atomicAdd into spect_re[elem*N_FREQ + f].
+    # They must start at zero because the kernel accumulates into them.
+    spect_re = cp.zeros(n_elem * n_freq, dtype=cp.float32)
+    spect_im = cp.zeros(n_elem * n_freq, dtype=cp.float32)
+    # Positional kernel arguments: 15 input arrays, the two output buffers,
+    # then the scalars.
+    # NOTE(review): the order must match the kernel's parameter list in
+    # simus_fused.cu, which is not visible from this module -- verify there
+    # before reordering anything.
+    args = (
+        d["scat_x"],
+        d["scat_z"],
+        d["rc"],
+        d["elem_x"],
+        d["elem_z"],
+        d["cos_te"],
+        d["sin_neg_te"],
+        d["sub_dx"],
+        d["sub_dz"],
+        d["da_init_re"],
+        d["da_init_im"],
+        d["dps"],
+        d["pp_re"],
+        d["pp_im"],
+        d["probe_real"],
+        spect_re,
+        spect_im,
+        cp.int32(d["n_scat"]),
+        cp.float32(d["kw_init"]),
+        cp.float32(d["alpha_init"]),
+        cp.float32(d["kw_step"]),
+        cp.float32(d["alpha_step"]),
+        cp.float32(d["min_dist"]),
+        cp.float32(d["seg_len"]),
+        cp.float32(d["center_kw"]),
+        cp.float32(d["inv_nsub"]),
+        cp.float32(d["radius_v"]),
+        cp.float32(d["apex_offset"]),
+    )
+    # Fixed launch geometry: _GRID_BLOCKS blocks of _TG_SIZE threads, with
+    # the dynamic shared-memory size computed above.
+    kernel(
+        grid=(_GRID_BLOCKS, 1, 1),
+        block=(_TG_SIZE, 1, 1),
+        args=args,
+        shared_mem=shmem,
+    )
+    # Row-major (n_elem, n_freq) -> column-major (n_freq, n_elem) complex64
+    # to match metal_simus / _simus_freq_outer_python output convention.
+    spect = (spect_re + 1j * spect_im).reshape(n_elem, n_freq).T
+    return spect.astype(cp.complex64)