PyPI - FastSIMUS - Versions diffs - 0.0.1__py3-none-any.whl - Mend

FastSIMUS 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

fast_simus/__init__.py +33 -0
fast_simus/_pfield_math.py +261 -0
fast_simus/_pfield_strategies.py +203 -0
fast_simus/_simus_strategies.py +210 -0
fast_simus/backends/__init__.py +1 -0
fast_simus/backends/mlx.py +101 -0
fast_simus/kernels/__init__.py +9 -0
fast_simus/kernels/cuda_simus.py +321 -0
fast_simus/kernels/metal_pfield.py +219 -0
fast_simus/kernels/metal_simus.py +377 -0
fast_simus/kernels/pfield.metal +97 -0
fast_simus/kernels/simus_fused.cu +332 -0
fast_simus/kernels/simus_rx_simd.metal +128 -0
fast_simus/kernels/simus_tx_tiled.metal +175 -0
fast_simus/medium_params.py +22 -0
fast_simus/pfield.py +475 -0
fast_simus/py.typed +0 -0
fast_simus/simus.py +567 -0
fast_simus/spectrum.py +107 -0
fast_simus/transducer_params.py +160 -0
fast_simus/transducer_presets.py +102 -0
fast_simus/tx_delay.py +276 -0
fast_simus/utils/__init__.py +5 -0
fast_simus/utils/_array_api.py +294 -0
fast_simus/utils/geometry.py +88 -0
fastsimus-0.0.1.dist-info/METADATA +594 -0
fastsimus-0.0.1.dist-info/RECORD +28 -0
fastsimus-0.0.1.dist-info/WHEEL +4 -0

fast_simus/kernels/metal_pfield.py ADDED Viewed

@@ -0,0 +1,219 @@
+"""Custom Metal kernel for pfield computation on Apple Silicon.
+Fuses geometry, phase initialization, and frequency sweep into a single
+GPU kernel. One thread per grid point computes the full pressure contribution
+on-the-fly, avoiding large intermediate arrays.
+Requires: MLX (mlx package) on Apple Silicon.
+Limitations:
+    - Soft baffle only (BaffleType.SOFT assumed)
+    - Center-frequency directivity only (full_frequency_directivity=False)
+    - Linear arrays only (convex array support needs testing)
+"""
+from __future__ import annotations
+from math import inf, pi
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, cast
+import mlx.core as mx
+from fast_simus._pfield_math import NEPER_TO_DB, _subelement_centroids
+from fast_simus.medium_params import MediumParams
+from fast_simus.transducer_params import TransducerParams
+from fast_simus.utils._array_api import Array, _ArrayNamespace
+from fast_simus.utils.geometry import element_positions
+if TYPE_CHECKING:
+    from fast_simus.pfield import PfieldPlan
+_metal_source_cache: str | None = None
+def _load_kernel_source() -> str:
+    """Read the Metal kernel body from ``pfield.metal`` (cached)."""
+    global _metal_source_cache
+    if _metal_source_cache is None:
+        _metal_source_cache = (Path(__file__).parent / "pfield.metal").read_text()
+    return _metal_source_cache
+_kernel_cache: dict[tuple[int, int, int], Any] = {}
+def build_pfield_kernel(n_elem: int, n_sub: int, n_freq: int) -> Any:
+    """Build (or retrieve cached) Metal kernel for given dimensions.
+    Args:
+        n_elem: Number of transducer elements.
+        n_sub: Number of sub-elements per element.
+        n_freq: Number of frequency samples.
+    Returns:
+        Compiled Metal kernel callable.
+    """
+    key = (n_elem, n_sub, n_freq)
+    if key in _kernel_cache:
+        return _kernel_cache[key]
+    n_es = n_elem * n_sub
+    header = f"#define N_ELEM {n_elem}\n#define N_SUB {n_sub}\n#define N_FREQ {n_freq}\n#define N_ES {n_es}\n"
+    kernel = mx.fast.metal_kernel(
+        name=f"pfield_{n_elem}_{n_sub}_{n_freq}",
+        input_names=[
+            "grid_x",
+            "grid_z",
+            "elem_x",
+            "elem_z",
+            "theta_e",
+            "sub_dx",
+            "sub_dz",
+            "da_init_re",
+            "da_init_im",
+            "da_step_re",
+            "da_step_im",
+            "pp_mag_sq",
+            "is_out",
+            "scalars",
+        ],
+        output_names=["pressure"],
+        header=header,
+        source=_load_kernel_source(),
+    )
+    _kernel_cache[key] = kernel
+    return kernel
+def pfield_metal(
+    positions: mx.array,
+    params: TransducerParams,
+    plan: PfieldPlan,
+    medium: MediumParams,
+    delays_clean: mx.array,
+    tx_apodization: mx.array,
+) -> mx.array:
+    """Compute pressure field using a custom Metal kernel.
+    Computes geometry on-the-fly per grid point, avoiding large intermediate
+    arrays (*grid, n_elements, n_sub). Returns raw pressure accumulation
+    (sum of |P_k|^2 * correction), NOT the final sqrt -- the caller applies
+    sqrt after the dispatch block.
+    Args:
+        positions: Grid positions (x, z) in meters. Shape ``(*grid_shape, 2)``.
+        params: Transducer parameters.
+        plan: Precomputed frequency plan from ``pfield_precompute``.
+        medium: Medium parameters.
+        delays_clean: NaN-cleaned delays. Shape ``(n_elements,)``.
+        tx_apodization: Per-element apodization (NaN-zeroed). Shape ``(n_elements,)``.
+    Returns:
+        Raw pressure accumulation, shape ``(*grid_shape,)``.
+        Caller must apply ``xp.sqrt(result)`` to get RMS pressure.
+    """
+    c = medium.speed_of_sound
+    alpha = medium.attenuation
+    n_elem = params.n_elements
+    n_sub = plan.n_sub
+    n_freq = int(plan.selected_freqs.shape[0])
+    grid_shape = positions.shape[:-1]
+    # Element geometry
+    elem_pos, theta_e, apex_offset = element_positions(
+        n_elem,
+        params.pitch,
+        params.radius,
+        cast(_ArrayNamespace, mx),
+    )
+    if theta_e is None:
+        theta_e = mx.zeros(n_elem, dtype=mx.float32)
+    # Subelement offsets -- reuse shared geometry, reshape to flat (n_elem*n_sub,)
+    xp_mx = cast(_ArrayNamespace, mx)
+    offsets = _subelement_centroids(params.element_width, n_sub, cast("Array", theta_e), xp_mx)
+    sub_dx = cast(mx.array, offsets[..., 0]).reshape(-1)
+    sub_dz = cast(mx.array, offsets[..., 1]).reshape(-1)
+    # is_out mask (float32: 1.0=out, 0.0=in)
+    x_flat = positions[..., 0].reshape(-1)
+    z_flat = positions[..., 1].reshape(-1)
+    is_out = (z_flat < 0).astype(mx.float32)
+    if params.radius != inf:
+        in_arc = (x_flat**2 + (z_flat + apex_offset) ** 2) <= params.radius**2
+        is_out = mx.maximum(is_out, in_arc.astype(mx.float32))
+    # Derive freq_start / freq_step from the canonical selected_freqs array.
+    freq_start = float(plan.selected_freqs[0])
+    freq_step = float(plan.selected_freqs[1] - plan.selected_freqs[0]) if n_freq > 1 else 0.0
+    # Delay+apodization split into real/imag
+    ph_init = mx.array(2.0 * pi * freq_start, dtype=mx.float32) * delays_clean
+    da_init_re = (mx.cos(ph_init) * tx_apodization).astype(mx.float32)
+    da_init_im = (mx.sin(ph_init) * tx_apodization).astype(mx.float32)
+    ph_step = mx.array(2.0 * pi * freq_step, dtype=mx.float32) * delays_clean
+    da_step_re = mx.cos(ph_step).astype(mx.float32)
+    da_step_im = mx.sin(ph_step).astype(mx.float32)
+    # |pulse_spectrum * probe_spectrum|^2
+    _pulse = cast(mx.array, plan.pulse_spectrum)
+    _probe = cast(mx.array, plan.probe_spectrum)
+    pp_mag_sq = mx.abs(_pulse).astype(mx.float32) ** 2 * _probe.astype(mx.float32) ** 2
+    # Scalar physics parameters
+    wavenumber_init = 2.0 * pi * freq_start / c
+    attenuation_init = alpha / NEPER_TO_DB * freq_start / 1e6 * 1e2
+    wavenumber_step = 2.0 * pi * freq_step / c
+    attenuation_step = alpha / NEPER_TO_DB * freq_step / 1e6 * 1e2
+    min_distance = c / params.freq_center / 2.0
+    center_wavenumber = 2.0 * pi * params.freq_center / c
+    # 1/n_sub^2 because kernel sums (not means) over sub-elements.
+    # correction_factor is applied by the caller uniformly across all strategies.
+    effective_correction = 1.0 / (n_sub**2)
+    scalars = mx.array(
+        [
+            wavenumber_init,
+            attenuation_init,
+            wavenumber_step,
+            attenuation_step,
+            min_distance,
+            plan.seg_length,
+            center_wavenumber,
+            effective_correction,
+        ],
+        dtype=mx.float32,
+    )
+    # Build kernel and dispatch
+    n_grid = int(x_flat.shape[0])
+    kernel = build_pfield_kernel(n_elem, n_sub, n_freq)
+    outputs = kernel(
+        inputs=[
+            x_flat.astype(mx.float32),
+            z_flat.astype(mx.float32),
+            elem_pos[:, 0].astype(mx.float32),
+            elem_pos[:, 1].astype(mx.float32),
+            theta_e.astype(mx.float32),
+            sub_dx.astype(mx.float32),
+            sub_dz.astype(mx.float32),
+            da_init_re,
+            da_init_im,
+            da_step_re,
+            da_step_im,
+            pp_mag_sq,
+            is_out.astype(mx.float32),
+            scalars,
+        ],
+        output_shapes=[(n_grid,)],
+        output_dtypes=[mx.float32],
+        grid=(n_grid, 1, 1),
+        threadgroup=(256, 1, 1),
+    )
+    # Return raw accumulation (acc / n_sub^2). The caller applies
+    # sqrt(pressure_accum * correction_factor) uniformly for all strategies.
+    return outputs[0].reshape(grid_shape)

fast_simus/kernels/metal_simus.py ADDED Viewed

@@ -0,0 +1,377 @@
+"""Custom Metal kernel for simus RF spectrum on Apple Silicon.
+Two-kernel architecture for optimal GPU occupancy:
+  - Kernel A (TX): Element-tiled progression with shared-memory geometry.
+    One threadgroup per scatterer; threads cooperatively compute geometry,
+    then each thread processes sub-element tiles with ALU-only geometric
+    progression. TILE_SE=16, threadgroup=64.
+  - Kernel B (RX): SIMD-reduce RX with SCAT_REDUCE scatterers per
+    threadgroup.  Adjacent SIMD threads handle the same element from
+    different scatterers and use simd_shuffle_xor to sum contributions
+    before a single atomic write.  Cuts atomic ops by SCAT_REDUCE (2x)
+    while preserving coalesced output access.
+    Threadgroup size = N_ELEM * SCAT_REDUCE (128 for P4-2v with SR=2).
+For large scatterer counts, scatterers are processed in chunks that fit
+within ``MAX_TX_INTERMEDIATE_BYTES``, with the split-path spectrum
+accumulated across chunks via simple addition.
+Requires: MLX (mlx package) on Apple Silicon.
+Limitations:
+    - Soft baffle only (BaffleType.SOFT assumed)
+    - Center-frequency directivity only (full_frequency_directivity=False)
+    - Linear arrays only (convex array support needs testing)
+"""
+from __future__ import annotations
+from math import inf, pi
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, cast
+import mlx.core as mx
+from fast_simus._pfield_math import NEPER_TO_DB, _subelement_centroids
+from fast_simus.medium_params import MediumParams
+from fast_simus.transducer_params import TransducerParams
+from fast_simus.utils._array_api import Array, _ArrayNamespace
+from fast_simus.utils.geometry import element_positions
+if TYPE_CHECKING:
+    from fast_simus.simus import SimusPlan
+# Directory containing this module; the ``.metal`` kernel sources are read from here.
+_KERNELS_DIR = Path(__file__).parent
+# Upper bound used when sizing scatterer chunks for the TX intermediate output.
+MAX_TX_INTERMEDIATE_BYTES = 256 * 1024 * 1024  # 256 MB
+# Injected into the TX kernel header as TILE_SE.
+_TX_TILE_SE = 16
+# TX threadgroup size; injected as TG_SIZE and used for the TX dispatch.
+_TX_TILE_TG = 64
+# Injected into the RX kernel header as SCAT_REDUCE; the RX threadgroup size
+# is n_elem times this value.
+_RX_SCAT_REDUCE = 2
+# TX tiled kernel: register pressure is only TILE_SE * 2 * 8 bytes per thread
+# (256 bytes for TILE_SE=16), well within Apple Silicon's register budget.
+# Preferred scatterers-per-chunk, keyed by number of transducer elements.
+_TX_OPTIMAL_CHUNK: dict[int, int] = {
+    64: 10_000,  # P4-2v class (64 elem, 256B registers/thread)
+    128: 5_000,  # L11-5v class (128 elem, 256B registers/thread)
+}
+# Chunk size used when the element count has no entry in _TX_OPTIMAL_CHUNK.
+_TX_DEFAULT_CHUNK = 10_000
+# ---------------------------------------------------------------------------
+# Source caching
+# ---------------------------------------------------------------------------
+_source_cache: dict[str, str] = {}
+def _load_source(filename: str) -> str:
+    if filename not in _source_cache:
+        _source_cache[filename] = (_KERNELS_DIR / filename).read_text()
+    return _source_cache[filename]
+# ---------------------------------------------------------------------------
+# Kernel builders (cached by dimension tuple)
+# ---------------------------------------------------------------------------
+_kernel_cache: dict[tuple, Any] = {}
+def _make_header(n_elem: int, n_sub: int, n_freq: int, n_scat: int) -> str:
+    return (
+        f"#define N_ELEM {n_elem}\n"
+        f"#define N_SUB {n_sub}\n"
+        f"#define N_FREQ {n_freq}\n"
+        f"#define N_ES {n_elem * n_sub}\n"
+        f"#define N_SCAT {n_scat}\n"
+    )
+def _build_tx(n_elem: int, n_sub: int, n_freq: int, n_scat: int) -> Any:
+    """Build the tiled TX kernel (element-tiled progression with shared geometry)."""
+    key = ("tx_tiled", n_elem, n_sub, n_freq, n_scat)
+    if key not in _kernel_cache:
+        tg = _TX_TILE_TG
+        header = (
+            _make_header(n_elem, n_sub, n_freq, n_scat)
+            + f"#define TILE_SE {_TX_TILE_SE}\n"
+            + f"#define TG_SIZE {tg}\n"
+            + f"#define MAX_FPT (({n_freq} + {tg} - 1) / {tg})\n"
+        )
+        _kernel_cache[key] = mx.fast.metal_kernel(
+            name=f"simus_tx_tiled_{n_elem}_{n_sub}_{n_freq}_{n_scat}",
+            input_names=[
+                "scat_x",
+                "scat_z",
+                "elem_x",
+                "elem_z",
+                "theta_e",
+                "sub_dx",
+                "sub_dz",
+                "da_init_re",
+                "da_init_im",
+                "delay_phase_step",
+                "pp_re",
+                "pp_im",
+                "is_out",
+                "scalars",
+            ],
+            output_names=["tx_re", "tx_im"],
+            header=header,
+            source=_load_source("simus_tx_tiled.metal"),
+        )
+    return _kernel_cache[key]
+def _build_rx(n_elem: int, n_sub: int, n_freq: int, n_scat: int) -> Any:
+    """Build the SIMD-reduce RX kernel.
+    Groups SCAT_REDUCE scatterers per threadgroup.  Adjacent threads handle
+    the same element from different scatterers and use simd_shuffle_xor to
+    sum contributions before writing a single atomic.  Cuts atomic writes by
+    SCAT_REDUCE while preserving coalesced output access.
+    """
+    sr = _RX_SCAT_REDUCE
+    key = ("rx_simd", n_elem, n_sub, n_freq, n_scat, sr)
+    if key not in _kernel_cache:
+        header = _make_header(n_elem, n_sub, n_freq, n_scat) + f"#define SCAT_REDUCE {sr}\n"
+        _kernel_cache[key] = mx.fast.metal_kernel(
+            name=f"simus_rx_simd_{n_elem}_{n_sub}_{n_freq}_{n_scat}_{sr}",
+            input_names=[
+                "scat_x",
+                "scat_z",
+                "elem_x",
+                "elem_z",
+                "theta_e",
+                "sub_dx",
+                "sub_dz",
+                "tx_re",
+                "tx_im",
+                "probe",
+                "rc",
+                "scalars",
+            ],
+            output_names=["spect_re", "spect_im"],
+            header=header,
+            source=_load_source("simus_rx_simd.metal"),
+            atomic_outputs=True,
+        )
+    return _kernel_cache[key]
+# ---------------------------------------------------------------------------
+# Input preparation
+# ---------------------------------------------------------------------------
+def _prepare_common(
+    scatterers: mx.array,
+    rc: mx.array,
+    params: TransducerParams,
+    plan: SimusPlan,
+    medium: MediumParams,
+    delays_clean: mx.array,
+    tx_apodization: mx.array,
+) -> dict[str, Any]:
+    """Prepare all GPU-side inputs from plan and params."""
+    c = medium.speed_of_sound
+    alpha = medium.attenuation
+    n_elem = params.n_elements
+    n_sub = plan.n_sub
+    n_freq = int(plan.selected_freqs.shape[0])
+    n_scat = int(scatterers.shape[0])
+    xp_mx = cast(_ArrayNamespace, mx)
+    elem_pos, theta_e, apex_offset = element_positions(
+        n_elem,
+        params.pitch,
+        params.radius,
+        xp_mx,
+    )
+    if theta_e is None:
+        theta_e = mx.zeros(n_elem, dtype=mx.float32)
+    offsets = _subelement_centroids(params.element_width, n_sub, cast("Array", theta_e), xp_mx)
+    sub_dx = cast(mx.array, offsets[..., 0]).reshape(-1)
+    sub_dz = cast(mx.array, offsets[..., 1]).reshape(-1)
+    x_flat = scatterers[:, 0]
+    z_flat = scatterers[:, 1]
+    is_out = (z_flat < 0).astype(mx.float32)
+    if params.radius != inf:
+        in_arc = (x_flat**2 + (z_flat + apex_offset) ** 2) <= params.radius**2
+        is_out = mx.maximum(is_out, in_arc.astype(mx.float32))
+    freq_start = float(plan.selected_freqs[0])
+    freq_step = float(plan.selected_freqs[1] - plan.selected_freqs[0]) if n_freq > 1 else 0.0
+    ph_init = mx.array(2.0 * pi * freq_start, dtype=mx.float32) * delays_clean
+    da_init_re = (mx.cos(ph_init) * tx_apodization).astype(mx.float32)
+    da_init_im = (mx.sin(ph_init) * tx_apodization).astype(mx.float32)
+    ph_step = mx.array(2.0 * pi * freq_step, dtype=mx.float32) * delays_clean
+    delay_phase_step = ph_step.astype(mx.float32)
+    _pulse = cast(mx.array, plan.pulse_spectrum)
+    _probe = cast(mx.array, plan.probe_spectrum)
+    pp_complex = _pulse * _probe
+    pp_re = mx.real(pp_complex).astype(mx.float32)
+    pp_im = mx.imag(pp_complex).astype(mx.float32)
+    probe_real = _probe.astype(mx.float32)
+    wavenumber_init = 2.0 * pi * freq_start / c
+    attenuation_init = alpha / NEPER_TO_DB * freq_start / 1e6 * 1e2
+    wavenumber_step = 2.0 * pi * freq_step / c
+    attenuation_step = alpha / NEPER_TO_DB * freq_step / 1e6 * 1e2
+    min_distance = c / params.freq_center / 2.0
+    center_wavenumber = 2.0 * pi * params.freq_center / c
+    inv_n_sub = 1.0 / n_sub
+    scalars = mx.array(
+        [
+            wavenumber_init,
+            attenuation_init,
+            wavenumber_step,
+            attenuation_step,
+            min_distance,
+            plan.seg_length,
+            center_wavenumber,
+            inv_n_sub,
+        ],
+        dtype=mx.float32,
+    )
+    return {
+        "x_flat": x_flat.astype(mx.float32),
+        "z_flat": z_flat.astype(mx.float32),
+        "elem_x": elem_pos[:, 0].astype(mx.float32),
+        "elem_z": elem_pos[:, 1].astype(mx.float32),
+        "theta_e": theta_e.astype(mx.float32),
+        "sub_dx": sub_dx.astype(mx.float32),
+        "sub_dz": sub_dz.astype(mx.float32),
+        "da_init_re": da_init_re,
+        "da_init_im": da_init_im,
+        "delay_phase_step": delay_phase_step,
+        "pp_re": pp_re,
+        "pp_im": pp_im,
+        "probe_real": probe_real,
+        "rc": rc.astype(mx.float32),
+        "is_out": is_out,
+        "scalars": scalars,
+        "n_elem": n_elem,
+        "n_sub": n_sub,
+        "n_freq": n_freq,
+        "n_scat": n_scat,
+    }
+# ---------------------------------------------------------------------------
+# Dispatch
+# ---------------------------------------------------------------------------
+def _dispatch_split(d: dict[str, Any]) -> mx.array:
+    """Two-kernel path with automatic chunking for large scatterer counts.
+    Scatterers are processed in chunks that fit the TX intermediate buffer
+    within ``MAX_TX_INTERMEDIATE_BYTES``. Each chunk runs TX then RX, and
+    the per-chunk spectra are summed on the host.
+    """
+    n_elem, n_sub, n_freq, n_scat = d["n_elem"], d["n_sub"], d["n_freq"], d["n_scat"]
+    spect_size = n_freq * n_elem
+    # Use TX-throughput-optimal chunk sizes, capped by memory budget
+    bytes_per_scat = n_freq * 4 * 2  # float32 re + im
+    mem_chunk = max(1, MAX_TX_INTERMEDIATE_BYTES // bytes_per_scat)
+    perf_chunk = _TX_OPTIMAL_CHUNK.get(n_elem, _TX_DEFAULT_CHUNK)
+    chunk_size = min(mem_chunk, perf_chunk)
+    # Geometry arrays shared across all chunks
+    geom_tx = [
+        d["elem_x"],
+        d["elem_z"],
+        d["theta_e"],
+        d["sub_dx"],
+        d["sub_dz"],
+        d["da_init_re"],
+        d["da_init_im"],
+        d["delay_phase_step"],
+        d["pp_re"],
+        d["pp_im"],
+    ]
+    geom_rx = [d["elem_x"], d["elem_z"], d["theta_e"], d["sub_dx"], d["sub_dz"]]
+    probe = d["probe_real"]
+    scalars = d["scalars"]
+    # Build kernels for the standard chunk size (cached, compiled once per probe)
+    k_tx = _build_tx(n_elem, n_sub, n_freq, chunk_size)
+    k_rx = _build_rx(n_elem, n_sub, n_freq, chunk_size)
+    total_re = mx.zeros(spect_size, dtype=mx.float32)
+    total_im = mx.zeros(spect_size, dtype=mx.float32)
+    for start in range(0, n_scat, chunk_size):
+        end = min(start + chunk_size, n_scat)
+        cn = end - start
+        cx = d["x_flat"][start:end]
+        cz = d["z_flat"][start:end]
+        crc = d["rc"][start:end]
+        c_out = d["is_out"][start:end]
+        # TX kernel: one threadgroup per scatterer (tiled progression)
+        tg = _TX_TILE_TG
+        tx_out = k_tx(
+            inputs=[cx, cz, *geom_tx, c_out, scalars],
+            output_shapes=[(cn * n_freq,), (cn * n_freq,)],
+            output_dtypes=[mx.float32, mx.float32],
+            grid=(cn * tg, 1, 1),
+            threadgroup=(tg, 1, 1),
+        )
+        # RX kernel: SCAT_REDUCE scatterers per threadgroup, SIMD reduction
+        sr = _RX_SCAT_REDUCE
+        rx_tg = n_elem * sr
+        n_tgs = (cn + sr - 1) // sr
+        rx_out = k_rx(
+            inputs=[cx, cz, *geom_rx, tx_out[0], tx_out[1], probe, crc, scalars],
+            output_shapes=[(spect_size,), (spect_size,)],
+            output_dtypes=[mx.float32, mx.float32],
+            grid=(n_tgs * rx_tg, 1, 1),
+            threadgroup=(rx_tg, 1, 1),
+            init_value=0.0,
+        )
+        total_re = total_re + rx_out[0]
+        total_im = total_im + rx_out[1]
+    spect_re = total_re.reshape(n_freq, n_elem)
+    spect_im = total_im.reshape(n_freq, n_elem)
+    return (spect_re + 1j * spect_im).astype(mx.complex64)
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def simus_metal(
+    scatterers: mx.array,
+    rc: mx.array,
+    params: TransducerParams,
+    plan: SimusPlan,
+    medium: MediumParams,
+    delays_clean: mx.array,
+    tx_apodization: mx.array,
+) -> mx.array:
+    """Compute simus RF spectrum using custom Metal kernels.
+    Uses a two-kernel TX/RX split with automatic chunking for large
+    scatterer counts. Each chunk fits within the TX intermediate memory
+    budget, and chunk spectra are accumulated via simple addition.
+    Args:
+        scatterers: Scatterer positions (x, z) in meters. Shape ``(n_scat, 2)``.
+        rc: Reflection coefficients. Shape ``(n_scat,)``.
+        params: Transducer parameters.
+        plan: Precomputed frequency plan from ``simus_precompute``.
+        medium: Medium parameters.
+        delays_clean: NaN-cleaned delays. Shape ``(n_elements,)``.
+        tx_apodization: Per-element apodization (NaN-zeroed). Shape ``(n_elements,)``.
+    Returns:
+        Complex RF spectrum, shape ``(n_freq, n_elements)``.
+    """
+    d = _prepare_common(scatterers, rc, params, plan, medium, delays_clean, tx_apodization)
+    return _dispatch_split(d)

fast_simus/kernels/pfield.metal ADDED Viewed

@@ -0,0 +1,97 @@
+// Kernel body for pfield pressure-field computation via mx.fast.metal_kernel().
+//
+// This file contains ONLY the kernel body -- the code that runs inside the
+// auto-generated [[kernel]] void ...() { ... } wrapper.  mx.fast.metal_kernel()
+// injects input/output buffer parameters automatically based on input_names
+// and output_names.
+//
+// Compile-time constants (injected via header=):
+//   N_ELEM  -- number of transducer elements
+//   N_SUB   -- number of sub-elements per element
+//   N_FREQ  -- number of frequency samples
+//   N_ES    -- N_ELEM * N_SUB (total element-subelement pairs)
+    // Each thread handles one grid point: it reads grid_x[g] / grid_z[g] and
+    // writes exactly one value, pressure[g].
+    uint g = thread_position_in_grid.x;
+    float gx = grid_x[g];
+    float gz = grid_z[g];
+    // Unpack the scalar parameters. The slot order mirrors the array built on
+    // the Python side: wavenumber/attenuation at the first frequency, their
+    // per-step increments, minimum distance, segment length, center
+    // wavenumber, and the 1/n_sub^2 correction.
+    float kw_init    = scalars[0];
+    float alpha_init = scalars[1];
+    float kw_step    = scalars[2];
+    float alpha_step = scalars[3];
+    float min_dist   = scalars[4];
+    float seg_len    = scalars[5];
+    float center_kw  = scalars[6];
+    float eff_corr   = scalars[7];
+    // Per (element, sub-element) pair: cur holds the complex contribution at
+    // the current frequency, stp the complex factor that advances it by one
+    // frequency step.
+    float2 cur[N_ES];
+    float2 stp[N_ES];
+    // Pass 1: geometry and initial values for every pair.
+    for (int e = 0; e < N_ELEM; e++) {
+        float ex = elem_x[e];
+        float ez = elem_z[e];
+        float te = theta_e[e];
+        // Delay+apodization phasor at the first frequency (di) and its
+        // per-step rotation (ds), as real/imaginary parts.
+        float di_re = da_init_re[e], di_im = da_init_im[e];
+        float ds_re = da_step_re[e], ds_im = da_step_im[e];
+        for (int s = 0; s < N_SUB; s++) {
+            // Flat index into the (N_ELEM * N_SUB) sub-element arrays.
+            int idx = e * N_SUB + s;
+            float dx = gx - ex - sub_dx[idx];
+            float dz = gz - ez - sub_dz[idx];
+            float r = metal::precise::sqrt(dx * dx + dz * dz);
+            // Distance clamped from below to min_dist; used for amplitude and phase.
+            float rc = max(r, min_dist);
+            // Angle relative to element normal (unclipped distance for angle)
+            // The small offsets keep the ratio defined when r is zero.
+            float th = metal::precise::asin((dx + 1e-16f) / (r + 1e-16f)) - te;
+            // Soft baffle obliquity
+            // At or beyond 90 degrees off-normal the factor is a tiny constant.
+            float obliq = (fabs(th) >= M_PI_2_F) ? 1e-16f : metal::precise::cos(th);
+            // Phase init: obliq/sqrt(r) * exp(-alpha*r + j*wrap(k*r, 2pi))
+            float kwr = kw_init * rc;
+            float TWO_PI = 2.0f * M_PI_F;
+            // Reduce the phase into [0, 2*pi) before taking cos/sin.
+            float ph_wrap = kwr - TWO_PI * metal::precise::floor(kwr / TWO_PI);
+            float ai = obliq / metal::precise::sqrt(rc) * metal::precise::exp(-alpha_init * rc);
+            float2 pi_ = float2(ai * metal::precise::cos(ph_wrap),
+                                ai * metal::precise::sin(ph_wrap));
+            // Phase step: exp((-alpha_step + j*k_step) * r)
+            float as_ = metal::precise::exp(-alpha_step * rc);
+            float phs = kw_step * rc;
+            float2 ps_ = float2(as_ * metal::precise::cos(phs),
+                                as_ * metal::precise::sin(phs));
+            // Center-frequency sinc directivity
+            // sin(sa)/sa, with the value 1 substituted when sa is ~0.
+            float sa = center_kw * seg_len * 0.5f * metal::precise::sin(th);
+            float sv = (fabs(sa) < 1e-8f) ? 1.0f : metal::precise::sin(sa) / sa;
+            pi_ *= sv;
+            // Absorb delay+apodization (complex multiply)
+            cur[idx] = float2(
+                pi_.x * di_re - pi_.y * di_im,
+                pi_.x * di_im + pi_.y * di_re
+            );
+            stp[idx] = float2(
+                ps_.x * ds_re - ps_.y * ds_im,
+                ps_.x * ds_im + ps_.y * ds_re
+            );
+        }
+    }
+    // Frequency sweep: accumulate sum_f |pulse_probe_f|^2 * |sum_es phase_es_f|^2
+    float acc = 0.0f;
+    for (int f = 0; f < N_FREQ; f++) {
+        // Complex sum over all pairs at frequency f.
+        float sr = 0.0f, si = 0.0f;
+        for (int j = 0; j < N_ES; j++) {
+            // Add the current value first, then advance it to the next
+            // frequency, so f = 0 uses the initial values unchanged.
+            sr += cur[j].x;
+            si += cur[j].y;
+            float cr = cur[j].x, ci = cur[j].y;
+            float tr = stp[j].x, ti = stp[j].y;
+            cur[j] = float2(cr * tr - ci * ti, cr * ti + ci * tr);
+        }
+        acc += pp_mag_sq[f] * (sr * sr + si * si);
+    }
+    // Points flagged in is_out produce zero; all others get the scaled sum.
+    pressure[g] = (is_out[g] > 0.5f) ? 0.0f : acc * eff_corr;