PyPI - FastSIMUS - Versions diffs - 0.0.1__py3-none-any.whl - Mend

FastSIMUS 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

fast_simus/__init__.py +33 -0
fast_simus/_pfield_math.py +261 -0
fast_simus/_pfield_strategies.py +203 -0
fast_simus/_simus_strategies.py +210 -0
fast_simus/backends/__init__.py +1 -0
fast_simus/backends/mlx.py +101 -0
fast_simus/kernels/__init__.py +9 -0
fast_simus/kernels/cuda_simus.py +321 -0
fast_simus/kernels/metal_pfield.py +219 -0
fast_simus/kernels/metal_simus.py +377 -0
fast_simus/kernels/pfield.metal +97 -0
fast_simus/kernels/simus_fused.cu +332 -0
fast_simus/kernels/simus_rx_simd.metal +128 -0
fast_simus/kernels/simus_tx_tiled.metal +175 -0
fast_simus/medium_params.py +22 -0
fast_simus/pfield.py +475 -0
fast_simus/py.typed +0 -0
fast_simus/simus.py +567 -0
fast_simus/spectrum.py +107 -0
fast_simus/transducer_params.py +160 -0
fast_simus/transducer_presets.py +102 -0
fast_simus/tx_delay.py +276 -0
fast_simus/utils/__init__.py +5 -0
fast_simus/utils/_array_api.py +294 -0
fast_simus/utils/geometry.py +88 -0
fastsimus-0.0.1.dist-info/METADATA +594 -0
fastsimus-0.0.1.dist-info/RECORD +28 -0
fastsimus-0.0.1.dist-info/WHEEL +4 -0

fast_simus/kernels/simus_fused.cu ADDED Viewed

@@ -0,0 +1,332 @@
+/*
+ * Fused TX+RX SIMUS kernel -- v25c: register-resident TX, sv_arr in
+ *  shmem (correct, fp32).
+ *
+ * v25b cached sv_arr[B*ELEM_TILE] in registers (56 floats at B=7 ET=4).
+ * That competes with tk_re/tk_im for the 255-reg cap and forces 400 B
+ * spill of tk into local memory -- which NCU showed saturates L2 at
+ * 76.5 % throughput.
+ *
+ * v25c drops sv_arr from registers and reads GEO_STP_RX_RE/IM directly
+ * from shmem inside the cmul, freeing those 56 regs for tk. Cost: an
+ * extra shmem read per (si, et, fi) cmul advance -- bank-conflict free
+ * since all threads read the same (si, se) address (a broadcast).
+ *
+ * v25 with one structural fix: every loop that indexes tk_re/tk_im is
+ * now `for fi in 0..MAX_FPT` with `#pragma unroll` and predicated
+ * validity, so fi is statically known. v25 spilled tk entirely
+ * (576 B local mem at B=9 ET=4) because dynamic fi forced
+ * tk_re[si*MAX_FPT + fi] off-register. Static fi unrolling makes tk
+ * actually register-resident, eliminating the local-memory traffic.
+ *
+ * Why this is safe: in v11 each thread `lid` writes to sh_tx[si*N_FREQ + f]
+ * for f in {lid, lid+TG_SIZE, lid+2*TG_SIZE, ...} during Phase 2, and reads
+ * the same slots during Phase 3. There is no cross-thread sharing of TX --
+ * the shmem allocation was a temporary, not a broadcast surface.
+ *
+ * Storing TX in per-thread register arrays tk_re[B_SCAT*MAX_FPT],
+ * tk_im[B_SCAT*MAX_FPT] eliminates the dominant shmem cost
+ * (2*B_SCAT*N_FREQ floats; 60 KB at B=9 N_FREQ=854) without changing
+ * precision or arithmetic. Also lets us drop the pre-Phase-3 sync that
+ * was only needed to publish sh_tx writes.
+ *
+ * Per-thread TX register cost: 2*B_SCAT*MAX_FPT floats. For
+ * B=9 N_FREQ=854 TG=128 -> MAX_FPT=7 -> 126 floats. May force 1->something
+ * trade vs spill; expected to come out well ahead given shmem savings
+ * (76.5 KB -> 16.5 KB at B=9 ET=4 unlocks ~5 blk/SM vs v11's 1).
+ *
+ * Compile-time: N_ELEM, N_SUB, N_FREQ, N_ES, TILE_SE, TG_SIZE, MAX_FPT,
+ *               B_SCAT, ELEM_TILE
+ *
+ * Shared memory: (7*B_SCAT*N_ES + 3*N_ELEM) * 4 bytes
+ */
+/* Single-precision pi; defined here only when M_PI_F is not already defined. */
+#ifndef M_PI_F
+#define M_PI_F 3.14159265358979323846f
+#endif
+/* Minimal complex value: x holds the real part, y the imaginary part. */
+struct f2 { float x, y; };
+/* Complex product a*b = (a.x*b.x - a.y*b.y) + i*(a.x*b.y + a.y*b.x). */
+__device__ __forceinline__ f2 cmul(f2 a, f2 b) {
+    return {a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
+}
+/*
+ * Row accessors into the float array `shmem`, which must be in scope where
+ * the macros are expanded. The array starts with seven consecutive planes of
+ * B_SCAT rows x N_ES floats; GEO_<PLANE>(s) points at row `s` (batch slot)
+ * of that plane. Contents as written by the kernel's geometry phase:
+ *   GEO_AMP        obliquity * sinc term * rsqrt(clamped distance)
+ *   GEO_KW_R       kw_init    * clamped distance
+ *   GEO_KR_STEP    kw_step    * clamped distance
+ *   GEO_ALPHA_R    alpha_init * clamped distance
+ *   GEO_AR_STEP    alpha_step * clamped distance
+ *   GEO_STP_RX_RE  real part of the per-TG_SIZE-stride step factor
+ *   GEO_STP_RX_IM  imaginary part of the same step factor
+ */
+#define GEO_AMP(s)       (shmem + ((0*B_SCAT + (s)) * N_ES))
+#define GEO_KW_R(s)      (shmem + ((1*B_SCAT + (s)) * N_ES))
+#define GEO_KR_STEP(s)   (shmem + ((2*B_SCAT + (s)) * N_ES))
+#define GEO_ALPHA_R(s)   (shmem + ((3*B_SCAT + (s)) * N_ES))
+#define GEO_AR_STEP(s)   (shmem + ((4*B_SCAT + (s)) * N_ES))
+#define GEO_STP_RX_RE(s) (shmem + ((5*B_SCAT + (s)) * N_ES))
+#define GEO_STP_RX_IM(s) (shmem + ((6*B_SCAT + (s)) * N_ES))
+/*
+ * simus_fused_kernel -- fused TX + RX spectrum accumulation.
+ *
+ * Each block walks batches of up to B_SCAT scatterers, starting at
+ * blockIdx.x * B_SCAT and advancing by gridDim.x * B_SCAT. Per batch:
+ *   Phase 1: all threads cooperatively fill the GEO_* shared-memory rows
+ *            (one row per batch slot, one entry per sub-element).
+ *   Phase 2: each thread sums the TX contribution of every sub-element for
+ *            its own frequencies f = lid + fi*TG_SIZE and stores the result,
+ *            scaled by pp_re/pp_im and the scatterer coefficient, in the
+ *            per-thread arrays tk_re/tk_im.
+ *   Phase 3: for each group of ELEM_TILE sub-elements, the RX term is
+ *            multiplied by tk and probe[f] and added to spect_re/spect_im
+ *            with atomicAdd.
+ *
+ * Inputs (as indexed below):
+ *   scat_x, scat_z, rc_arr        per scatterer, index in [0, n_scat)
+ *   elem_x, elem_z, cos_te,
+ *   sin_neg_te                    per element, index se / N_SUB
+ *   sub_dx, sub_dz                per sub-element, index in [0, N_ES)
+ *   da_init_re, da_init_im, dps   per element, index in [0, N_ELEM)
+ *   pp_re, pp_im, probe           per frequency, index in [0, N_FREQ)
+ *   n_scat                        number of scatterers
+ *   kw_init, alpha_init           phase / attenuation factor at f index 0
+ *   kw_step, alpha_step           phase / attenuation increment per f index
+ *   min_dist                      lower clamp applied to the distance
+ *   seg_len, center_kw            used in the sinc argument
+ *   inv_nsub                      scale applied to the TX and RX sums
+ *   radius, apex_offset           when radius < 1e30, scatterers with
+ *                                 x^2 + (z + apex_offset)^2 <= radius^2 are
+ *                                 treated as out of field
+ * Outputs:
+ *   spect_re, spect_im            indexed [elem * N_FREQ + f]; values are
+ *                                 accumulated, never overwritten (presumably
+ *                                 the caller zeroes them first -- confirm).
+ *
+ * Launch assumptions (not checked here): blockDim.x == TG_SIZE, and dynamic
+ * shared memory of at least (7*B_SCAT*N_ES + 3*N_ELEM) floats.
+ */
+extern "C" __global__
+void simus_fused_kernel(
+    const float* __restrict__ scat_x,
+    const float* __restrict__ scat_z,
+    const float* __restrict__ rc_arr,
+    const float* __restrict__ elem_x,
+    const float* __restrict__ elem_z,
+    const float* __restrict__ cos_te,
+    const float* __restrict__ sin_neg_te,
+    const float* __restrict__ sub_dx,
+    const float* __restrict__ sub_dz,
+    const float* __restrict__ da_init_re,
+    const float* __restrict__ da_init_im,
+    const float* __restrict__ dps,
+    const float* __restrict__ pp_re,
+    const float* __restrict__ pp_im,
+    const float* __restrict__ probe,
+    float* __restrict__ spect_re,
+    float* __restrict__ spect_im,
+    int   n_scat,
+    float kw_init, float alpha_init,
+    float kw_step, float alpha_step,
+    float min_dist, float seg_len,
+    float center_kw, float inv_nsub,
+    float radius, float apex_offset
+) {
+    int lid = threadIdx.x;
+    float lid_f = (float)lid;
+    float stride_f = (float)TG_SIZE;
+    extern __shared__ float shmem[];
+    /* TX moved to per-thread registers; shmem only holds geometry + per-elem broadcast. */
+    float tk_re[B_SCAT * MAX_FPT];
+    float tk_im[B_SCAT * MAX_FPT];
+    /* Per-element arrays live directly after the seven GEO_* planes. */
+    float* sh_da_init_re_l = shmem + 7 * B_SCAT * N_ES;
+    float* sh_da_init_im_l = sh_da_init_re_l + N_ELEM;
+    float* sh_dps_l        = sh_da_init_im_l + N_ELEM;
+    for (int e = lid; e < N_ELEM; e += TG_SIZE) {
+        sh_da_init_re_l[e] = da_init_re[e];
+        sh_da_init_im_l[e] = da_init_im[e];
+        sh_dps_l[e]        = dps[e];
+    }
+    __syncthreads();
+    const int N_TILES = (N_ES + TILE_SE - 1) / TILE_SE;
+    const int N_ELEM_GROUPS = (N_ES + ELEM_TILE - 1) / ELEM_TILE;
+    bool out_flag[B_SCAT];
+    for (int scat_base = blockIdx.x * B_SCAT;
+         scat_base < n_scat;
+         scat_base += gridDim.x * B_SCAT)
+    {
+        /* Number of real scatterers in this batch (>= 1, < B_SCAT only for
+         * the final partial batch). */
+        int actual_b = B_SCAT;
+        if (scat_base + B_SCAT > n_scat)
+            actual_b = n_scat - scat_base;
+        /* Zero-init register TX so si >= actual_b reads finite zeros in
+         * Phase 3 (cv is also forced to 0 there, but 0*NaN would propagate). */
+        #pragma unroll
+        for (int si = 0; si < B_SCAT; si++) {
+            #pragma unroll
+            for (int fi = 0; fi < MAX_FPT; fi++) {
+                tk_re[si * MAX_FPT + fi] = 0.0f;
+                tk_im[si * MAX_FPT + fi] = 0.0f;
+            }
+        }
+        /* Partial batch: Phase 1 never writes the RX step rows of slots
+         * si >= actual_b, yet Phase 3 still multiplies their (zero) cv state
+         * by those rows. Shared memory that was never written may hold
+         * NaN/Inf, and 0*NaN = NaN would then flow through cv into the
+         * atomicAdd. Zero the unused rows; the __syncthreads() inside the
+         * Phase 1+2 loop (which runs at least once, as actual_b >= 1)
+         * publishes these writes before Phase 3 reads them. */
+        for (int si = actual_b; si < B_SCAT; si++) {
+            for (int se = lid; se < N_ES; se += TG_SIZE) {
+                GEO_STP_RX_RE(si)[se] = 0.0f;
+                GEO_STP_RX_IM(si)[se] = 0.0f;
+            }
+        }
+        /* ---- Phase 1+2: geometry + TX for each scatterer in batch ---- */
+        for (int si = 0; si < actual_b; si++) {
+            int scat_idx = scat_base + si;
+            float sx = scat_x[scat_idx];
+            float sz = scat_z[scat_idx];
+            float rc = rc_arr[scat_idx];
+            /* Out of field: negative z, or inside the circle of the given
+             * radius centred at (0, -apex_offset) when radius < 1e30. */
+            bool is_out = (sz < 0.0f);
+            if (radius < 1e30f) {
+                float da = sx, db = sz + apex_offset;
+                is_out = is_out || ((da*da + db*db) <= radius*radius);
+            }
+            out_flag[si] = is_out;
+            for (int se = lid; se < N_ES; se += TG_SIZE) {
+                int elem = se / N_SUB;
+                float ex_ = elem_x[elem], ez_ = elem_z[elem];
+                float ct = cos_te[elem], snt = sin_neg_te[elem];
+                float dx = sx - ex_ - sub_dx[se];
+                float dz = sz - ez_ - sub_dz[se];
+                float r2 = dx*dx + dz*dz;
+                /* The 1e-30 bias keeps rsqrtf finite when r2 == 0. */
+                float inv_r = rsqrtf(r2 + 1e-30f);
+                float r = r2 * inv_r;
+                float rc_ = fmaxf(r, min_dist);
+                float sin_th = (dx*ct + dz*snt) * inv_r;
+                float cos_th = (dz*ct - dx*snt) * inv_r;
+                float obliq = (cos_th <= 0.0f) ? 1e-16f : cos_th;
+                float sa = center_kw * seg_len * 0.5f * sin_th;
+                /* sinc(sa), with the removable singularity at 0 handled. */
+                float sv = (fabsf(sa) < 1e-8f) ? 1.0f : __fdividef(__sinf(sa), sa);
+                GEO_AMP(si)[se]       = obliq * sv * rsqrtf(rc_);
+                GEO_KW_R(si)[se]      = kw_init * rc_;
+                GEO_KR_STEP(si)[se]   = kw_step * rc_;
+                GEO_ALPHA_R(si)[se]   = alpha_init * rc_;
+                GEO_AR_STEP(si)[se]   = alpha_step * rc_;
+                /* Factor that advances a phasor by TG_SIZE frequency steps. */
+                float stp_phase = stride_f * kw_step * rc_;
+                float stp_alpha = stride_f * alpha_step * rc_;
+                float sm = expf(-stp_alpha);
+                float sp_re, sp_im;
+                __sincosf(stp_phase, &sp_im, &sp_re);
+                sp_re *= sm; sp_im *= sm;
+                GEO_STP_RX_RE(si)[se] = sp_re;
+                GEO_STP_RX_IM(si)[se] = sp_im;
+            }
+            /* Publish this slot's geometry rows to the whole block. */
+            __syncthreads();
+            if (is_out) {
+                #pragma unroll
+                for (int fi = 0; fi < MAX_FPT; fi++) {
+                    tk_re[si * MAX_FPT + fi] = 0.0f;
+                    tk_im[si * MAX_FPT + fi] = 0.0f;
+                }
+                continue;
+            }
+            /* Phase 2: TX sweep */
+            float sum_re[MAX_FPT], sum_im[MAX_FPT];
+            for (int i = 0; i < MAX_FPT; i++) { sum_re[i] = 0.0f; sum_im[i] = 0.0f; }
+            for (int tile = 0; tile < N_TILES; tile++) {
+                int ts = tile * TILE_SE;
+                int te = ts + TILE_SE;
+                if (te > N_ES) te = N_ES;
+                int tl = te - ts;
+                /* cv: phasor at this thread's first frequency (f = lid);
+                 * sv: factor advancing it by TG_SIZE frequency steps. */
+                f2 cv[TILE_SE], sv[TILE_SE];
+                #pragma unroll
+                for (int j = 0; j < TILE_SE; j++) {
+                    /* Pad the tail of the last tile with neutral values. */
+                    if (j >= tl) { cv[j] = {0.0f, 0.0f}; sv[j] = {1.0f, 0.0f}; continue; }
+                    int se = ts + j, em = se / N_SUB;
+                    float ph = GEO_KW_R(si)[se] + lid_f * GEO_KR_STEP(si)[se];
+                    float av = GEO_ALPHA_R(si)[se] + lid_f * GEO_AR_STEP(si)[se];
+                    float ai = GEO_AMP(si)[se] * expf(-av);
+                    float vr, vi;
+                    __sincosf(ph, &vi, &vr);
+                    vr *= ai; vi *= ai;
+                    float dp = lid_f * sh_dps_l[em];
+                    float dr, di;
+                    __sincosf(dp, &di, &dr);
+                    float dvr = sh_da_init_re_l[em]*dr - sh_da_init_im_l[em]*di;
+                    float dvi = sh_da_init_re_l[em]*di + sh_da_init_im_l[em]*dr;
+                    cv[j] = {vr*dvr - vi*dvi, vr*dvi + vi*dvr};
+                    float sp_re = GEO_STP_RX_RE(si)[se];
+                    float sp_im = GEO_STP_RX_IM(si)[se];
+                    float das_phase = stride_f * sh_dps_l[em];
+                    float das_re, das_im;
+                    __sincosf(das_phase, &das_im, &das_re);
+                    sv[j] = {sp_re*das_re - sp_im*das_im, sp_re*das_im + sp_im*das_re};
+                }
+                #pragma unroll
+                for (int fi = 0; fi < MAX_FPT; fi++) {
+                    int f_chk = lid + fi * TG_SIZE;
+                    if (f_chk >= N_FREQ) break;
+                    #pragma unroll
+                    for (int j = 0; j < TILE_SE; j++) {
+                        sum_re[fi] += cv[j].x; sum_im[fi] += cv[j].y;
+                        cv[j] = cmul(cv[j], sv[j]);
+                    }
+                }
+            }
+            #pragma unroll
+            for (int fi = 0; fi < MAX_FPT; fi++) {
+                int f = lid + fi * TG_SIZE;
+                bool valid = (f < N_FREQ);
+                float tr = sum_re[fi] * inv_nsub;
+                float ti = sum_im[fi] * inv_nsub;
+                float ppr = valid ? pp_re[f] : 0.0f;
+                float ppi = valid ? pp_im[f] : 0.0f;
+                tk_re[si * MAX_FPT + fi] = valid ? (ppr*tr - ppi*ti) * rc : 0.0f;
+                tk_im[si * MAX_FPT + fi] = valid ? (ppr*ti + ppi*tr) * rc : 0.0f;
+            }
+            /* No __syncthreads here -- TX is private to this thread. */
+        }
+        /* ---- Phase 3: element-tiled RX with B_SCAT accumulation ---- */
+        for (int eg = 0; eg < N_ELEM_GROUPS; eg++) {
+            int se_base = eg * ELEM_TILE;
+            int etl = ELEM_TILE;
+            if (se_base + etl > N_ES) etl = N_ES - se_base;
+            /* Initialize B_SCAT * ELEM_TILE RX states. sv_arr stays in shmem
+             * (re-read per cmul advance) to free registers for tk. */
+            f2 cv[B_SCAT * ELEM_TILE];
+            #pragma unroll
+            for (int si = 0; si < B_SCAT; si++) {
+                #pragma unroll
+                for (int et = 0; et < ELEM_TILE; et++) {
+                    int idx = si * ELEM_TILE + et;
+                    /* actual_b is tested first, so out_flag[si] is only read
+                     * for slots that Phase 1 actually set. */
+                    if (si >= actual_b || out_flag[si] || et >= etl) {
+                        cv[idx] = {0.0f, 0.0f};
+                        continue;
+                    }
+                    int se = se_base + et;
+                    float ph = GEO_KW_R(si)[se] + lid_f * GEO_KR_STEP(si)[se];
+                    float av = GEO_ALPHA_R(si)[se] + lid_f * GEO_AR_STEP(si)[se];
+                    float ai = GEO_AMP(si)[se] * expf(-av);
+                    float vr, vi;
+                    __sincosf(ph, &vi, &vr);
+                    cv[idx] = {vr * ai, vi * ai};
+                }
+            }
+            /* Sweep frequencies with B_SCAT * ELEM_TILE independent chains.
+             * fi is statically unrolled so tk_re[si*MAX_FPT + fi] uses a
+             * compile-time index, keeping tk truly register-resident. */
+            #pragma unroll
+            for (int fi = 0; fi < MAX_FPT; fi++) {
+                int f = lid + fi * TG_SIZE;
+                bool valid = (f < N_FREQ);
+                float pf = valid ? probe[f] : 0.0f;
+                float acc_re[ELEM_TILE];
+                float acc_im[ELEM_TILE];
+                #pragma unroll
+                for (int et = 0; et < ELEM_TILE; et++) {
+                    acc_re[et] = 0.0f;
+                    acc_im[et] = 0.0f;
+                }
+                #pragma unroll
+                for (int si = 0; si < B_SCAT; si++) {
+                    float tkr = tk_re[si * MAX_FPT + fi];
+                    float tki = tk_im[si * MAX_FPT + fi];
+                    #pragma unroll
+                    for (int et = 0; et < ELEM_TILE; et++) {
+                        int idx = si * ELEM_TILE + et;
+                        int se = se_base + et;
+                        float rr = cv[idx].x * inv_nsub;
+                        float ri = cv[idx].y * inv_nsub;
+                        acc_re[et] += (tkr*rr - tki*ri) * pf;
+                        acc_im[et] += (tkr*ri + tki*rr) * pf;
+                        /* Unpredicated read: inactive chains rely on the step
+                         * rows holding finite values (see the zero fill at
+                         * the top of the batch). */
+                        f2 sv_local = {GEO_STP_RX_RE(si)[se], GEO_STP_RX_IM(si)[se]};
+                        cv[idx] = cmul(cv[idx], sv_local);
+                    }
+                }
+                if (!valid) continue;
+                #pragma unroll
+                for (int et = 0; et < ELEM_TILE; et++) {
+                    if (et >= etl) break;
+                    int elem = (se_base + et) / N_SUB;
+                    atomicAdd(&spect_re[elem * N_FREQ + f], acc_re[et]);
+                    atomicAdd(&spect_im[elem * N_FREQ + f], acc_im[et]);
+                }
+            }
+        }
+        /* All Phase 3 shmem reads must finish before the next batch
+         * overwrites the geometry rows. */
+        __syncthreads();
+    }
+}

fast_simus/kernels/simus_rx_simd.metal ADDED Viewed

@@ -0,0 +1,128 @@
+// Kernel B: SIMD-reduce RX -- multiple scatterers per threadgroup with
+// cross-scatterer SIMD reduction to cut atomic writes by SCAT_REDUCE.
+//
+// Thread layout: tid = elem_idx * SCAT_REDUCE + scat_batch
+//   - Adjacent threads handle the SAME element from DIFFERENT scatterers
+//   - Within a SIMD group (32 threads): 32/SR elements * SR scatterers
+//   - simd_shuffle_xor reduces groups of SR threads (same element, different scat)
+//   - Only scat_batch==0 threads write atomics -> SR fewer atomics
+//
+// Coalescing: writing threads (scat_batch==0) are at stride SR in the SIMD group.
+// They write to consecutive element addresses -> coalesced atomics.
+//
+// TG = N_ELEM * SCAT_REDUCE (e.g., 64*2 = 128 for P4-2v with SR=2)
+//
+// Compile-time constants:
+//   N_ELEM, N_SUB, N_FREQ, N_SCAT, SCAT_REDUCE
+    // NOTE(review): this file is a kernel *body* only. The buffers it reads
+    // (scat_x, scat_z, rc, elem_x, elem_z, theta_e, sub_dx, sub_dz, scalars,
+    // tx_re, tx_im, probe), the atomics it updates (spect_re, spect_im) and
+    // the thread-position values are declared outside this fragment --
+    // presumably by the host-side kernel wrapper; confirm against the caller.
+    //
+    // Thread mapping: lid = elem_idx * SCAT_REDUCE + scat_batch, and each
+    // threadgroup covers SCAT_REDUCE consecutive scatterers.
+    uint tg_scat_base = threadgroup_position_in_grid.x * SCAT_REDUCE;
+    uint lid = thread_position_in_threadgroup.x;
+    uint scat_batch = lid % SCAT_REDUCE;
+    uint elem_idx = lid / SCAT_REDUCE;
+    uint scat_idx = tg_scat_base + scat_batch;
+    // Threads past the last scatterer or element stay alive (they take part
+    // in the shuffles below) but contribute zero.
+    bool valid = (scat_idx < (uint)N_SCAT && elem_idx < (uint)N_ELEM);
+    float sx, sz, rc_i, ex, ez, te;
+    float kw_init, alpha_init, kw_step, alpha_step, min_dist, seg_len, center_kw, inv_nsub;
+    // Scalar parameters are packed into `scalars` in this fixed order.
+    kw_init    = scalars[0];
+    alpha_init = scalars[1];
+    kw_step    = scalars[2];
+    alpha_step = scalars[3];
+    min_dist   = scalars[4];
+    seg_len    = scalars[5];
+    center_kw  = scalars[6];
+    inv_nsub   = scalars[7];
+    // Per sub-element: cur = phasor at the current frequency index,
+    // stp_arr = factor that advances it by one frequency index.
+    // Both are only written and read when `valid` is true.
+    float2 cur[N_SUB];
+    float2 stp_arr[N_SUB];
+    if (valid) {
+        sx = scat_x[scat_idx];
+        sz = scat_z[scat_idx];
+        rc_i = rc[scat_idx];
+        ex = elem_x[elem_idx];
+        ez = elem_z[elem_idx];
+        te = theta_e[elem_idx];
+        for (int s = 0; s < N_SUB; s++) {
+            int sub_idx = elem_idx * N_SUB + s;
+            float dx = sx - ex - sub_dx[sub_idx];
+            float dz = sz - ez - sub_dz[sub_idx];
+            float r = metal::precise::sqrt(dx * dx + dz * dz);
+            // Clamp the distance from below to min_dist.
+            float rc_ = max(r, min_dist);
+            // Angle relative to the element orientation te; the 1e-16 terms
+            // keep the ratio defined when r == 0.
+            float th = metal::precise::asin((dx + 1e-16f) / (r + 1e-16f)) - te;
+            // Obliquity factor: cos(th), replaced by a tiny constant once
+            // |th| reaches pi/2.
+            float obliq = (fabs(th) >= M_PI_2_F) ? 1e-16f : metal::precise::cos(th);
+            float kwr = kw_init * rc_;
+            float TWO_PI = 2.0f * M_PI_F;
+            // Reduce the initial phase to [0, 2*pi) before taking sin/cos.
+            float ph_wrap = kwr - TWO_PI * metal::precise::floor(kwr / TWO_PI);
+            float ai = obliq / metal::precise::sqrt(rc_) * metal::precise::exp(-alpha_init * rc_);
+            float2 pi_ = float2(ai * metal::precise::cos(ph_wrap),
+                                ai * metal::precise::sin(ph_wrap));
+            float as_ = metal::precise::exp(-alpha_step * rc_);
+            float phs = kw_step * rc_;
+            float2 ps_ = float2(as_ * metal::precise::cos(phs),
+                                as_ * metal::precise::sin(phs));
+            float sa = center_kw * seg_len * 0.5f * metal::precise::sin(th);
+            // sinc(sa), with the removable singularity at 0 handled.
+            float sv = (fabs(sa) < 1e-8f) ? 1.0f : metal::precise::sin(sa) / sa;
+            pi_ *= sv;
+            cur[s] = pi_;
+            stp_arr[s] = ps_;
+        }
+    }
+    // Every thread walks all N_FREQ frequencies in order, advancing its
+    // phasors by one step per iteration.
+    for (int f = 0; f < N_FREQ; f++) {
+        float c_re = 0.0f, c_im = 0.0f;
+        if (valid) {
+            float sr = 0.0f, si = 0.0f;
+            for (int s = 0; s < N_SUB; s++) {
+                sr += cur[s].x;
+                si += cur[s].y;
+                float cr = cur[s].x, ci = cur[s].y;
+                float tr = stp_arr[s].x, ti = stp_arr[s].y;
+                // Complex multiply: cur *= step.
+                cur[s] = float2(cr * tr - ci * ti, cr * ti + ci * tr);
+            }
+            float rp_re = sr * inv_nsub;
+            float rp_im = si * inv_nsub;
+            // tx_re/tx_im are read with index scat_idx * N_FREQ + f.
+            int tx_idx = scat_idx * N_FREQ + f;
+            float pk_re = tx_re[tx_idx];
+            float pk_im = tx_im[tx_idx];
+            float probe_f = probe[f];
+            c_re = rc_i * (pk_re * rp_re - pk_im * rp_im) * probe_f;
+            c_im = rc_i * (pk_re * rp_im + pk_im * rp_re) * probe_f;
+        }
+        // SIMD reduce across SCAT_REDUCE scatterers for the same element.
+        // All threads participate (invalid threads contribute 0).
+        // NOTE(review): only xor offsets 1, 2, 4 and 8 are emitted, so the
+        // reduction looks complete only for SCAT_REDUCE in {1, 2, 4, 8, 16}
+        // -- confirm that the host never requests other values.
+#if SCAT_REDUCE >= 2
+        c_re += simd_shuffle_xor(c_re, 1);
+        c_im += simd_shuffle_xor(c_im, 1);
+#endif
+#if SCAT_REDUCE >= 4
+        c_re += simd_shuffle_xor(c_re, 2);
+        c_im += simd_shuffle_xor(c_im, 2);
+#endif
+#if SCAT_REDUCE >= 8
+        c_re += simd_shuffle_xor(c_re, 4);
+        c_im += simd_shuffle_xor(c_im, 4);
+#endif
+#if SCAT_REDUCE >= 16
+        c_re += simd_shuffle_xor(c_re, 8);
+        c_im += simd_shuffle_xor(c_im, 8);
+#endif
+        // One atomic per element and frequency; output index is
+        // f * N_ELEM + elem_idx.
+        if (scat_batch == 0 && valid) {
+            int offset = f * N_ELEM + elem_idx;
+            atomic_fetch_add_explicit(&spect_re[offset], c_re, memory_order_relaxed);
+            atomic_fetch_add_explicit(&spect_im[offset], c_im, memory_order_relaxed);
+        }
+    }

fast_simus/kernels/simus_tx_tiled.metal ADDED Viewed

@@ -0,0 +1,175 @@
+// Kernel: Element-tiled progression with shared-memory geometry.
+// One threadgroup per scatterer; threads cooperatively compute geometry
+// AND da-absorbed stride steps into shared memory, then each thread
+// processes sub-element tiles with geometric progression (ALU-only inner loop).
+//
+// Low register pressure: only TILE_SE*2 float2 per thread (256 bytes for
+// TILE_SE=16). ALU-only inner loop (0 SFU calls in the frequency sweep).
+//
+// Shared memory layout:
+//   amp[N_ES]         frequency-independent amplitude
+//   kw_r[N_ES]        kw_init * r  (base phase)
+//   kr_step[N_ES]     kw_step * r  (phase increment per freq index)
+//   alpha_r[N_ES]     alpha_init * r  (base attenuation)
+//   ar_step[N_ES]     alpha_step * r  (attenuation increment per freq)
+//   stp[N_ES]         float2 stride step, da-absorbed (same for all threads)
+//   da_init_re[N_ELEM] delay+apod init real part
+//   da_init_im[N_ELEM] delay+apod init imag part
+//   dps[N_ELEM]       delay_phase_step per element
+//
+//   Total: N_ES*(5*4 + 8) + N_ELEM*3*4 bytes
+//          = 64*(20+8) + 64*12 = 1792 + 768 = 2560 bytes (N_ES=64)
+//
+// Output: tx_re[N_SCAT * N_FREQ], tx_im[N_SCAT * N_FREQ]
+//
+// Compile-time constants:
+//   N_ELEM, N_SUB, N_FREQ, N_ES, N_SCAT, TILE_SE, TG_SIZE, MAX_FPT
+    // NOTE(review): this file is a kernel *body* only. The buffers it reads
+    // (scat_x, scat_z, is_out, scalars, elem_x, elem_z, theta_e, sub_dx,
+    // sub_dz, delay_phase_step, da_init_re, da_init_im, pp_re, pp_im), the
+    // outputs (tx_re, tx_im) and the thread-position values are declared
+    // outside this fragment -- presumably by the host-side kernel wrapper;
+    // confirm against the caller.
+    //
+    // Threadgroup-shared geometry, one entry per sub-element (N_ES) or per
+    // element (N_ELEM); filled in Phase 1, read in Phase 2.
+    threadgroup float sh_amp[N_ES];
+    threadgroup float sh_kw_r[N_ES];
+    threadgroup float sh_kr_step[N_ES];
+    threadgroup float sh_alpha_r[N_ES];
+    threadgroup float sh_ar_step[N_ES];
+    threadgroup float2 sh_stp[N_ES];
+    threadgroup float sh_da_init_re[N_ELEM];
+    threadgroup float sh_da_init_im[N_ELEM];
+    threadgroup float sh_dps[N_ELEM];
+    // One threadgroup per scatterer.
+    uint scat_idx = threadgroup_position_in_grid.x;
+    uint lid = thread_position_in_threadgroup.x;
+    uint tpg = threads_per_threadgroup.x;
+    // scat_idx is the same for every thread of the threadgroup, so either
+    // all threads return here or none do (no thread skips the barrier below
+    // while others wait on it).
+    if (scat_idx >= N_SCAT) return;
+    float sx = scat_x[scat_idx];
+    float sz = scat_z[scat_idx];
+    // Out-of-field flag is read as a float and compared against 0.5 in
+    // Phase 3.
+    float is_out_i = is_out[scat_idx];
+    // Scalar parameters are packed into `scalars` in this fixed order.
+    float kw_init_v    = scalars[0];
+    float alpha_init_v = scalars[1];
+    float kw_step_v    = scalars[2];
+    float alpha_step_v = scalars[3];
+    float min_dist     = scalars[4];
+    float seg_len      = scalars[5];
+    float center_kw    = scalars[6];
+    float inv_nsub     = scalars[7];
+    float lid_f    = float(lid);
+    // NOTE(review): the stride step is built from the compile-time TG_SIZE,
+    // while the loops below stride by the runtime tpg. The two agree only if
+    // the kernel is dispatched with exactly TG_SIZE threads per threadgroup
+    // -- confirm.
+    float stride_f = float(TG_SIZE);
+    // ---- Phase 1A: Cooperatively compute per-sub-element geometry ----
+    for (uint se = lid; se < (uint)N_ES; se += tpg) {
+        int elem = se / N_SUB;
+        // Same value as se (elem * N_SUB + se % N_SUB).
+        int sub_global = elem * N_SUB + (se % N_SUB);
+        float ex = elem_x[elem];
+        float ez = elem_z[elem];
+        float te = theta_e[elem];
+        float dx = sx - ex - sub_dx[sub_global];
+        float dz = sz - ez - sub_dz[sub_global];
+        float r = metal::precise::sqrt(dx * dx + dz * dz);
+        // Clamp the distance from below to min_dist.
+        float rc_ = max(r, min_dist);
+        // Angle relative to the element orientation te; the 1e-16 terms keep
+        // the ratio defined when r == 0.
+        float th = metal::precise::asin((dx + 1e-16f) / (r + 1e-16f)) - te;
+        // Obliquity factor: cos(th), replaced by a tiny constant once |th|
+        // reaches pi/2.
+        float obliq = (fabs(th) >= M_PI_2_F) ? 1e-16f : metal::fast::cos(th);
+        float sa = center_kw * seg_len * 0.5f * metal::fast::sin(th);
+        // sinc(sa), with the removable singularity at 0 handled.
+        float sv = (fabs(sa) < 1e-8f) ? 1.0f : metal::fast::sin(sa) / sa;
+        sh_amp[se]     = obliq * sv / metal::precise::sqrt(rc_);
+        sh_kw_r[se]    = kw_init_v * rc_;
+        sh_kr_step[se] = kw_step_v * rc_;
+        sh_alpha_r[se] = alpha_init_v * rc_;
+        sh_ar_step[se] = alpha_step_v * rc_;
+        // Precompute da-absorbed stride step (same for ALL threads).
+        // stp = exp((-alpha_step*stride + j*kw_step*stride) * r) * da_step^stride
+        float stp_phase = stride_f * kw_step_v * rc_;
+        float stp_alpha = stride_f * alpha_step_v * rc_;
+        float sm = metal::fast::exp(-stp_alpha);
+        float sp_re = sm * metal::fast::cos(stp_phase);
+        float sp_im = sm * metal::fast::sin(stp_phase);
+        float das_phase = stride_f * delay_phase_step[elem];
+        float das_re = metal::fast::cos(das_phase);
+        float das_im = metal::fast::sin(das_phase);
+        // Complex product of the propagation step and the delay step.
+        sh_stp[se] = float2(sp_re * das_re - sp_im * das_im,
+                            sp_re * das_im + sp_im * das_re);
+    }
+    // ---- Phase 1B: Cooperatively load per-element da info ----
+    for (uint e = lid; e < (uint)N_ELEM; e += tpg) {
+        sh_da_init_re[e] = da_init_re[e];
+        sh_da_init_im[e] = da_init_im[e];
+        sh_dps[e] = delay_phase_step[e];
+    }
+    // Publish the Phase 1 writes to every thread of the threadgroup.
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    // ---- Phase 2: Tiled progression sweep ----
+    constexpr int N_TILES = (N_ES + TILE_SE - 1) / TILE_SE;
+    float sum_re[MAX_FPT];
+    float sum_im[MAX_FPT];
+    // Number of frequencies owned by this thread (f = lid, lid + tpg, ...).
+    // NOTE(review): sum_re/sum_im are indexed up to my_n_freq - 1, so this
+    // relies on my_n_freq <= MAX_FPT -- presumably guaranteed by how the
+    // host chooses MAX_FPT and the threadgroup size; confirm.
+    int my_n_freq = 0;
+    for (uint f = lid; f < (uint)N_FREQ; f += tpg) my_n_freq++;
+    for (int i = 0; i < MAX_FPT; i++) { sum_re[i] = 0.0f; sum_im[i] = 0.0f; }
+    for (int tile = 0; tile < N_TILES; tile++) {
+        int tile_start = tile * TILE_SE;
+        int tile_end = min(tile_start + TILE_SE, N_ES);
+        int tile_len = tile_end - tile_start;
+        // Entries at or beyond tile_len are left unset and never read.
+        float2 cur_t[TILE_SE];
+        float2 stp_t[TILE_SE];
+        // Init cur at this thread's starting frequency, read stp from shared
+        for (int te = 0; te < tile_len; te++) {
+            int se = tile_start + te;
+            int elem = se / N_SUB;
+            float phase = sh_kw_r[se] + lid_f * sh_kr_step[se];
+            float alpha_val = sh_alpha_r[se] + lid_f * sh_ar_step[se];
+            float ai = sh_amp[se] * metal::fast::exp(-alpha_val);
+            float pi_re = ai * metal::fast::cos(phase);
+            float pi_im = ai * metal::fast::sin(phase);
+            float da_ph = lid_f * sh_dps[elem];
+            float da_cs_re = metal::fast::cos(da_ph);
+            float da_cs_im = metal::fast::sin(da_ph);
+            // da = da_init * exp(j * lid * dps), as a complex product.
+            float da_re = sh_da_init_re[elem] * da_cs_re - sh_da_init_im[elem] * da_cs_im;
+            float da_im = sh_da_init_re[elem] * da_cs_im + sh_da_init_im[elem] * da_cs_re;
+            cur_t[te] = float2(pi_re * da_re - pi_im * da_im,
+                               pi_re * da_im + pi_im * da_re);
+            stp_t[te] = sh_stp[se];
+        }
+        // Sweep: ALU-only inner loop
+        for (int fi = 0; fi < my_n_freq; fi++) {
+            for (int te = 0; te < tile_len; te++) {
+                sum_re[fi] += cur_t[te].x;
+                sum_im[fi] += cur_t[te].y;
+                float cr = cur_t[te].x, ci = cur_t[te].y;
+                float tr = stp_t[te].x, ti = stp_t[te].y;
+                // Complex multiply: cur *= step (advance to the next owned
+                // frequency).
+                cur_t[te] = float2(cr * tr - ci * ti, cr * ti + ci * tr);
+            }
+        }
+    }
+    // ---- Phase 3: Apply inv_nsub, pulse*probe spectrum, write output ----
+    int fi = 0;
+    for (uint f = lid; f < (uint)N_FREQ; f += tpg, fi++) {
+        float tx_re_v = sum_re[fi] * inv_nsub;
+        float tx_im_v = sum_im[fi] * inv_nsub;
+        float pp_re_f = pp_re[f], pp_im_f = pp_im[f];
+        // Complex product (pp_re + j*pp_im) * (tx_re_v + j*tx_im_v).
+        float pk_re = pp_re_f * tx_re_v - pp_im_f * tx_im_v;
+        float pk_im = pp_re_f * tx_im_v + pp_im_f * tx_re_v;
+        // Out-of-field scatterers still write their slots, but with zeros.
+        if (is_out_i > 0.5f) { pk_re = 0.0f; pk_im = 0.0f; }
+        int out_idx = scat_idx * N_FREQ + f;
+        tx_re[out_idx] = pk_re;
+        tx_im[out_idx] = pk_im;
+    }

fast_simus/medium_params.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""Medium parameter definitions for ultrasound propagation."""
+from pydantic import BaseModel, ConfigDict, Field
+class MediumParams(BaseModel):
+    """Medium parameters for ultrasound propagation.
+    This class encapsulates physical properties of the propagation medium
+    (e.g., soft tissue, water) that affect ultrasound wave propagation.
+    """
+    # frozen=True makes instances immutable after construction.
+    # use_attribute_docstrings=True tells pydantic to use the string literal
+    # that follows each field as that field's description, so those strings
+    # are data, not just comments.
+    model_config = ConfigDict(
+        use_attribute_docstrings=True,
+        frozen=True,
+    )
+    # Validated as strictly positive (gt=0); defaults to 1540.0.
+    speed_of_sound: float = Field(default=1540.0, gt=0)
+    """Speed of sound in m/s."""
+    # Validated as non-negative (ge=0); defaults to 0.0.
+    attenuation: float = Field(default=0.0, ge=0)
+    """Attenuation coefficient in dB/cm/MHz."""