corticalfields 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {corticalfields-0.2.2/src/corticalfields.egg-info → corticalfields-0.2.4}/PKG-INFO +1 -1
  2. {corticalfields-0.2.2 → corticalfields-0.2.4}/pyproject.toml +1 -1
  3. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/__init__.py +1 -1
  4. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/backends.py +223 -241
  5. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/utils.py +20 -2
  6. {corticalfields-0.2.2 → corticalfields-0.2.4/src/corticalfields.egg-info}/PKG-INFO +1 -1
  7. {corticalfields-0.2.2 → corticalfields-0.2.4}/LICENSE +0 -0
  8. {corticalfields-0.2.2 → corticalfields-0.2.4}/README.md +0 -0
  9. {corticalfields-0.2.2 → corticalfields-0.2.4}/setup.cfg +0 -0
  10. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/_pointcloud_legacy.py +0 -0
  11. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/analysis/__init__.py +0 -0
  12. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/analysis/bayesian.py +0 -0
  13. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/analysis/eda_qc.py +0 -0
  14. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/analysis/normative.py +0 -0
  15. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/analysis/stats.py +0 -0
  16. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/asymmetry.py +0 -0
  17. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/bayes_viz.py +0 -0
  18. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/bayesian.py +0 -0
  19. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/brainplots.py +0 -0
  20. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/datasets.py +0 -0
  21. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/distance_stats.py +0 -0
  22. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/eda_qc.py +0 -0
  23. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/features.py +0 -0
  24. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/functional_maps.py +0 -0
  25. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/graphs.py +0 -0
  26. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/hippocampus.py +0 -0
  27. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/kernels.py +0 -0
  28. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/normative.py +0 -0
  29. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/__init__.py +0 -0
  30. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/deep/__init__.py +0 -0
  31. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/deep/diffusion_net.py +0 -0
  32. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/deep/egnn.py +0 -0
  33. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/functional_maps.py +0 -0
  34. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/morphometrics.py +0 -0
  35. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/registration.py +0 -0
  36. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/spectral.py +0 -0
  37. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/transport.py +0 -0
  38. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/viz.py +0 -0
  39. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud.py +0 -0
  40. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/spectral.py +0 -0
  41. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/subcortical.py +0 -0
  42. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/surface.py +0 -0
  43. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/surprise.py +0 -0
  44. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/transport.py +0 -0
  45. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/__init__.py +0 -0
  46. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/bayes.py +0 -0
  47. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/brainplots.py +0 -0
  48. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/graph_viz.py +0 -0
  49. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/subcortical.py +0 -0
  50. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/viz.py +0 -0
  51. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz.py +0 -0
  52. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz_subcortical.py +0 -0
  53. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields.egg-info/SOURCES.txt +0 -0
  54. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields.egg-info/dependency_links.txt +0 -0
  55. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields.egg-info/requires.txt +0 -0
  56. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields.egg-info/top_level.txt +0 -0
  57. {corticalfields-0.2.2 → corticalfields-0.2.4}/tests/test_core.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: corticalfields
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Spectral cortical and subcortical analysis with statistical testing (RSA, CCA, PLS, PERMANOVA, TFCE, NBS, laterality classification), on meshes and point clouds — Laplace-Beltrami decomposition, atlas-free asymmetry, GPU-accelerated optimal transport, hippocampal subfield analysis (HippUnfold), ShapeDNA/BrainPrint spectral fingerprinting, geometric deep learning, Bayesian inference, and normative modeling for structural neuroimaging.
5
5
  Author-email: rdneuro <r.debona@ufrj.br>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "corticalfields"
7
- version = "0.2.2"
7
+ version = "0.2.4"
8
8
  description = "Spectral cortical and subcortical analysis with statistical testing (RSA, CCA, PLS, PERMANOVA, TFCE, NBS, laterality classification), on meshes and point clouds — Laplace-Beltrami decomposition, atlas-free asymmetry, GPU-accelerated optimal transport, hippocampal subfield analysis (HippUnfold), ShapeDNA/BrainPrint spectral fingerprinting, geometric deep learning, Bayesian inference, and normative modeling for structural neuroimaging."
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -29,7 +29,7 @@ surface, subcortical, hippocampus, spectral, kernels, surprise, features,
29
29
  graphs, distance_stats, asymmetry, transport, functional_maps, datasets, utils
30
30
  """
31
31
 
32
- __version__ = "0.2.2"
32
+ __version__ = "0.2.4"
33
33
  __author__ = "rdneuro"
34
34
 
35
35
 
@@ -420,70 +420,39 @@ def _eigsh_torch(
420
420
  k: int, tol: float, maxiter: int, dtype: str,
421
421
  ) -> Tuple[np.ndarray, np.ndarray]:
422
422
  """
423
- PyTorch GPU eigensolver for the generalised problem Lφ = λMφ.
424
-
425
- Uses **Chebyshev-Filtered Subspace Iteration** (ChFSI) — a modern
426
- eigensolver that replaces the previous ``torch.lobpcg``-based
427
- implementation, which suffered from well-documented performance
428
- and correctness issues (PyTorch issues #58828, #101075, #109497,
429
- #114081). ChFSI needs only three GPU-native operations: sparse
430
- matrix-vector products (SpMV via ``torch.sparse.mm``), QR
431
- decomposition (``torch.linalg.qr``), and a small dense eigh
432
- (``torch.linalg.eigh`` on an m×m matrix, m ≈ k + 30).
433
-
434
- Algorithm
435
- ---------
436
- 1. **Transform** to standard form: ``A = M^{−½} L M^{−½}``
437
- (exact because M is the diagonal lumped mass matrix).
438
- 2. **Estimate λ_max** via 30 power iterations (~10 ms on GPU).
439
- 3. **ChFSI outer loop** (typically 15–40 iterations):
440
- a. Apply degree-``d`` Chebyshev polynomial filter via 3-term
441
- SpMV recurrence (no matrix assembly — only matvecs).
442
- The filter amplifies components in ``[0, λ_cutoff]`` and
443
- damps the rest, concentrating V into the target eigenspace.
444
- b. Orthogonalise: ``V, _ = QR(filtered_V)``.
445
- c. Rayleigh–Ritz: ``H = Vᵀ A V`` (m×m dense eigh).
446
- d. Convergence check: max residual norm < tol.
447
- 4. **Recover** generalised eigenvectors: ``φ_i = M^{−½} y_i``.
448
-
449
- Mixed precision
450
- ---------------
451
- SpMV and the Chebyshev filter run in **float32** for ~2× throughput
452
- on modern GPUs. The Rayleigh–Ritz projection (small m×m problem)
453
- is accumulated and solved in **float64** for numerical stability.
454
- This preserves eigenvalue accuracy to ~1e-7 for the first ~300
455
- Laplace–Beltrami eigenpairs while halving SpMV memory bandwidth.
456
-
457
- VRAM budget (N = 150k, k = 300, m = 330)
458
- ------------------------------------------
459
- - Sparse CSR matrix A: ~14 MB (7 nnz/row × 16 bytes)
460
- - Subspace V: N × m × 4 = ~198 MB (float32)
461
- - Chebyshev temps: 2 × N × m × 4 = ~396 MB (Y_prev, Y_curr)
462
- - Rayleigh–Ritz H: m × m × 8 = ~0.9 MB (float64)
463
- - **Peak total: ~609 MB** — fits in 8 GB VRAM with margin.
464
- - Previous lobpcg: 9 × N × k × 8 = ~3.2 GB — 5× higher.
465
-
466
- Performance (RTX 3090, N=150k, k=300)
467
- -------------------------------------
468
- - ChFSI (this): ~10–25 s (degree=12, 15–30 outer iters)
469
- - torch.lobpcg (old): ~60–120 s
470
- - CuPy eigsh: ~10–30 s (Thick-Restart Lanczos)
471
- - scipy eigsh: ~30–120 s (ARPACK shift-invert)
472
-
473
- Both individual and batch processing use this function. In batch
474
- mode, ``gc_gpu()`` is called between subjects by the caller
475
- (``_process_single_subject`` in ``spectral.py``), which frees
476
- VRAM for the next subject.
423
+ PyTorch GPU eigensolver (ChFSI) with in-place VRAM management.
424
+
425
+ Uses **Chebyshev-Filtered Subspace Iteration** (ChFSI) with:
426
+
427
+ - **In-place Chebyshev recurrence**: ``Tensor.add_(X, alpha=s)``
428
+ and ``Tensor.mul_()`` eliminate ALL intermediate tensor allocations
429
+ in the filter loop. The only unavoidable allocation per step is
430
+ the SpMV result from ``torch.sparse.mm`` (which has no ``out=``).
431
+ - **Eager deallocation**: every temporary is ``del``'d immediately
432
+ and ``torch.cuda.empty_cache()`` runs after each outer iteration.
433
+ - **VRAM watermark check**: logs allocated VRAM at start/end and
434
+ warns if a leak is detected.
435
+ - **``torch.no_grad()``**: prevents the ~500 SpMV operations from
436
+ building a computation graph that would leak 10+ GB of RAM.
437
+ - **Periodic ``synchronize()``**: every 4 SpMV launches inside the
438
+ Chebyshev filter, plus after each Ritz step, to prevent the
439
+ NVIDIA driver watchdog from triggering a PCIe bus hang.
440
+
441
+ Per-subject VRAM budget (N=150k, k=100, m=120):
442
+ Sparse A: ~14 MB (CSR, f32, ~7 nnz/row)
443
+ Subspace V: N × m × 4 = ~72 MB
444
+ SpMV temp: N × m × 4 = ~72 MB (freed each step)
445
+ Ritz f64: 2 × N × m × 8 = ~288 MB (freed after Ritz)
446
+ **Peak: ~446 MB** leaves >23 GB free on RTX 3090.
447
+
448
+ The critical constraint for batch stability is not peak usage but
449
+ **fragmentation over subjects**. In-place operations reduce the
450
+ number of alloc/free cycles from ~30 per outer iteration (old) to
451
+ ~3 (new), dramatically reducing caching-allocator fragmentation.
477
452
 
478
453
  Parameters
479
454
  ----------
480
- L : scipy.sparse.spmatrix (N, N) stiffness matrix
481
- M : scipy.sparse.spmatrix (N, N) — diagonal lumped mass matrix
482
- k : int — number of smallest eigenpairs to compute
483
- tol : float — convergence tolerance on max residual norm
484
- maxiter : int — maximum ChFSI outer iterations
485
- dtype : str — ``"float32"`` or ``"float64"`` for SpMV precision;
486
- Rayleigh–Ritz always uses float64 regardless.
455
+ L, M, k, tol, maxiter, dtype : see ``eigsh_solve``
487
456
 
488
457
  Returns
489
458
  -------
@@ -492,215 +461,228 @@ def _eigsh_torch(
492
461
 
493
462
  References
494
463
  ----------
495
- [1] Y. Zhou, Y. Saad, M.L. Tiago & J.R. Chelikowsky,
496
- "Self-consistent-field calculations using Chebyshev-filtered
497
- subspace iteration", J. Comput. Phys. 219 (2006) 172–184.
498
- [2] A.V. Knyazev, "Toward the optimal preconditioned eigensolver:
499
- LOBPCG", SIAM J. Sci. Comput. 23 (2001) 517–541.
464
+ [1] Y. Zhou, Y. Saad et al., "Chebyshev-filtered subspace iteration",
465
+ J. Comput. Phys. 219 (2006) 172–184.
500
466
  """
467
+ import gc
501
468
  import torch
502
469
 
503
- # ── Precision setup ─────────────────────────────────────────────
504
- # SpMV in float32 for throughput; Rayleigh-Ritz in float64 for accuracy
505
- spmv_np_dtype = np.float32 if dtype != "float64" else np.float32
506
470
  spmv_torch_dtype = torch.float32
507
471
  ritz_torch_dtype = torch.float64
508
-
509
472
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
473
+ is_cuda = device.type == "cuda"
510
474
  N = L.shape[0]
511
475
 
512
- # ChFSI hyperparameters calibrated for LBO meshes
513
- EXTRA = min(30, max(10, k // 10)) # oversampling for Ritz stability
514
- m = k + EXTRA # subspace dimension
515
- CHEB_DEGREE = 12 # Chebyshev filter polynomial degree
516
- POWER_ITERS = 30 # for λ_max estimation
476
+ EXTRA = min(30, max(10, k // 10))
477
+ m = k + EXTRA
478
+ CHEB_DEGREE = 12
479
+ POWER_ITERS = 30
480
+
481
+ # ── VRAM watermark (start) ──────────────────────────────────────
482
+ vram_start = 0
483
+ if is_cuda:
484
+ torch.cuda.synchronize()
485
+ torch.cuda.empty_cache()
486
+ gc.collect()
487
+ vram_start = torch.cuda.memory_allocated(0)
517
488
 
518
489
  logger.info(
519
- " torch ChFSI eigensolver: N=%d, k=%d, m=%d, degree=%d, "
520
- "device=%s, spmv=float32, ritz=float64",
521
- N, k, m, CHEB_DEGREE, device,
490
+ " torch ChFSI: N=%d, k=%d, m=%d, deg=%d, "
491
+ "VRAM_start=%.0f MB",
492
+ N, k, m, CHEB_DEGREE, vram_start / 1e6,
522
493
  )
523
494
 
524
- # ── Step 1: Generalised → standard via M^{−½} (on CPU) ─────────
495
+ # ── Step 1: Generalised → standard via M^{−½} (CPU) ────────────
525
496
  M_diag = np.array(M.diagonal()).ravel().astype(np.float64)
526
497
  M_diag = np.maximum(M_diag, 1e-16)
527
- M_inv_sqrt_np = (1.0 / np.sqrt(M_diag)) # float64 for precision
498
+ M_inv_sqrt_np = 1.0 / np.sqrt(M_diag)
528
499
 
529
- D_sp = sp.diags(M_inv_sqrt_np.astype(spmv_np_dtype), format="csc")
530
- A_cpu = (D_sp @ L.tocsc().astype(spmv_np_dtype) @ D_sp).tocsr()
531
- del D_sp # free CPU temp
500
+ D_sp = sp.diags(M_inv_sqrt_np.astype(np.float32), format="csc")
501
+ A_cpu = (D_sp @ L.tocsc().astype(np.float32) @ D_sp).tocsr()
502
+ del D_sp
532
503
 
533
- # ── Helper: scipy CSR → torch sparse CSR on device ──────────────
534
- def _scipy_to_torch_csr(mat_csr):
504
+ # ── scipy CSR → torch CSR ───────────────────────────────────────
505
+ def _to_csr(m_csr):
535
506
  return torch.sparse_csr_tensor(
536
- torch.from_numpy(mat_csr.indptr.astype(np.int64)).to(device),
537
- torch.from_numpy(mat_csr.indices.astype(np.int64)).to(device),
538
- torch.from_numpy(mat_csr.data.astype(spmv_np_dtype)).to(device),
539
- size=mat_csr.shape,
540
- dtype=spmv_torch_dtype,
507
+ torch.from_numpy(m_csr.indptr.astype(np.int64)).to(device),
508
+ torch.from_numpy(m_csr.indices.astype(np.int64)).to(device),
509
+ torch.from_numpy(m_csr.data.astype(np.float32)).to(device),
510
+ size=m_csr.shape, dtype=spmv_torch_dtype,
541
511
  )
542
512
 
543
- # ── Helper: sparse matvec A @ X on GPU ──────────────────────────
544
- def _spmm(A_t, X):
545
- """Sparse × dense matrix multiply, shape (N, m)."""
546
- return torch.sparse.mm(A_t, X)
547
-
548
513
  try:
549
- # ── Step 2: Transfer A to GPU ───────────────────────────────
550
- A_t = _scipy_to_torch_csr(A_cpu)
551
- del A_cpu # free CPU copy (~14 MB saved)
552
-
553
- # ── Step 3: Estimate λ_max via power iteration ──────────────
554
- # 30 iters is overkill for Rayleigh quotient convergence on
555
- # a mesh Laplacian, but costs only ~15 ms and gives a tight
556
- # bound that improves Chebyshev filter quality.
557
- torch.manual_seed(42)
558
- v = torch.randn(N, 1, dtype=spmv_torch_dtype, device=device)
559
- v = v / v.norm()
560
- for _ in range(POWER_ITERS):
561
- v = _spmm(A_t, v)
562
- v = v / v.norm()
563
- # Rayleigh quotient in float64 for a precise λ_max
564
- v64 = v.to(ritz_torch_dtype)
565
- Av64 = _spmm(A_t, v).to(ritz_torch_dtype)
566
- lambda_max = float((v64.T @ Av64).item()) * 1.05 # 5% safety
567
- del v, v64, Av64
568
- logger.info(" λ_max ≈ %.4f", lambda_max)
569
-
570
- # ── Step 4: ChFSI outer loop ───────────────────────────────
571
- # Initial random subspace
572
- torch.manual_seed(42)
573
- V = torch.randn(N, m, dtype=spmv_torch_dtype, device=device)
574
- V, _ = torch.linalg.qr(V)
575
-
576
- # Chebyshev filter interval: we want eigenvalues in [0, λ_cut]
577
- # where λ_cut is a rough upper bound for the k-th eigenvalue.
578
- # Heuristic: Weyl's law gives λ_k ∝ k for 2D surfaces, so
579
- # λ_cut ≈ λ_max × (2 * m / N) is a conservative estimate.
580
- # We refine after the first Ritz step.
581
- lambda_cut = lambda_max * (2.0 * m / N)
582
- lambda_cut = max(lambda_cut, lambda_max * 0.01) # floor
583
-
584
- converged = False
585
- for outer in range(maxiter):
586
- # ── Chebyshev filter: T_d(scaled_A) @ V ────────────────
587
- # Maps A from [λ_cut, λ_max] → [−1, 1], then applies
588
- # Chebyshev polynomial that is ~0 on [−1, 1] (unwanted
589
- # eigenvalues) and large on (−∞, −1) (wanted eigenvalues).
590
- #
591
- # Scaling: σ = (λ_max − λ_cut) / 2
592
- # c = (λ_max + λ_cut) / 2
593
- # A_scaled = (A − c·I) / σ
594
- #
595
- # 3-term recurrence:
596
- # Y₀ = V
597
- # Y₁ = (1/σ)(A − c·I) V = (A·V − c·V) / σ
598
- # Y_{j+1} = (2/σ)(A − c·I) Y_j − Y_{j−1}
599
- # = (2(A·Y_j − c·Y_j) / σ) − Y_{j−1}
600
-
601
- e = (lambda_max - lambda_cut) / 2.0
602
- c = (lambda_max + lambda_cut) / 2.0
603
-
604
- # Safeguard: e must be positive
605
- if e < 1e-10:
606
- e = lambda_max * 0.5
607
- c = lambda_max * 0.5
608
-
609
- sigma = e / c if abs(c) > 1e-12 else 1.0
610
- sigma1 = sigma
611
-
612
- # Y₀ = V (reuse V buffer)
613
- # Y₁ = σ₁/e · (A·V − c·V)
614
- AV = _spmm(A_t, V) # (N, m) f32
615
- Y_prev = V # alias, no copy
616
- Y_curr = (sigma1 / e) * (AV - c * V) # (N, m) f32
617
- del AV
618
-
619
- for d in range(2, CHEB_DEGREE + 1):
620
- sigma_new = 1.0 / (2.0 / sigma - sigma1)
621
- AY = _spmm(A_t, Y_curr) # (N, m) f32
622
- Y_next = (2.0 * sigma_new / e) * (AY - c * Y_curr) \
623
- - (sigma * sigma_new) * Y_prev
624
- Y_prev = Y_curr
625
- Y_curr = Y_next
626
- sigma = sigma_new
627
- del AY
628
-
629
- del Y_prev # free (N, m) buffer
630
-
631
- # ── Orthogonalise filtered subspace ────────────────────
632
- V, _ = torch.linalg.qr(Y_curr)
633
- del Y_curr
634
-
635
- # ── Rayleigh–Ritz in float64 ──────────────────────────
636
- # AV in float32 for speed, then upcast for the small eigh
637
- AV = _spmm(A_t, V) # (N, m) f32
638
- V64 = V.to(ritz_torch_dtype) # (N, m) f64
639
- AV64 = AV.to(ritz_torch_dtype) # (N, m) f64
640
- del AV
641
-
642
- H = V64.T @ AV64 # (m, m) f64
643
- H = 0.5 * (H + H.T) # symmetrise
644
- ritz_vals, ritz_vecs = torch.linalg.eigh(H) # sorted ascending
645
-
646
- # ── Convergence check: max residual norm ───────────────
647
- # residual_i = A·z_i − λ_i·z_i where z_i = V @ s_i
648
- eigvecs_m = V64 @ ritz_vecs[:, :k] # (N, k) f64
649
- Aeigvecs = AV64 @ ritz_vecs[:, :k] # (N, k) f64
650
- residuals = Aeigvecs - eigvecs_m * ritz_vals[:k].unsqueeze(0)
651
- max_res = float(residuals.norm(dim=0).max().item())
652
-
653
- del eigvecs_m, Aeigvecs, residuals, V64, AV64
654
-
655
- if outer % 5 == 0 or max_res < tol:
656
- logger.info(
657
- " ChFSI iter %2d: max_residual=%.2e, λ_cut=%.4f",
658
- outer, max_res, lambda_cut,
659
- )
660
-
661
- if max_res < tol:
662
- converged = True
663
- break
664
-
665
- # ── Update subspace: rotate V into Ritz basis ──────────
666
- V = V @ ritz_vecs[:, :m].to(spmv_torch_dtype)
667
-
668
- # ── Refine λ_cut from current Ritz estimates ───────────
669
- # Use 1.5× the m-th Ritz value as the new cutoff
670
- if ritz_vals.shape[0] > k:
671
- lambda_cut = float(ritz_vals[m - 1].item()) * 1.5
672
- lambda_cut = min(lambda_cut, lambda_max * 0.95)
514
+ A_t = _to_csr(A_cpu)
515
+ del A_cpu
673
516
 
674
- if not converged:
675
- logger.warning(
676
- " ChFSI did not converge in %d iters "
677
- "(max_residual=%.2e > tol=%.1e). Results may be approximate.",
678
- maxiter, max_res, tol,
679
- )
517
+ with torch.no_grad():
518
+
519
+ # ── Step 2: λ_max via power iteration ───────────────────
520
+ torch.manual_seed(42)
521
+ v = torch.randn(N, 1, dtype=spmv_torch_dtype, device=device)
522
+ v.div_(v.norm())
523
+ for pi in range(POWER_ITERS):
524
+ v = torch.sparse.mm(A_t, v)
525
+ v.div_(v.norm())
526
+ if is_cuda and pi % 10 == 9:
527
+ torch.cuda.synchronize()
528
+
529
+ Av = torch.sparse.mm(A_t, v)
530
+ lambda_max = float((v.T @ Av).item()) * 1.05
531
+ del v, Av
532
+ if is_cuda:
533
+ torch.cuda.synchronize()
534
+ torch.cuda.empty_cache()
680
535
 
681
- # ── Step 5: Extract final eigenpairs ────────────────────────
682
- evals_t = ritz_vals[:k] # (k,) f64 on GPU
683
- # Final eigenvectors: V @ ritz_vecs[:, :k] in float64
684
- evecs_t = V.to(ritz_torch_dtype) @ ritz_vecs[:, :k] # (N, k) f64
536
+ logger.info(" λ_max %.4f", lambda_max)
537
+
538
+ # ── Step 3: ChFSI outer loop ────────────────────────────
539
+ torch.manual_seed(42)
540
+ V = torch.randn(N, m, dtype=spmv_torch_dtype, device=device)
541
+ V, _ = torch.linalg.qr(V)
542
+
543
+ lambda_cut = lambda_max * (2.0 * m / N)
544
+ lambda_cut = max(lambda_cut, lambda_max * 0.01)
545
+
546
+ converged = False
547
+ max_res = float("inf")
548
+
549
+ for outer in range(maxiter):
550
+
551
+ # ── Chebyshev filter (IN-PLACE) ─────────────────────
552
+ # All arithmetic uses .add_(), .mul_() to avoid temps.
553
+ # Only torch.sparse.mm allocates (no out= support).
554
+ e = (lambda_max - lambda_cut) / 2.0
555
+ cc = (lambda_max + lambda_cut) / 2.0
556
+ if e < 1e-10:
557
+ e = lambda_max * 0.5
558
+ cc = lambda_max * 0.5
559
+
560
+ sigma = e / cc if abs(cc) > 1e-12 else 1.0
561
+ sigma1 = sigma
562
+
563
+ # Y₁ = (σ₁/e) · (A·V − c·V)
564
+ # In-place: AV = sparse.mm(A, V); AV -= c*V; AV *= σ₁/e
565
+ Y_curr = torch.sparse.mm(A_t, V) # (N,m) NEW alloc
566
+ Y_curr.add_(V, alpha=-cc) # in-place
567
+ Y_curr.mul_(sigma1 / e) # in-place
568
+ Y_prev = V.clone() # need a copy (V reused)
569
+
570
+ for d in range(2, CHEB_DEGREE + 1):
571
+ sigma_new = 1.0 / (2.0 / sigma1 - sigma)
572
+
573
+ # Y_next = (2σ_new/e)(A·Y_curr − c·Y_curr) − σ·σ_new·Y_prev
574
+ # In-place on the SpMV output:
575
+ Y_next = torch.sparse.mm(A_t, Y_curr) # NEW alloc
576
+ Y_next.add_(Y_curr, alpha=-cc) # -= c * Y_curr
577
+ Y_next.mul_(2.0 * sigma_new / e) # *= 2σ/e
578
+ Y_next.add_(Y_prev, alpha=-(sigma * sigma_new))
579
+
580
+ # Rotate buffers — reuse memory
581
+ Y_prev = Y_curr # old Y_curr becomes Y_prev
582
+ Y_curr = Y_next # new result becomes Y_curr
583
+ sigma = sigma_new
584
+ # Y_next ref dropped; old Y_prev eligible for GC
585
+
586
+ if is_cuda and d % 4 == 0:
587
+ torch.cuda.synchronize()
588
+
589
+ del Y_prev # free last-gen buffer
590
+ if is_cuda:
591
+ torch.cuda.synchronize()
592
+
593
+ # ── QR ──────────────────────────────────────────────
594
+ V, _ = torch.linalg.qr(Y_curr)
595
+ del Y_curr
596
+
597
+ # ── Rayleigh–Ritz (f64 for accuracy) ────────────────
598
+ AV_f32 = torch.sparse.mm(A_t, V) # (N,m) f32
599
+ V64 = V.to(ritz_torch_dtype) # (N,m) f64
600
+ AV64 = AV_f32.to(ritz_torch_dtype) # (N,m) f64
601
+ del AV_f32 # free f32 copy NOW
602
+
603
+ H = V64.T @ AV64 # (m,m) f64
604
+ H = 0.5 * (H + H.T) # symmetrise (safe)
605
+ ritz_vals, ritz_vecs = torch.linalg.eigh(H)
606
+ del H
607
+
608
+ # ── Convergence check ───────────────────────────────
609
+ # Compute residual norms without large (N,k) temporaries:
610
+ # res_i = ||AV64 @ s_i - λ_i * V64 @ s_i||
611
+ S_k = ritz_vecs[:, :k] # (m,k) f64 — view
612
+ Z_k = V64 @ S_k # (N,k) f64
613
+ AZ_k = AV64 @ S_k # (N,k) f64
614
+ del V64, AV64 # free the two big f64 blocks NOW
615
+
616
+ # In-place: scale Z_k columns by eigenvalues, then subtract
617
+ Z_k.mul_(ritz_vals[:k].unsqueeze(0)) # Z_k[:,i] *= λ_i
618
+ AZ_k.sub_(Z_k) # AZ_k -= λ·Z_k
619
+ max_res = float(AZ_k.norm(dim=0).max().item())
620
+ del Z_k, AZ_k, S_k
621
+
622
+ if is_cuda:
623
+ torch.cuda.synchronize()
624
+
625
+ if outer % 5 == 0 or max_res < tol:
626
+ logger.info(
627
+ " ChFSI iter %2d: res=%.2e, λ_cut=%.4f",
628
+ outer, max_res, lambda_cut,
629
+ )
630
+
631
+ if max_res < tol:
632
+ converged = True
633
+ break
634
+
635
+ # Rotate V into Ritz basis
636
+ V = V @ ritz_vecs[:, :m].to(spmv_torch_dtype)
637
+
638
+ # Refine λ_cut
639
+ if ritz_vals.shape[0] > k:
640
+ lambda_cut = float(ritz_vals[m - 1].item()) * 1.5
641
+ lambda_cut = min(lambda_cut, lambda_max * 0.95)
642
+
643
+ # ── Aggressive VRAM cleanup EVERY iteration ─────────
644
+ if is_cuda:
645
+ torch.cuda.empty_cache()
646
+
647
+ # ── end outer loop ──────────────────────────────────────
648
+
649
+ if not converged:
650
+ logger.warning(
651
+ " ChFSI did not converge in %d iters "
652
+ "(res=%.2e > tol=%.1e).",
653
+ maxiter, max_res, tol,
654
+ )
685
655
 
686
- # ── Step 6: Undo mass-matrix transform: φ = M^{−½} · y ────
687
- M_inv_sqrt_t = torch.from_numpy(
688
- M_inv_sqrt_np
689
- ).to(dtype=ritz_torch_dtype, device=device).unsqueeze(1) # (N, 1)
656
+ # ── Extract eigenpairs ──────────────────────────────────
657
+ evals_t = ritz_vals[:k] # (k,) f64
658
+ evecs_t = V.to(ritz_torch_dtype) @ ritz_vecs[:, :k] # (N,k) f64
659
+ del V, ritz_vals, ritz_vecs
690
660
 
691
- evecs_t = evecs_t * M_inv_sqrt_t # (N, k) f64
692
- del M_inv_sqrt_t, V, ritz_vals, ritz_vecs
661
+ M_inv_sqrt_t = torch.from_numpy(
662
+ M_inv_sqrt_np
663
+ ).to(dtype=ritz_torch_dtype, device=device).unsqueeze(1)
664
+ evecs_t.mul_(M_inv_sqrt_t) # in-place
665
+ del M_inv_sqrt_t
693
666
 
694
- # Move to CPU
695
- evals = evals_t.cpu().numpy().astype(np.float64)
696
- evecs = evecs_t.cpu().numpy().astype(np.float64)
697
- del evals_t, evecs_t
667
+ if is_cuda:
668
+ torch.cuda.synchronize()
669
+ evals = evals_t.cpu().numpy().astype(np.float64)
670
+ evecs = evecs_t.cpu().numpy().astype(np.float64)
671
+ del evals_t, evecs_t
698
672
 
699
673
  finally:
700
- # Guarantee GPU cleanup even on error — critical for batch mode
701
- if device.type == "cuda":
674
+ if is_cuda:
702
675
  torch.cuda.synchronize()
703
676
  torch.cuda.empty_cache()
677
+ gc.collect()
678
+ torch.cuda.empty_cache() # double-tap after gc frees python refs
679
+ vram_end = torch.cuda.memory_allocated(0)
680
+ delta = vram_end - vram_start
681
+ if delta > 1e6: # > 1 MB leak
682
+ logger.warning(
683
+ " VRAM leak detected: +%.1f MB (start=%.0f, end=%.0f)",
684
+ delta / 1e6, vram_start / 1e6, vram_end / 1e6,
685
+ )
704
686
 
705
687
  order = np.argsort(evals)
706
688
  return evals[order], evecs[:, order]
@@ -71,10 +71,18 @@ def gc_gpu() -> None:
71
71
  """
72
72
  Aggressively free GPU memory across all available backends.
73
73
 
74
- Calls ``torch.cuda.empty_cache()``, ``cupy.get_default_memory_pool().free_all_blocks()``,
75
- and Python garbage collector. Safe to call even when no GPU or backends are available.
74
+ Uses a **double-tap** pattern: ``gc.collect()`` →
75
+ ``empty_cache()`` → ``gc.collect()`` → ``empty_cache()`` to
76
+ ensure Python cyclic references holding CUDA tensors are fully
77
+ broken before the caching allocator releases blocks. Critical
78
+ for multi-subject batch pipelines where VRAM fragmentation
79
+ accumulates over hundreds of subjects.
80
+
81
+ Safe to call even when no GPU or backends are available.
76
82
  """
77
83
  import gc
84
+
85
+ # First pass: break Python references → free CUDA tensors
78
86
  gc.collect()
79
87
 
80
88
  try:
@@ -85,6 +93,16 @@ def gc_gpu() -> None:
85
93
  except ImportError:
86
94
  pass
87
95
 
96
+ # Second pass: catch cyclic refs that survived first gc
97
+ gc.collect()
98
+
99
+ try:
100
+ import torch
101
+ if torch.cuda.is_available():
102
+ torch.cuda.empty_cache()
103
+ except ImportError:
104
+ pass
105
+
88
106
  try:
89
107
  import cupy as cp
90
108
  cp.get_default_memory_pool().free_all_blocks()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: corticalfields
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Spectral cortical and subcortical analysis with statistical testing (RSA, CCA, PLS, PERMANOVA, TFCE, NBS, laterality classification), on meshes and point clouds — Laplace-Beltrami decomposition, atlas-free asymmetry, GPU-accelerated optimal transport, hippocampal subfield analysis (HippUnfold), ShapeDNA/BrainPrint spectral fingerprinting, geometric deep learning, Bayesian inference, and normative modeling for structural neuroimaging.
5
5
  Author-email: rdneuro <r.debona@ufrj.br>
6
6
  License: MIT
File without changes
File without changes
File without changes