corticalfields 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {corticalfields-0.2.2/src/corticalfields.egg-info → corticalfields-0.2.4}/PKG-INFO +1 -1
  2. {corticalfields-0.2.2 → corticalfields-0.2.4}/pyproject.toml +1 -1
  3. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/__init__.py +1 -1
  4. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/backends.py +223 -241
  5. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/utils.py +20 -2
  6. {corticalfields-0.2.2 → corticalfields-0.2.4/src/corticalfields.egg-info}/PKG-INFO +1 -1
  7. {corticalfields-0.2.2 → corticalfields-0.2.4}/LICENSE +0 -0
  8. {corticalfields-0.2.2 → corticalfields-0.2.4}/README.md +0 -0
  9. {corticalfields-0.2.2 → corticalfields-0.2.4}/setup.cfg +0 -0
  10. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/_pointcloud_legacy.py +0 -0
  11. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/analysis/__init__.py +0 -0
  12. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/analysis/bayesian.py +0 -0
  13. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/analysis/eda_qc.py +0 -0
  14. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/analysis/normative.py +0 -0
  15. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/analysis/stats.py +0 -0
  16. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/asymmetry.py +0 -0
  17. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/bayes_viz.py +0 -0
  18. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/bayesian.py +0 -0
  19. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/brainplots.py +0 -0
  20. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/datasets.py +0 -0
  21. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/distance_stats.py +0 -0
  22. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/eda_qc.py +0 -0
  23. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/features.py +0 -0
  24. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/functional_maps.py +0 -0
  25. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/graphs.py +0 -0
  26. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/hippocampus.py +0 -0
  27. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/kernels.py +0 -0
  28. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/normative.py +0 -0
  29. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/__init__.py +0 -0
  30. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/deep/__init__.py +0 -0
  31. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/deep/diffusion_net.py +0 -0
  32. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/deep/egnn.py +0 -0
  33. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/functional_maps.py +0 -0
  34. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/morphometrics.py +0 -0
  35. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/registration.py +0 -0
  36. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/spectral.py +0 -0
  37. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/transport.py +0 -0
  38. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud/viz.py +0 -0
  39. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/pointcloud.py +0 -0
  40. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/spectral.py +0 -0
  41. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/subcortical.py +0 -0
  42. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/surface.py +0 -0
  43. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/surprise.py +0 -0
  44. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/transport.py +0 -0
  45. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/__init__.py +0 -0
  46. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/bayes.py +0 -0
  47. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/brainplots.py +0 -0
  48. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/graph_viz.py +0 -0
  49. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/subcortical.py +0 -0
  50. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz/viz.py +0 -0
  51. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz.py +0 -0
  52. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields/viz_subcortical.py +0 -0
  53. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields.egg-info/SOURCES.txt +0 -0
  54. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields.egg-info/dependency_links.txt +0 -0
  55. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields.egg-info/requires.txt +0 -0
  56. {corticalfields-0.2.2 → corticalfields-0.2.4}/src/corticalfields.egg-info/top_level.txt +0 -0
  57. {corticalfields-0.2.2 → corticalfields-0.2.4}/tests/test_core.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: corticalfields
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Spectral cortical and subcortical analysis with statistical testing (RSA, CCA, PLS, PERMANOVA, TFCE, NBS, laterality classification), on meshes and point clouds — Laplace-Beltrami decomposition, atlas-free asymmetry, GPU-accelerated optimal transport, hippocampal subfield analysis (HippUnfold), ShapeDNA/BrainPrint spectral fingerprinting, geometric deep learning, Bayesian inference, and normative modeling for structural neuroimaging.
5
5
  Author-email: rdneuro <r.debona@ufrj.br>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "corticalfields"
7
- version = "0.2.2"
7
+ version = "0.2.4"
8
8
  description = "Spectral cortical and subcortical analysis with statistical testing (RSA, CCA, PLS, PERMANOVA, TFCE, NBS, laterality classification), on meshes and point clouds — Laplace-Beltrami decomposition, atlas-free asymmetry, GPU-accelerated optimal transport, hippocampal subfield analysis (HippUnfold), ShapeDNA/BrainPrint spectral fingerprinting, geometric deep learning, Bayesian inference, and normative modeling for structural neuroimaging."
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -29,7 +29,7 @@ surface, subcortical, hippocampus, spectral, kernels, surprise, features,
29
29
  graphs, distance_stats, asymmetry, transport, functional_maps, datasets, utils
30
30
  """
31
31
 
32
- __version__ = "0.2.2"
32
+ __version__ = "0.2.4"
33
33
  __author__ = "rdneuro"
34
34
 
35
35
 
@@ -420,70 +420,39 @@ def _eigsh_torch(
420
420
  k: int, tol: float, maxiter: int, dtype: str,
421
421
  ) -> Tuple[np.ndarray, np.ndarray]:
422
422
  """
423
- PyTorch GPU eigensolver for the generalised problem Lφ = λMφ.
424
-
425
- Uses **Chebyshev-Filtered Subspace Iteration** (ChFSI) — a modern
426
- eigensolver that replaces the previous ``torch.lobpcg``-based
427
- implementation, which suffered from well-documented performance
428
- and correctness issues (PyTorch issues #58828, #101075, #109497,
429
- #114081). ChFSI needs only three GPU-native operations: sparse
430
- matrix-vector products (SpMV via ``torch.sparse.mm``), QR
431
- decomposition (``torch.linalg.qr``), and a small dense eigh
432
- (``torch.linalg.eigh`` on an m×m matrix, m ≈ k + 30).
433
-
434
- Algorithm
435
- ---------
436
- 1. **Transform** to standard form: ``A = M^{−½} L M^{−½}``
437
- (exact because M is the diagonal lumped mass matrix).
438
- 2. **Estimate λ_max** via 30 power iterations (~10 ms on GPU).
439
- 3. **ChFSI outer loop** (typically 15–40 iterations):
440
- a. Apply degree-``d`` Chebyshev polynomial filter via 3-term
441
- SpMV recurrence (no matrix assembly — only matvecs).
442
- The filter amplifies components in ``[0, λ_cutoff]`` and
443
- damps the rest, concentrating V into the target eigenspace.
444
- b. Orthogonalise: ``V, _ = QR(filtered_V)``.
445
- c. Rayleigh–Ritz: ``H = Vᵀ A V`` (m×m dense eigh).
446
- d. Convergence check: max residual norm < tol.
447
- 4. **Recover** generalised eigenvectors: ``φ_i = M^{−½} y_i``.
448
-
449
- Mixed precision
450
- ---------------
451
- SpMV and the Chebyshev filter run in **float32** for ~2× throughput
452
- on modern GPUs. The Rayleigh–Ritz projection (small m×m problem)
453
- is accumulated and solved in **float64** for numerical stability.
454
- This preserves eigenvalue accuracy to ~1e-7 for the first ~300
455
- Laplace–Beltrami eigenpairs while halving SpMV memory bandwidth.
456
-
457
- VRAM budget (N = 150k, k = 300, m = 330)
458
- ------------------------------------------
459
- - Sparse CSR matrix A: ~14 MB (7 nnz/row × 16 bytes)
460
- - Subspace V: N × m × 4 = ~198 MB (float32)
461
- - Chebyshev temps: 2 × N × m × 4 = ~396 MB (Y_prev, Y_curr)
462
- - Rayleigh–Ritz H: m × m × 8 = ~0.9 MB (float64)
463
- - **Peak total: ~609 MB** — fits in 8 GB VRAM with margin.
464
- - Previous lobpcg: 9 × N × k × 8 = ~3.2 GB — 5× higher.
465
-
466
- Performance (RTX 3090, N=150k, k=300)
467
- -------------------------------------
468
- - ChFSI (this): ~10–25 s (degree=12, 15–30 outer iters)
469
- - torch.lobpcg (old): ~60–120 s
470
- - CuPy eigsh: ~10–30 s (Thick-Restart Lanczos)
471
- - scipy eigsh: ~30–120 s (ARPACK shift-invert)
472
-
473
- Both individual and batch processing use this function. In batch
474
- mode, ``gc_gpu()`` is called between subjects by the caller
475
- (``_process_single_subject`` in ``spectral.py``), which frees
476
- VRAM for the next subject.
423
+ PyTorch GPU eigensolver (ChFSI) with in-place VRAM management.
424
+
425
+ Uses **Chebyshev-Filtered Subspace Iteration** (ChFSI) with:
426
+
427
+ - **In-place Chebyshev recurrence**: ``Tensor.add_(X, alpha=s)``
428
+ and ``Tensor.mul_()`` eliminate ALL intermediate tensor allocations
429
+ in the filter loop. The only unavoidable allocation per step is
430
+ the SpMV result from ``torch.sparse.mm`` (which has no ``out=``).
431
+ - **Eager deallocation**: every temporary is ``del``'d immediately
432
+ and ``torch.cuda.empty_cache()`` runs after each outer iteration.
433
+ - **VRAM watermark check**: logs allocated VRAM at start/end and
434
+ warns if a leak is detected.
435
+ - **``torch.no_grad()``**: prevents the ~500 SpMV operations from
436
+ building a computation graph that would leak 10+ GB of RAM.
437
+ - **Periodic ``synchronize()``**: every 4 SpMV launches inside the
438
+ Chebyshev filter, plus after each Ritz step, to prevent the
439
+ NVIDIA driver watchdog from triggering a PCIe bus hang.
440
+
441
+ Per-subject VRAM budget (N=150k, k=100, m=120):
442
+ Sparse A: ~14 MB (CSR, f32, ~7 nnz/row)
443
+ Subspace V: N × m × 4 = ~72 MB
444
+ SpMV temp: N × m × 4 = ~72 MB (freed each step)
445
+ Ritz f64: 2 × N × m × 8 = ~288 MB (freed after Ritz)
446
+ **Peak: ~446 MB** leaves >23 GB free on RTX 3090.
447
+
448
+ The critical constraint for batch stability is not peak usage but
449
+ **fragmentation over subjects**. In-place operations reduce the
450
+ number of alloc/free cycles from ~30 per outer iteration (old) to
451
+ ~3 (new), dramatically reducing caching-allocator fragmentation.
477
452
 
478
453
  Parameters
479
454
  ----------
480
- L : scipy.sparse.spmatrix (N, N) stiffness matrix
481
- M : scipy.sparse.spmatrix (N, N) — diagonal lumped mass matrix
482
- k : int — number of smallest eigenpairs to compute
483
- tol : float — convergence tolerance on max residual norm
484
- maxiter : int — maximum ChFSI outer iterations
485
- dtype : str — ``"float32"`` or ``"float64"`` for SpMV precision;
486
- Rayleigh–Ritz always uses float64 regardless.
455
+ L, M, k, tol, maxiter, dtype : see ``eigsh_solve``
487
456
 
488
457
  Returns
489
458
  -------
@@ -492,215 +461,228 @@ def _eigsh_torch(
492
461
 
493
462
  References
494
463
  ----------
495
- [1] Y. Zhou, Y. Saad, M.L. Tiago & J.R. Chelikowsky,
496
- "Self-consistent-field calculations using Chebyshev-filtered
497
- subspace iteration", J. Comput. Phys. 219 (2006) 172–184.
498
- [2] A.V. Knyazev, "Toward the optimal preconditioned eigensolver:
499
- LOBPCG", SIAM J. Sci. Comput. 23 (2001) 517–541.
464
+ [1] Y. Zhou, Y. Saad et al., "Chebyshev-filtered subspace iteration",
465
+ J. Comput. Phys. 219 (2006) 172–184.
500
466
  """
467
+ import gc
501
468
  import torch
502
469
 
503
- # ── Precision setup ─────────────────────────────────────────────
504
- # SpMV in float32 for throughput; Rayleigh-Ritz in float64 for accuracy
505
- spmv_np_dtype = np.float32 if dtype != "float64" else np.float32
506
470
  spmv_torch_dtype = torch.float32
507
471
  ritz_torch_dtype = torch.float64
508
-
509
472
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
473
+ is_cuda = device.type == "cuda"
510
474
  N = L.shape[0]
511
475
 
512
- # ChFSI hyperparameters calibrated for LBO meshes
513
- EXTRA = min(30, max(10, k // 10)) # oversampling for Ritz stability
514
- m = k + EXTRA # subspace dimension
515
- CHEB_DEGREE = 12 # Chebyshev filter polynomial degree
516
- POWER_ITERS = 30 # for λ_max estimation
476
+ EXTRA = min(30, max(10, k // 10))
477
+ m = k + EXTRA
478
+ CHEB_DEGREE = 12
479
+ POWER_ITERS = 30
480
+
481
+ # ── VRAM watermark (start) ──────────────────────────────────────
482
+ vram_start = 0
483
+ if is_cuda:
484
+ torch.cuda.synchronize()
485
+ torch.cuda.empty_cache()
486
+ gc.collect()
487
+ vram_start = torch.cuda.memory_allocated(0)
517
488
 
518
489
  logger.info(
519
- " torch ChFSI eigensolver: N=%d, k=%d, m=%d, degree=%d, "
520
- "device=%s, spmv=float32, ritz=float64",
521
- N, k, m, CHEB_DEGREE, device,
490
+ " torch ChFSI: N=%d, k=%d, m=%d, deg=%d, "
491
+ "VRAM_start=%.0f MB",
492
+ N, k, m, CHEB_DEGREE, vram_start / 1e6,
522
493
  )
523
494
 
524
- # ── Step 1: Generalised → standard via M^{−½} (on CPU) ─────────
495
+ # ── Step 1: Generalised → standard via M^{−½} (CPU) ────────────
525
496
  M_diag = np.array(M.diagonal()).ravel().astype(np.float64)
526
497
  M_diag = np.maximum(M_diag, 1e-16)
527
- M_inv_sqrt_np = (1.0 / np.sqrt(M_diag)) # float64 for precision
498
+ M_inv_sqrt_np = 1.0 / np.sqrt(M_diag)
528
499
 
529
- D_sp = sp.diags(M_inv_sqrt_np.astype(spmv_np_dtype), format="csc")
530
- A_cpu = (D_sp @ L.tocsc().astype(spmv_np_dtype) @ D_sp).tocsr()
531
- del D_sp # free CPU temp
500
+ D_sp = sp.diags(M_inv_sqrt_np.astype(np.float32), format="csc")
501
+ A_cpu = (D_sp @ L.tocsc().astype(np.float32) @ D_sp).tocsr()
502
+ del D_sp
532
503
 
533
- # ── Helper: scipy CSR → torch sparse CSR on device ──────────────
534
- def _scipy_to_torch_csr(mat_csr):
504
+ # ── scipy CSR → torch CSR ───────────────────────────────────────
505
+ def _to_csr(m_csr):
535
506
  return torch.sparse_csr_tensor(
536
- torch.from_numpy(mat_csr.indptr.astype(np.int64)).to(device),
537
- torch.from_numpy(mat_csr.indices.astype(np.int64)).to(device),
538
- torch.from_numpy(mat_csr.data.astype(spmv_np_dtype)).to(device),
539
- size=mat_csr.shape,
540
- dtype=spmv_torch_dtype,
507
+ torch.from_numpy(m_csr.indptr.astype(np.int64)).to(device),
508
+ torch.from_numpy(m_csr.indices.astype(np.int64)).to(device),
509
+ torch.from_numpy(m_csr.data.astype(np.float32)).to(device),
510
+ size=m_csr.shape, dtype=spmv_torch_dtype,
541
511
  )
542
512
 
543
- # ── Helper: sparse matvec A @ X on GPU ──────────────────────────
544
- def _spmm(A_t, X):
545
- """Sparse × dense matrix multiply, shape (N, m)."""
546
- return torch.sparse.mm(A_t, X)
547
-
548
513
  try:
549
- # ── Step 2: Transfer A to GPU ───────────────────────────────
550
- A_t = _scipy_to_torch_csr(A_cpu)
551
- del A_cpu # free CPU copy (~14 MB saved)
552
-
553
- # ── Step 3: Estimate λ_max via power iteration ──────────────
554
- # 30 iters is overkill for Rayleigh quotient convergence on
555
- # a mesh Laplacian, but costs only ~15 ms and gives a tight
556
- # bound that improves Chebyshev filter quality.
557
- torch.manual_seed(42)
558
- v = torch.randn(N, 1, dtype=spmv_torch_dtype, device=device)
559
- v = v / v.norm()
560
- for _ in range(POWER_ITERS):
561
- v = _spmm(A_t, v)
562
- v = v / v.norm()
563
- # Rayleigh quotient in float64 for a precise λ_max
564
- v64 = v.to(ritz_torch_dtype)
565
- Av64 = _spmm(A_t, v).to(ritz_torch_dtype)
566
- lambda_max = float((v64.T @ Av64).item()) * 1.05 # 5% safety
567
- del v, v64, Av64
568
- logger.info(" λ_max ≈ %.4f", lambda_max)
569
-
570
- # ── Step 4: ChFSI outer loop ───────────────────────────────
571
- # Initial random subspace
572
- torch.manual_seed(42)
573
- V = torch.randn(N, m, dtype=spmv_torch_dtype, device=device)
574
- V, _ = torch.linalg.qr(V)
575
-
576
- # Chebyshev filter interval: we want eigenvalues in [0, λ_cut]
577
- # where λ_cut is a rough upper bound for the k-th eigenvalue.
578
- # Heuristic: Weyl's law gives λ_k ∝ k for 2D surfaces, so
579
- # λ_cut ≈ λ_max × (2 * m / N) is a conservative estimate.
580
- # We refine after the first Ritz step.
581
- lambda_cut = lambda_max * (2.0 * m / N)
582
- lambda_cut = max(lambda_cut, lambda_max * 0.01) # floor
583
-
584
- converged = False
585
- for outer in range(maxiter):
586
- # ── Chebyshev filter: T_d(scaled_A) @ V ────────────────
587
- # Maps A from [λ_cut, λ_max] → [−1, 1], then applies
588
- # Chebyshev polynomial that is ~0 on [−1, 1] (unwanted
589
- # eigenvalues) and large on (−∞, −1) (wanted eigenvalues).
590
- #
591
- # Scaling: σ = (λ_max − λ_cut) / 2
592
- # c = (λ_max + λ_cut) / 2
593
- # A_scaled = (A − c·I) / σ
594
- #
595
- # 3-term recurrence:
596
- # Y₀ = V
597
- # Y₁ = (1/σ)(A − c·I) V = (A·V − c·V) / σ
598
- # Y_{j+1} = (2/σ)(A − c·I) Y_j − Y_{j−1}
599
- # = (2(A·Y_j − c·Y_j) / σ) − Y_{j−1}
600
-
601
- e = (lambda_max - lambda_cut) / 2.0
602
- c = (lambda_max + lambda_cut) / 2.0
603
-
604
- # Safeguard: e must be positive
605
- if e < 1e-10:
606
- e = lambda_max * 0.5
607
- c = lambda_max * 0.5
608
-
609
- sigma = e / c if abs(c) > 1e-12 else 1.0
610
- sigma1 = sigma
611
-
612
- # Y₀ = V (reuse V buffer)
613
- # Y₁ = σ₁/e · (A·V − c·V)
614
- AV = _spmm(A_t, V) # (N, m) f32
615
- Y_prev = V # alias, no copy
616
- Y_curr = (sigma1 / e) * (AV - c * V) # (N, m) f32
617
- del AV
618
-
619
- for d in range(2, CHEB_DEGREE + 1):
620
- sigma_new = 1.0 / (2.0 / sigma - sigma1)
621
- AY = _spmm(A_t, Y_curr) # (N, m) f32
622
- Y_next = (2.0 * sigma_new / e) * (AY - c * Y_curr) \
623
- - (sigma * sigma_new) * Y_prev
624
- Y_prev = Y_curr
625
- Y_curr = Y_next
626
- sigma = sigma_new
627
- del AY
628
-
629
- del Y_prev # free (N, m) buffer
630
-
631
- # ── Orthogonalise filtered subspace ────────────────────
632
- V, _ = torch.linalg.qr(Y_curr)
633
- del Y_curr
634
-
635
- # ── Rayleigh–Ritz in float64 ──────────────────────────
636
- # AV in float32 for speed, then upcast for the small eigh
637
- AV = _spmm(A_t, V) # (N, m) f32
638
- V64 = V.to(ritz_torch_dtype) # (N, m) f64
639
- AV64 = AV.to(ritz_torch_dtype) # (N, m) f64
640
- del AV
641
-
642
- H = V64.T @ AV64 # (m, m) f64
643
- H = 0.5 * (H + H.T) # symmetrise
644
- ritz_vals, ritz_vecs = torch.linalg.eigh(H) # sorted ascending
645
-
646
- # ── Convergence check: max residual norm ───────────────
647
- # residual_i = A·z_i − λ_i·z_i where z_i = V @ s_i
648
- eigvecs_m = V64 @ ritz_vecs[:, :k] # (N, k) f64
649
- Aeigvecs = AV64 @ ritz_vecs[:, :k] # (N, k) f64
650
- residuals = Aeigvecs - eigvecs_m * ritz_vals[:k].unsqueeze(0)
651
- max_res = float(residuals.norm(dim=0).max().item())
652
-
653
- del eigvecs_m, Aeigvecs, residuals, V64, AV64
654
-
655
- if outer % 5 == 0 or max_res < tol:
656
- logger.info(
657
- " ChFSI iter %2d: max_residual=%.2e, λ_cut=%.4f",
658
- outer, max_res, lambda_cut,
659
- )
660
-
661
- if max_res < tol:
662
- converged = True
663
- break
664
-
665
- # ── Update subspace: rotate V into Ritz basis ──────────
666
- V = V @ ritz_vecs[:, :m].to(spmv_torch_dtype)
667
-
668
- # ── Refine λ_cut from current Ritz estimates ───────────
669
- # Use 1.5× the m-th Ritz value as the new cutoff
670
- if ritz_vals.shape[0] > k:
671
- lambda_cut = float(ritz_vals[m - 1].item()) * 1.5
672
- lambda_cut = min(lambda_cut, lambda_max * 0.95)
514
+ A_t = _to_csr(A_cpu)
515
+ del A_cpu
673
516
 
674
- if not converged:
675
- logger.warning(
676
- " ChFSI did not converge in %d iters "
677
- "(max_residual=%.2e > tol=%.1e). Results may be approximate.",
678
- maxiter, max_res, tol,
679
- )
517
+ with torch.no_grad():
518
+
519
+ # ── Step 2: λ_max via power iteration ───────────────────
520
+ torch.manual_seed(42)
521
+ v = torch.randn(N, 1, dtype=spmv_torch_dtype, device=device)
522
+ v.div_(v.norm())
523
+ for pi in range(POWER_ITERS):
524
+ v = torch.sparse.mm(A_t, v)
525
+ v.div_(v.norm())
526
+ if is_cuda and pi % 10 == 9:
527
+ torch.cuda.synchronize()
528
+
529
+ Av = torch.sparse.mm(A_t, v)
530
+ lambda_max = float((v.T @ Av).item()) * 1.05
531
+ del v, Av
532
+ if is_cuda:
533
+ torch.cuda.synchronize()
534
+ torch.cuda.empty_cache()
680
535
 
681
- # ── Step 5: Extract final eigenpairs ────────────────────────
682
- evals_t = ritz_vals[:k] # (k,) f64 on GPU
683
- # Final eigenvectors: V @ ritz_vecs[:, :k] in float64
684
- evecs_t = V.to(ritz_torch_dtype) @ ritz_vecs[:, :k] # (N, k) f64
536
+ logger.info(" λ_max %.4f", lambda_max)
537
+
538
+ # ── Step 3: ChFSI outer loop ────────────────────────────
539
+ torch.manual_seed(42)
540
+ V = torch.randn(N, m, dtype=spmv_torch_dtype, device=device)
541
+ V, _ = torch.linalg.qr(V)
542
+
543
+ lambda_cut = lambda_max * (2.0 * m / N)
544
+ lambda_cut = max(lambda_cut, lambda_max * 0.01)
545
+
546
+ converged = False
547
+ max_res = float("inf")
548
+
549
+ for outer in range(maxiter):
550
+
551
+ # ── Chebyshev filter (IN-PLACE) ─────────────────────
552
+ # All arithmetic uses .add_(), .mul_() to avoid temps.
553
+ # Only torch.sparse.mm allocates (no out= support).
554
+ e = (lambda_max - lambda_cut) / 2.0
555
+ cc = (lambda_max + lambda_cut) / 2.0
556
+ if e < 1e-10:
557
+ e = lambda_max * 0.5
558
+ cc = lambda_max * 0.5
559
+
560
+ sigma = e / cc if abs(cc) > 1e-12 else 1.0
561
+ sigma1 = sigma
562
+
563
+ # Y₁ = (σ₁/e) · (A·V − c·V)
564
+ # In-place: AV = sparse.mm(A, V); AV -= c*V; AV *= σ₁/e
565
+ Y_curr = torch.sparse.mm(A_t, V) # (N,m) NEW alloc
566
+ Y_curr.add_(V, alpha=-cc) # in-place
567
+ Y_curr.mul_(sigma1 / e) # in-place
568
+ Y_prev = V.clone() # need a copy (V reused)
569
+
570
+ for d in range(2, CHEB_DEGREE + 1):
571
+ sigma_new = 1.0 / (2.0 / sigma1 - sigma)
572
+
573
+ # Y_next = (2σ_new/e)(A·Y_curr − c·Y_curr) − σ·σ_new·Y_prev
574
+ # In-place on the SpMV output:
575
+ Y_next = torch.sparse.mm(A_t, Y_curr) # NEW alloc
576
+ Y_next.add_(Y_curr, alpha=-cc) # -= c * Y_curr
577
+ Y_next.mul_(2.0 * sigma_new / e) # *= 2σ/e
578
+ Y_next.add_(Y_prev, alpha=-(sigma * sigma_new))
579
+
580
+ # Rotate buffers — reuse memory
581
+ Y_prev = Y_curr # old Y_curr becomes Y_prev
582
+ Y_curr = Y_next # new result becomes Y_curr
583
+ sigma = sigma_new
584
+ # Y_next ref dropped; old Y_prev eligible for GC
585
+
586
+ if is_cuda and d % 4 == 0:
587
+ torch.cuda.synchronize()
588
+
589
+ del Y_prev # free last-gen buffer
590
+ if is_cuda:
591
+ torch.cuda.synchronize()
592
+
593
+ # ── QR ──────────────────────────────────────────────
594
+ V, _ = torch.linalg.qr(Y_curr)
595
+ del Y_curr
596
+
597
+ # ── Rayleigh–Ritz (f64 for accuracy) ────────────────
598
+ AV_f32 = torch.sparse.mm(A_t, V) # (N,m) f32
599
+ V64 = V.to(ritz_torch_dtype) # (N,m) f64
600
+ AV64 = AV_f32.to(ritz_torch_dtype) # (N,m) f64
601
+ del AV_f32 # free f32 copy NOW
602
+
603
+ H = V64.T @ AV64 # (m,m) f64
604
+ H = 0.5 * (H + H.T) # symmetrise (safe)
605
+ ritz_vals, ritz_vecs = torch.linalg.eigh(H)
606
+ del H
607
+
608
+ # ── Convergence check ───────────────────────────────
609
+ # Compute residual norms without large (N,k) temporaries:
610
+ # res_i = ||AV64 @ s_i - λ_i * V64 @ s_i||
611
+ S_k = ritz_vecs[:, :k] # (m,k) f64 — view
612
+ Z_k = V64 @ S_k # (N,k) f64
613
+ AZ_k = AV64 @ S_k # (N,k) f64
614
+ del V64, AV64 # free the two big f64 blocks NOW
615
+
616
+ # In-place: scale Z_k columns by eigenvalues, then subtract
617
+ Z_k.mul_(ritz_vals[:k].unsqueeze(0)) # Z_k[:,i] *= λ_i
618
+ AZ_k.sub_(Z_k) # AZ_k -= λ·Z_k
619
+ max_res = float(AZ_k.norm(dim=0).max().item())
620
+ del Z_k, AZ_k, S_k
621
+
622
+ if is_cuda:
623
+ torch.cuda.synchronize()
624
+
625
+ if outer % 5 == 0 or max_res < tol:
626
+ logger.info(
627
+ " ChFSI iter %2d: res=%.2e, λ_cut=%.4f",
628
+ outer, max_res, lambda_cut,
629
+ )
630
+
631
+ if max_res < tol:
632
+ converged = True
633
+ break
634
+
635
+ # Rotate V into Ritz basis
636
+ V = V @ ritz_vecs[:, :m].to(spmv_torch_dtype)
637
+
638
+ # Refine λ_cut
639
+ if ritz_vals.shape[0] > k:
640
+ lambda_cut = float(ritz_vals[m - 1].item()) * 1.5
641
+ lambda_cut = min(lambda_cut, lambda_max * 0.95)
642
+
643
+ # ── Aggressive VRAM cleanup EVERY iteration ─────────
644
+ if is_cuda:
645
+ torch.cuda.empty_cache()
646
+
647
+ # ── end outer loop ──────────────────────────────────────
648
+
649
+ if not converged:
650
+ logger.warning(
651
+ " ChFSI did not converge in %d iters "
652
+ "(res=%.2e > tol=%.1e).",
653
+ maxiter, max_res, tol,
654
+ )
685
655
 
686
- # ── Step 6: Undo mass-matrix transform: φ = M^{−½} · y ────
687
- M_inv_sqrt_t = torch.from_numpy(
688
- M_inv_sqrt_np
689
- ).to(dtype=ritz_torch_dtype, device=device).unsqueeze(1) # (N, 1)
656
+ # ── Extract eigenpairs ──────────────────────────────────
657
+ evals_t = ritz_vals[:k] # (k,) f64
658
+ evecs_t = V.to(ritz_torch_dtype) @ ritz_vecs[:, :k] # (N,k) f64
659
+ del V, ritz_vals, ritz_vecs
690
660
 
691
- evecs_t = evecs_t * M_inv_sqrt_t # (N, k) f64
692
- del M_inv_sqrt_t, V, ritz_vals, ritz_vecs
661
+ M_inv_sqrt_t = torch.from_numpy(
662
+ M_inv_sqrt_np
663
+ ).to(dtype=ritz_torch_dtype, device=device).unsqueeze(1)
664
+ evecs_t.mul_(M_inv_sqrt_t) # in-place
665
+ del M_inv_sqrt_t
693
666
 
694
- # Move to CPU
695
- evals = evals_t.cpu().numpy().astype(np.float64)
696
- evecs = evecs_t.cpu().numpy().astype(np.float64)
697
- del evals_t, evecs_t
667
+ if is_cuda:
668
+ torch.cuda.synchronize()
669
+ evals = evals_t.cpu().numpy().astype(np.float64)
670
+ evecs = evecs_t.cpu().numpy().astype(np.float64)
671
+ del evals_t, evecs_t
698
672
 
699
673
  finally:
700
- # Guarantee GPU cleanup even on error — critical for batch mode
701
- if device.type == "cuda":
674
+ if is_cuda:
702
675
  torch.cuda.synchronize()
703
676
  torch.cuda.empty_cache()
677
+ gc.collect()
678
+ torch.cuda.empty_cache() # double-tap after gc frees python refs
679
+ vram_end = torch.cuda.memory_allocated(0)
680
+ delta = vram_end - vram_start
681
+ if delta > 1e6: # > 1 MB leak
682
+ logger.warning(
683
+ " VRAM leak detected: +%.1f MB (start=%.0f, end=%.0f)",
684
+ delta / 1e6, vram_start / 1e6, vram_end / 1e6,
685
+ )
704
686
 
705
687
  order = np.argsort(evals)
706
688
  return evals[order], evecs[:, order]
@@ -71,10 +71,18 @@ def gc_gpu() -> None:
71
71
  """
72
72
  Aggressively free GPU memory across all available backends.
73
73
 
74
- Calls ``torch.cuda.empty_cache()``, ``cupy.get_default_memory_pool().free_all_blocks()``,
75
- and Python garbage collector. Safe to call even when no GPU or backends are available.
74
+ Uses a **double-tap** pattern: ``gc.collect()`` →
75
+ ``empty_cache()`` → ``gc.collect()`` → ``empty_cache()`` to
76
+ ensure Python cyclic references holding CUDA tensors are fully
77
+ broken before the caching allocator releases blocks. Critical
78
+ for multi-subject batch pipelines where VRAM fragmentation
79
+ accumulates over hundreds of subjects.
80
+
81
+ Safe to call even when no GPU or backends are available.
76
82
  """
77
83
  import gc
84
+
85
+ # First pass: break Python references → free CUDA tensors
78
86
  gc.collect()
79
87
 
80
88
  try:
@@ -85,6 +93,16 @@ def gc_gpu() -> None:
85
93
  except ImportError:
86
94
  pass
87
95
 
96
+ # Second pass: catch cyclic refs that survived first gc
97
+ gc.collect()
98
+
99
+ try:
100
+ import torch
101
+ if torch.cuda.is_available():
102
+ torch.cuda.empty_cache()
103
+ except ImportError:
104
+ pass
105
+
88
106
  try:
89
107
  import cupy as cp
90
108
  cp.get_default_memory_pool().free_all_blocks()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: corticalfields
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Spectral cortical and subcortical analysis with statistical testing (RSA, CCA, PLS, PERMANOVA, TFCE, NBS, laterality classification), on meshes and point clouds — Laplace-Beltrami decomposition, atlas-free asymmetry, GPU-accelerated optimal transport, hippocampal subfield analysis (HippUnfold), ShapeDNA/BrainPrint spectral fingerprinting, geometric deep learning, Bayesian inference, and normative modeling for structural neuroimaging.
5
5
  Author-email: rdneuro <r.debona@ufrj.br>
6
6
  License: MIT
File without changes
File without changes
File without changes