AOT-biomaps 2.9.138-py3-none-any.whl → 2.9.279-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of AOT-biomaps might be problematic.

Files changed (31)
  1. AOT_biomaps/AOT_Acoustic/AcousticTools.py +35 -115
  2. AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
  3. AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +22 -18
  4. AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
  5. AOT_biomaps/AOT_Experiment/_mainExperiment.py +102 -68
  6. AOT_biomaps/AOT_Optic/_mainOptic.py +124 -58
  7. AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +72 -108
  8. AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +474 -289
  9. AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +173 -68
  10. AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +360 -154
  11. AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +150 -111
  12. AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
  13. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +281 -0
  14. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +328 -0
  15. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
  16. AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
  17. AOT_biomaps/AOT_Recon/AlgebraicRecon.py +359 -238
  18. AOT_biomaps/AOT_Recon/AnalyticRecon.py +29 -41
  19. AOT_biomaps/AOT_Recon/BayesianRecon.py +165 -91
  20. AOT_biomaps/AOT_Recon/DeepLearningRecon.py +4 -1
  21. AOT_biomaps/AOT_Recon/PrimalDualRecon.py +175 -31
  22. AOT_biomaps/AOT_Recon/ReconEnums.py +38 -3
  23. AOT_biomaps/AOT_Recon/ReconTools.py +184 -77
  24. AOT_biomaps/AOT_Recon/__init__.py +1 -0
  25. AOT_biomaps/AOT_Recon/_mainRecon.py +144 -74
  26. AOT_biomaps/__init__.py +4 -36
  27. {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/METADATA +2 -1
  28. aot_biomaps-2.9.279.dist-info/RECORD +47 -0
  29. aot_biomaps-2.9.138.dist-info/RECORD +0 -43
  30. {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/WHEEL +0 -0
  31. {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/top_level.txt +0 -0
@@ -1,314 +1,499 @@
  from AOT_biomaps.Config import config
- import numba
+ from AOT_biomaps.AOT_Recon.ReconTools import calculate_memory_requirement, check_gpu_memory
+ from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType
+
  import torch
  import numpy as np
- import os
  from tqdm import trange
+ import pycuda.driver as drv
+ import torch.cuda
+ import gc
+
+
+
+ def LS(
+     SMatrix,
+     y,
+     numIterations=100,
+     isSavingEachIteration=True,
+     withTumor=True,
+     alpha=1e-1,
+     device=None,
+     use_numba=False,
+     denominator_threshold=1e-6,
+     max_saves=5000,
+     show_logs=True,
+     smatrixType=SMatrixType.SELL,
+     Z=350,
+ ):
+     """
+     Least Squares reconstruction using Projected Gradient Descent (PGD) with non-negativity constraint.
+     Currently only implements the stable GPU version.
+     """
+     tumor_str = "WITH" if withTumor else "WITHOUT"
+     # Auto-select device and method
+     if device is None:
+         if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+             device = torch.device(f"cuda:{config.select_best_gpu()}")
+             use_gpu = True
+         else:
+             device = torch.device("cpu")
+             use_gpu = False
+     else:
+         use_gpu = device.type == "cuda"
+     # Dispatch to the appropriate implementation
+     if use_gpu:
+         if smatrixType == SMatrixType.CSR:
+             return _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, Z, show_logs)
+         elif smatrixType == SMatrixType.SELL:
+             return _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+         elif smatrixType == SMatrixType.DENSE:
+             return _LS_GPU_stable(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+         else:
+             raise ValueError("Unsupported SMatrixType for GPU LS.")
+     else:
+         raise NotImplementedError("Only GPU implementations are currently available for LS.")

- def _LS_GPU_basic(SMatrix, y, numIterations, isSavingEachIteration=True, withTumor=True):
+ def _LS_GPU_stable(SMatrix, y, numIterations, alpha, isSavingEachIteration, tumor_str, max_saves=5000, show_logs=True):
+     """
+     Stable GPU implementation of LS using projected gradient descent with diagonal preconditioner.
+     """
      device = torch.device(f"cuda:{config.select_best_gpu()}")
      T, Z, X, N = SMatrix.shape
      ZX = Z * X
      TN = T * N
-     if y.shape != (T, N):
-         raise ValueError(f"Expected y shape: ({T}, {N}), got {y.shape}")
-
+     # 1. Conversion and normalization
      A_flat = torch.from_numpy(SMatrix).to(device=device, dtype=torch.float32).permute(0, 3, 1, 2).reshape(TN, ZX)
      y_flat = torch.from_numpy(y).to(device=device, dtype=torch.float32).reshape(TN)
-
-     # Uniform initialization (important!)
-     theta_flat = torch.ones(ZX, dtype=torch.float32, device=device) / (Z * X)
-
-     saved_theta = []
-     saved_indices = []
-     if isSavingEachIteration:
-         saved_theta.append(theta_flat.reshape(Z, X).clone())
-         saved_indices.append(0)
-     step = max(1, (numIterations - 1) // 999)
-     save_count = 1
-
-     # Normalization of A (per column) and of y (by its max)
-     col_norms = torch.norm(A_flat, dim=0, keepdim=True)
-     A_normalized = A_flat / (col_norms + 1e-8)
-     y_normalized = y_flat / (torch.max(y_flat) + 1e-8)  # Normalizes y between 0 and ~1
-
-     description = f"AOT-BioMaps -- LS Reconstruction ---- {'WITH' if withTumor else 'WITHOUT'} TUMOR ---- GPU {torch.cuda.current_device()}"
-
-     with torch.no_grad():
-         for k in trange(numIterations, desc=description):
-             r = y_normalized - A_normalized @ theta_flat
-             p = A_normalized.T @ r
-             rsold = torch.dot(r, r)
-
-             for _ in range(2):
-                 Ap = A_normalized @ p
-                 alpha = rsold / (torch.dot(p, A_normalized.T @ Ap) + 1e-8)
-                 theta_flat += alpha * p
-                 theta_flat = torch.clamp(theta_flat, min=0)  # Projection onto R+
-                 r -= alpha * Ap
-                 rsnew = torch.dot(r, r)
-                 if rsnew < 1e-8:
-                     break
-                 p = A_normalized.T @ r + (rsnew / rsold) * p
-                 rsold = rsnew
-
-             if isSavingEachIteration and (k % step == 0 or k == numIterations - 1):
-                 # Normalize between 0 and 1 before saving
-                 theta_normalized = theta_flat.clone()
-                 if torch.max(theta_normalized) > 0:
-                     theta_normalized = theta_normalized / torch.max(theta_normalized)
-                 saved_theta.append(theta_normalized.reshape(Z, X).clone())
-                 saved_indices.append(k + 1)
-                 save_count += 1
-                 if save_count >= 1000:
-                     break
-
-     # Final normalization between 0 and 1
-     if torch.max(theta_flat) > 0:
-         theta_flat = theta_flat / torch.max(theta_flat)
-
-     del A_flat, y_flat, A_normalized, y_normalized
+     norm_A = A_flat.max()
+     norm_y = y_flat.max()
+     A_flat.div_(norm_A + 1e-8)
+     y_flat.div_(norm_y + 1e-8)
+     # 2. Initialization
+     lambda_k = torch.zeros(ZX, device=device)
+     lambda_history = [] if isSavingEachIteration else None
+     saved_indices = []  # Stores the indices of the saved iterations
+
+     # Calculate save indices
+     if numIterations <= max_saves:
+         save_indices = list(range(numIterations))
+     else:
+         step = numIterations // max_saves
+         save_indices = list(range(0, numIterations, step))
+         if save_indices[-1] != numIterations - 1:
+             save_indices.append(numIterations - 1)
+
+     # Diagonal preconditioner
+     diag_AAT = torch.sum(A_flat ** 2, dim=0)
+     M_inv = 1.0 / torch.clamp(diag_AAT, min=1e-6)
+     # Pre-allocate tensors
+     r_k = torch.empty_like(y_flat)
+     AT_r = torch.empty(ZX, device=device)
+     description = f"AOT-BioMaps -- Stable LS Reconstruction ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+
+     iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+     for it in iterator:
+         # Residual computation (in place)
+         torch.matmul(A_flat, lambda_k, out=r_k)
+         r_k = y_flat - r_k
+         if isSavingEachIteration and it in save_indices:
+             lambda_history.append(lambda_k.clone().reshape(Z, X) * (norm_y / norm_A))
+             saved_indices.append(it)
+
+         # Preconditioned gradient (in place)
+         torch.matmul(A_flat.T, r_k, out=AT_r)
+         AT_r *= M_inv
+         # Update with fixed step size and projection (in place)
+         lambda_k.add_(AT_r, alpha=alpha)
+         lambda_k.clamp_(min=0)
+
+     # 3. Denormalization
+     lambda_final = lambda_k.reshape(Z, X) * (norm_y / norm_A)
+     # Free memory
+     del A_flat, y_flat, r_k, AT_r
      torch.cuda.empty_cache()
-
      if isSavingEachIteration:
-         return [theta.cpu().numpy() for theta in saved_theta], saved_indices
+         return [t.cpu().numpy() for t in lambda_history], saved_indices
      else:
-         return theta_flat.reshape(Z, X).cpu().numpy(), None
+         return lambda_final.cpu().numpy(), None
+
+ def _LS_GPU_opti(*args, **kwargs):
+     raise NotImplementedError("Only _LS_GPU_stable is implemented for now.")
+
+ def _LS_GPU_multi(*args, **kwargs):
+     raise NotImplementedError("Only _LS_GPU_stable is implemented for now.")
+
+ def _LS_CPU_opti(*args, **kwargs):
+     raise NotImplementedError("Only _LS_GPU_stable is implemented for now.")
+
+ def _LS_CPU_basic(*args, **kwargs):
+     raise NotImplementedError("Only _LS_GPU_stable is implemented for now.")
+
+ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+     """
+     Least Squares (LS) reconstruction via Conjugate Gradient (CG) on the CSR format.
+     Uses the same arguments as the MLEM function, without Python sub-functions.

- def _LS_CPU_basic(SMatrix, y, numIterations, isSavingEachIteration, withTumor):
-     try:
-         T, Z, X, N = SMatrix.shape
-         theta_p = np.ones((Z, X))
-         saved_theta = []
-         saved_indices = []
-         if isSavingEachIteration:
-             saved_theta.append(theta_p.copy())
-             saved_indices.append(0)
-         step = max(1, (numIterations - 1) // 999)
-         save_count = 1
-         description = f"AOT-BioMaps -- LS Reconstruction ---- {'WITH' if withTumor else 'WITHOUT'} TUMOR ---- CPU (basic) ----"
-         for k in trange(numIterations, desc=description):
-             ATA = np.zeros((Z, X, Z, X))
-             ATy = np.zeros((Z, X))
-             for _t in range(T):
-                 for _n in range(N):
-                     ATA += np.einsum('ij,kl->ijkl', SMatrix[_t, :, :, _n], SMatrix[_t, :, :, _n])
-                     ATy += SMatrix[_t, :, :, _n] * y[_t, _n]
-             theta_p = np.linalg.solve(ATA.reshape(Z*X, Z*X), ATy.reshape(Z*X)).reshape(Z, X)
-             if isSavingEachIteration and (k % step == 0 or k == numIterations - 1):
-                 saved_theta.append(theta_p.copy())
-                 saved_indices.append(k + 1)
-                 save_count += 1
-                 if save_count >= 1000:
-                     break
-         if isSavingEachIteration:
-             return saved_theta, saved_indices
-         else:
-             return theta_p, None
-     except Exception as e:
-         print("Error in basic CPU LS:", type(e).__name__, ":", e)
-         return None, None
+     SMatrix: SparseSMatrix_CSR instance (already allocated)
+     y: measured data (1D np.float32 of size TN)
+     """
+     final_result = None
+
+     # Parameters not used by CG but kept for the shared signature: denominator_threshold, device
+
+     # --- Dot product logic (inlined) ---
+     def _dot_product_gpu(mod, a_ptr, b_ptr, N_int, stream):
+         block_size = 256
+         grid_size = (N_int + block_size - 1) // block_size
+
+         reduction_host = np.empty(grid_size, dtype=np.float32)
+         reduction_buffer = drv.mem_alloc(reduction_host.nbytes)
+
+         dot_kernel = mod.get_function("dot_product_reduction_kernel")
+
+         dot_kernel(reduction_buffer, a_ptr, b_ptr, np.int32(N_int),
+                    block=(block_size, 1, 1), grid=(grid_size, 1, 1), stream=stream)
+
+         drv.memcpy_dtoh(reduction_host, reduction_buffer)
+         total_dot = np.sum(reduction_host)
+
+         reduction_buffer.free()
+         return total_dot
+     # -----------------------------------------------

- def _LS_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, withTumor):
      try:
-         T, Z, X, N = SMatrix.shape
-         A_flat = SMatrix.astype(np.float32).transpose(0, 3, 1, 2).reshape(T*N, Z*X)
-         y_flat = y.astype(np.float32).reshape(-1)
-         theta_flat = np.zeros(Z*X, dtype=np.float32)
-         saved_theta = []
-         saved_indices = []
-         if isSavingEachIteration:
-             saved_theta.append(theta_flat.reshape(Z, X).copy())
-             saved_indices.append(0)
-         step = max(1, (numIterations - 1) // 999)
-         save_count = 1
-         A_normalized = A_flat / (np.linalg.norm(A_flat, axis=0, keepdims=True) + 1e-8)
-         y_normalized = y_flat / (np.linalg.norm(y_flat) + 1e-8)
-         description = f"AOT-BioMaps -- LS Reconstruction ---- {'WITH' if withTumor else 'WITHOUT'} TUMOR ---- CPU (optimized) ----"
-         for k in trange(numIterations, desc=description):
-             ATA = A_normalized.T @ A_normalized
-             ATy = A_normalized.T @ y_normalized
-             theta_flat = np.linalg.lstsq(ATA, ATy, rcond=None)[0]
-             if isSavingEachIteration and (k % step == 0 or k == numIterations - 1):
-                 saved_theta.append(theta_flat.reshape(Z, X).copy())
-                 saved_indices.append(k + 1)
-                 save_count += 1
-                 if save_count >= 1000:
-                     break
-         if isSavingEachIteration:
-             return saved_theta, saved_indices
+         if not isinstance(SMatrix, SMatrix.__class__):
+             raise TypeError("SMatrix must be a SparseSMatrix_CSR object")
+
+         if SMatrix.ctx:
+             SMatrix.ctx.push()
+
+         dtype = np.float32
+         TN = SMatrix.N * SMatrix.T
+         ZX = SMatrix.Z * SMatrix.X
+         Z = SMatrix.Z
+         X = SMatrix.X
+         block_size = 256
+         tolerance = 1e-12
+
+         if show_logs:
+             print(f"Executing on GPU device index: {SMatrix.device.primary_context.device.name()}")
+             print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")
+
+         stream = drv.Stream()
+         mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
+
+         # Fetch the kernels
+         projection_kernel = mod.get_function('projection_kernel__CSR')
+         backprojection_kernel = mod.get_function('backprojection_kernel__CSR')
+         axpby_kernel = mod.get_function("vector_axpby_kernel")
+         minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
+
+         # --- Buffer allocation (raw pointers) ---
+         y = y.T.flatten().astype(dtype)
+         y_gpu = drv.mem_alloc(y.nbytes)
+         drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+         theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # lambda
+         drv.memcpy_htod_async(theta_flat_gpu, np.full(ZX, 0.1, dtype=dtype), stream)
+
+         q_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)  # q = A*p
+         r_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # r (residue)
+         p_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # p (direction)
+         z_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # z = A^T A p
+         ATy_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # A^T y (constant)
+
+         # --- CG initialization ---
+
+         # 1. ATy = A^T * y
+         drv.memset_d32_async(ATy_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(ATy_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               y_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 2. q = A * theta_0
+         projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                           theta_flat_gpu, np.int32(TN),
+                           block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 3. r_temp = A^T * q = A^T A theta_0
+         drv.memset_d32_async(r_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(r_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               q_flat_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 4. r_0 = ATy - r_temp (r = ATy + (-1)*r_temp)
+         axpby_kernel(r_flat_gpu, ATy_flat_gpu, r_flat_gpu,
+                      np.float32(1.0), np.float32(-1.0), np.int32(ZX),
+                      block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 5. p_0 = r_0
+         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
+
+         # 6. rho_prev = ||r_0||^2
+         rho_prev = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+         # --- Iterative loop ---
+         saved_theta, saved_indices = [], []
+         if numIterations <= max_saves:
+             save_indices = list(range(numIterations))
          else:
-             return theta_flat.reshape(Z, X), None
+             save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+             if save_indices[-1] != numIterations - 1:
+                 save_indices.append(numIterations - 1)
+
+         description = f"AOT-BioMaps -- LS-CG (CSR-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+         for it in iterator:
+             # a. q = A * p
+             projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               p_flat_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # b. z = A^T * q = A^T A p
+             drv.memset_d32_async(z_flat_gpu, 0, ZX, stream)
+             backprojection_kernel(z_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                                   q_flat_gpu, np.int32(TN),
+                                   block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # c. alpha = rho_prev / <p, z>
+             pAp = _dot_product_gpu(mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+
+             if abs(pAp) < 1e-15: break
+             alpha = rho_prev / pAp
+
+             # d. theta = theta + alpha * p
+             axpby_kernel(theta_flat_gpu, theta_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), alpha, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # e. r = r - alpha * z
+             minus_axpy_kernel(r_flat_gpu, z_flat_gpu, alpha, np.int32(ZX),
+                               block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # f. rho_curr = ||r||^2
+             rho_curr = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+             if rho_curr < tolerance: break
+
+             # g. beta = rho_curr / rho_prev
+             beta = rho_curr / rho_prev
+
+             # h. p = r + beta * p
+             axpby_kernel(p_flat_gpu, r_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), beta, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             rho_prev = rho_curr
+
+             if show_logs and (it % 10 == 0 or it == numIterations - 1):
+                 drv.Context.synchronize()
+
+             if isSavingEachIteration and it in save_indices:
+                 theta_host = np.empty(ZX, dtype=dtype)
+                 drv.memcpy_dtoh(theta_host, theta_flat_gpu)
+                 saved_theta.append(theta_host.reshape(Z, X))
+                 saved_indices.append(it)
+
+         drv.Context.synchronize()
+
+         final_result = np.empty(ZX, dtype=dtype)
+         drv.memcpy_dtoh(final_result, theta_flat_gpu)
+         final_result = final_result.reshape(Z, X)
+
+         # Free the buffers
+         y_gpu.free(); q_flat_gpu.free(); r_flat_gpu.free(); p_flat_gpu.free(); z_flat_gpu.free(); theta_flat_gpu.free(); ATy_flat_gpu.free()
+
+         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
      except Exception as e:
-         print("Error in optimized CPU LS:", type(e).__name__, ":", e)
+         print(f"Error in LS_CG_sparseCSR_pycuda: {type(e).__name__}: {e}")
+         gc.collect()
          return None, None
+
+     finally:
+         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+             SMatrix.ctx.pop()
+
+ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+     """
+     Least Squares (LS) reconstruction via Conjugate Gradient (CG) on the SELL-C-sigma format.
+     Uses the same arguments as the MLEM function, without Python sub-functions.
+
+     SMatrix: SparseSMatrix_SELL instance (already allocated)
+     y: measured data (1D np.float32 of size TN)
+     """
+     final_result = None
+
+     # --- Dot product logic (inlined) ---
+     def _dot_product_gpu(mod, a_ptr, b_ptr, N_int, stream):
+         block_size = 256
+         grid_size = (N_int + block_size - 1) // block_size
+
+         reduction_host = np.empty(grid_size, dtype=np.float32)
+         reduction_buffer = drv.mem_alloc(reduction_host.nbytes)
+
+         dot_kernel = mod.get_function("dot_product_reduction_kernel")
+
+         dot_kernel(reduction_buffer, a_ptr, b_ptr, np.int32(N_int),
+                    block=(block_size, 1, 1), grid=(grid_size, 1, 1), stream=stream)
+
+         drv.memcpy_dtoh(reduction_host, reduction_buffer)
+         total_dot = np.sum(reduction_host)
+
+         reduction_buffer.free()
+         return total_dot
+     # -----------------------------------------------

- def _LS_GPU_multi(SMatrix, y, numIterations, isSavingEachIteration, withTumor):
      try:
-         num_gpus = torch.cuda.device_count()
-         device = torch.device('cuda:0')
-         T, Z, X, N = SMatrix.shape
-         A_matrix_torch = torch.tensor(SMatrix, dtype=torch.float32).to(device).permute(0, 3, 1, 2).reshape(T*N, Z*X)
-         y_torch = torch.tensor(y, dtype=torch.float32).to(device).reshape(-1)
-         saved_theta = []
-         saved_indices = []
-         if isSavingEachIteration:
-             saved_theta.append(torch.zeros(Z, X, device=device).cpu().numpy())
-             saved_indices.append(0)
-         step = max(1, (numIterations - 1) // 999)
-         save_count = 1
-         A_split = torch.chunk(A_matrix_torch, num_gpus, dim=0)
-         y_split = torch.chunk(y_torch, num_gpus)
-         theta_0 = torch.zeros(Z*X, dtype=torch.float32, device=device)
-         theta_list = [theta_0.clone().to(device) for _ in range(num_gpus)]
-         description = f"AOT-BioMaps -- LS Reconstruction ---- {'WITH' if withTumor else 'WITHOUT'} TUMOR ---- multi-GPU ----"
-         for k in trange(numIterations, desc=description):
-             for i in range(num_gpus):
-                 with torch.cuda.device(f'cuda:{i}'):
-                     A_i = A_split[i].to(f'cuda:{i}')
-                     y_i = y_split[i].to(f'cuda:{i}')
-                     theta_p = theta_list[i].to(f'cuda:{i}')
-                     r = y_i - A_i @ theta_p
-                     p = r.clone()
-                     rsold = torch.dot(r, r)
-                     for _ in range(2):
-                         Ap = A_i @ p
-                         alpha = rsold / (torch.dot(p, Ap) + 1e-8)
-                         theta_p += alpha * p
-                         r -= alpha * Ap
-                         rsnew = torch.dot(r, r)
-                         if rsnew < 1e-8:
-                             break
-                         p = r + (rsnew / rsold) * p
-                         rsold = rsnew
-                     theta_list[i] = theta_p.to('cuda:0')
-             if isSavingEachIteration and (k % step == 0 or k == numIterations - 1):
-                 saved_theta.append(torch.stack(theta_list).mean(dim=0).reshape(Z, X).cpu().numpy())
-                 saved_indices.append(k + 1)
-                 save_count += 1
-                 if save_count >= 1000:
-                     break
-         del A_matrix_torch, y_torch, A_split, y_split, theta_0
-         torch.cuda.empty_cache()
-         for i in range(num_gpus):
-             torch.cuda.empty_cache()
-         if isSavingEachIteration:
-             return saved_theta, saved_indices
+         if not isinstance(SMatrix, SMatrix.__class__):
+             raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
+         if SMatrix.sell_values_gpu is None:
+             raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")
+
+         if SMatrix.ctx:
+             SMatrix.ctx.push()
+
+         dtype = np.float32
+         TN = int(SMatrix.N * SMatrix.T)
+         ZX = int(SMatrix.Z * SMatrix.X)
+         Z = SMatrix.Z
+         X = SMatrix.X
+         block_size = 256
+         tolerance = 1e-12
+
+         # Access the SELL parameters
+         mod = SMatrix.sparse_mod
+         projection_kernel = mod.get_function("projection_kernel__SELL")
+         backprojection_kernel = mod.get_function("backprojection_kernel__SELL")
+         axpby_kernel = mod.get_function("vector_axpby_kernel")
+         minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
+         slice_height = np.int32(SMatrix.slice_height)
+         grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
+
+         stream = drv.Stream()
+
+         # Buffer allocation
+         y = y.T.flatten().astype(dtype)
+         y_gpu = drv.mem_alloc(y.nbytes)
+         drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+         theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         drv.memcpy_htod_async(theta_flat_gpu, np.full(ZX, 0.1, dtype=dtype), stream)
+
+         q_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+         r_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         p_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         z_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         ATy_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+
+         # --- CG initialization ---
+
+         # 1. ATy = A^T * y
+         drv.memset_d32_async(ATy_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               y_gpu, ATy_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 2. q = A * theta_0
+         projection_kernel(q_flat_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                           theta_flat_gpu, np.int32(TN), slice_height,
+                           block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 3. r_temp = A^T * q = A^T A theta_0
+         drv.memset_d32_async(r_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               q_flat_gpu, r_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 4. r_0 = ATy - r_temp
+         axpby_kernel(r_flat_gpu, ATy_flat_gpu, r_flat_gpu,
+                      np.float32(1.0), np.float32(-1.0), np.int32(ZX),
+                      block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 5. p_0 = r_0
+         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
+
+         # 6. rho_prev = ||r_0||^2
+         rho_prev = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+         # --- Iterative loop ---
+         saved_theta, saved_indices = [], []
+         if numIterations <= max_saves:
+             save_indices = list(range(numIterations))
          else:
-             return torch.stack(theta_list).mean(dim=0).reshape(Z, X).cpu().numpy(), None
+             save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+             if save_indices[-1] != numIterations - 1:
+                 save_indices.append(numIterations - 1)
+
+         description = f"AOT-BioMaps -- LS-CG (SELL-c-σ-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+         for it in iterator:
+             # a. q = A * p
+             projection_kernel(q_flat_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               p_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+             # b. z = A^T * q = A^T A p
+             drv.memset_d32_async(z_flat_gpu, 0, ZX, stream)
+             backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                                   q_flat_gpu, z_flat_gpu, np.int32(TN), slice_height,
+                                   block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+             # c. alpha = rho_prev / <p, z>
+             pAp = _dot_product_gpu(mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+
+             if abs(pAp) < 1e-15: break
+             alpha = rho_prev / pAp
+
+             # d. theta = theta + alpha * p
+             axpby_kernel(theta_flat_gpu, theta_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), alpha, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # e. r = r - alpha * z
+             minus_axpy_kernel(r_flat_gpu, z_flat_gpu, alpha, np.int32(ZX),
+                               block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # f. rho_curr = ||r||^2
+             rho_curr = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+             if rho_curr < tolerance: break
+
+             # g. beta = rho_curr / rho_prev
+             beta = rho_curr / rho_prev
+
+             # h. p = r + beta * p
+             axpby_kernel(p_flat_gpu, r_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), beta, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             rho_prev = rho_curr
+
+             stream.synchronize()
+             if isSavingEachIteration and it in save_indices:
+                 out = np.empty(ZX, dtype=dtype)
+                 drv.memcpy_dtoh(out, theta_flat_gpu)
+                 saved_theta.append(out.reshape((Z, X)))
+                 saved_indices.append(it)
+
+         # final copy
+         res = np.empty(ZX, dtype=np.float32)
+         drv.memcpy_dtoh(res, theta_flat_gpu)
+         final_result = res.reshape((Z, X))
+
+         # free temporaries
+         y_gpu.free(); q_flat_gpu.free(); r_flat_gpu.free(); p_flat_gpu.free(); z_flat_gpu.free(); theta_flat_gpu.free(); ATy_flat_gpu.free()
+
+         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
      except Exception as e:
-         print("Error in multi-GPU LS:", type(e).__name__, ":", e)
-         del A_matrix_torch, y_torch, A_split, y_split, theta_0
-         torch.cuda.empty_cache()
-         for i in range(num_gpus):
-             torch.cuda.empty_cache()
+         print(f"Error in LS_CG_sparseSELL_pycuda: {type(e).__name__}: {e}")
+         gc.collect()
          return None, None
-
- def _LS_TV_GPU(SMatrix, y, numIterations, isSavingEachIteration=True, withTumor=True, lambda_tv=1e-3, L_Factor=1.0, renormalize_output=True):
-     device = torch.device(f"cuda:{torch.cuda.current_device()}")
-     T, Z, X, N = SMatrix.shape
-     ZX = Z * X
-     TN = T * N
-
-     # Data conversion
-     A_flat = torch.from_numpy(SMatrix).to(device=device, dtype=torch.float32).permute(0, 3, 1, 2).reshape(TN, ZX)
-     y_flat = torch.from_numpy(y).to(device=device, dtype=torch.float32).reshape(TN)
-
-     # NaN/Inf checks
-     if torch.isnan(A_flat).any() or torch.isinf(A_flat).any():
-         raise ValueError("SMatrix contient des NaN ou Inf.")
-     if torch.isnan(y_flat).any() or torch.isinf(y_flat).any():
-         raise ValueError("y contient des NaN ou Inf.")
-
-     # Normalization
-     A_norm = torch.max(torch.abs(A_flat))
-     y_norm = torch.max(torch.abs(y_flat))
-     if A_norm > 0:
-         A_flat = A_flat / A_norm
-     if y_norm > 0:
-         y_flat = y_flat / y_norm
-
-     # Uniform initialization
-     theta_flat = torch.ones(ZX, device=device) / (Z * X)
-     theta_prev = theta_flat.clone()
-     t = torch.tensor(1.0, device=device)
-
-     # Lipschitz constant
-     L = L_Factor * (torch.norm(A_flat, 2).item() ** 2)
-
-     # Iteration storage
-     theta_history = []
-     saved_indices = []
-     if isSavingEachIteration:
-         theta_history.append(theta_flat.reshape(Z, X).clone())
-         saved_indices.append(0)
-     step = max(1, (numIterations - 1) // 999)
-     save_count = 1
-
-     description = f"AOT-BioMaps -- LS + TV (λ: {lambda_tv}) ---- {'WITH' if withTumor else 'WITHOUT'} TUMOR ---- GPU {torch.cuda.current_device()}"
-
-     # Pre-allocation
-     grad_tv = torch.zeros_like(theta_flat)
-
-     for k in trange(numIterations, desc=description):
-         # Least-squares gradient
-         grad_ls = A_flat.T @ (A_flat @ theta_flat - y_flat)
-
-         # TV gradient computation (corrected version with padding at the borders)
-         theta_2d = theta_flat.reshape(Z, X)
-
-         # Forward differences with zero padding at the borders
-         diff_z = torch.zeros_like(theta_2d)
-         diff_z[1:, :] = theta_2d[1:, :] - theta_2d[:-1, :]  # Vertical derivative
-
-         diff_x = torch.zeros_like(theta_2d)
-         diff_x[:, 1:] = theta_2d[:, 1:] - theta_2d[:, :-1]  # Horizontal derivative
-
-         # Divergence of the gradient (≈ Laplacian)
-         div_grad = torch.zeros_like(theta_2d)
-         # Contribution of diff_z (d/dz)
-         div_grad[:-1, :] += diff_z[1:, :]  # d/dz (θ_{z+1} - θ_z) → +1 at θ_z
-         div_grad[1:, :] -= diff_z[1:, :]   # → -1 at θ_{z+1}
-         # Contribution of diff_x (d/dx)
-         div_grad[:, :-1] += diff_x[:, 1:]  # d/dx (θ_{x+1} - θ_x) → +1 at θ_x
-         div_grad[:, 1:] -= diff_x[:, 1:]   # → -1 at θ_{x+1}
-
-         grad_tv = div_grad.reshape(-1)
-
-         # Update with TV regularization
-         grad_total = grad_ls + lambda_tv * grad_tv
-         theta_new = theta_flat - (1/L) * grad_total
-         theta_new = torch.clamp(theta_new, min=0.0)
-
-         # FISTA acceleration
-         t_new = (1 + torch.sqrt(1 + 4 * t**2)) / 2
-         theta_flat = theta_new + ((t - 1) / t_new) * (theta_new - theta_prev)
-         theta_prev = theta_new.clone()
-         t = t_new
-
-         # Conditional save
-         if isSavingEachIteration and (k % step == 0 or k == numIterations - 1):
-             theta_normalized = theta_flat.clone()
-             if torch.max(theta_normalized) > 0:
-                 theta_normalized /= torch.max(theta_normalized)
-             theta_history.append(theta_normalized.reshape(Z, X).clone())
-             saved_indices.append(k + 1)
-             save_count += 1
-             if save_count >= 1000:
-                 break
-
-     # Final renormalization
-     if renormalize_output:
-         if A_norm > 0 and y_norm > 0:
-             theta_flat *= (y_norm / (A_norm + 1e-8))
-         if torch.max(theta_flat) > 0:
-             theta_flat /= torch.max(theta_flat)
-
-     # Cleanup
-     del A_flat, y_flat, theta_prev, grad_ls, grad_tv, theta_new, div_grad, diff_z, diff_x
-     torch.cuda.empty_cache()
-
-     if isSavingEachIteration:
-         return [t.cpu().numpy() for t in theta_history], saved_indices
-     else:
-         return theta_flat.reshape(Z, X).cpu().numpy(), None
+
+     finally:
+         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+             SMatrix.ctx.pop()
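
The dense `_LS_GPU_stable` path added above is a fixed-step projected gradient descent on min ||A @ theta - y||^2 subject to theta >= 0, scaled by a diagonal (Jacobi) preconditioner built from diag(A^T A). A minimal NumPy sketch of the same update rule, assuming a dense system matrix A of shape (TN, ZX) and a measurement vector y of length TN; this is an illustration of the technique, not the package's GPU code:

import numpy as np

def ls_pgd_sketch(A, y, num_iterations=100, alpha=1e-1):
    # Projected gradient descent for min ||A @ theta - y||^2 with theta >= 0,
    # using 1 / diag(A^T A) as a diagonal preconditioner (as in _LS_GPU_stable).
    A = np.asarray(A, dtype=np.float32)
    y = np.asarray(y, dtype=np.float32)
    theta = np.zeros(A.shape[1], dtype=np.float32)
    M_inv = 1.0 / np.clip((A ** 2).sum(axis=0), 1e-6, None)   # 1 / diag(A^T A)
    for _ in range(num_iterations):
        r = y - A @ theta                         # residual
        theta += alpha * (M_inv * (A.T @ r))      # preconditioned gradient step
        np.clip(theta, 0.0, None, out=theta)      # projection onto theta >= 0
    return theta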
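The sparse CSR and SELL paths solve the same least-squares problem via conjugate gradient on the normal equations A^T A theta = A^T y, with the products A @ p ("projection") and A^T @ q ("backprojection") delegated to the CUDA kernels. The loop below is a plain-NumPy sketch of that iteration, for reference only; the released functions run it with raw pycuda buffers and the compiled kernels:

import numpy as np

def ls_cg_normal_equations_sketch(A, y, num_iterations=100, tolerance=1e-12):
    # Conjugate gradient on the normal equations A^T A theta = A^T y
    # (the _LS_CG_sparse*_pycuda functions run the same loop with GPU kernels).
    A = np.asarray(A, dtype=np.float32)
    y = np.asarray(y, dtype=np.float32)
    theta = np.full(A.shape[1], 0.1, dtype=np.float32)   # same 0.1 initialization as the GPU code
    ATy = A.T @ y
    r = ATy - A.T @ (A @ theta)                          # initial residual of the normal equations
    p = r.copy()
    rho_prev = float(r @ r)
    for _ in range(num_iterations):
        q = A @ p                                        # projection:      q = A p
        z = A.T @ q                                      # backprojection:  z = A^T A p
        pAp = float(p @ z)
        if abs(pAp) < 1e-15:
            break
        alpha = rho_prev / pAp
        theta += alpha * p
        r -= alpha * z
        rho_curr = float(r @ r)
        if rho_curr < tolerance:
            break
        p = r + (rho_curr / rho_prev) * p
        rho_prev = rho_curr
    return theta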