AOT-biomaps 2.9.261-py3-none-any.whl → 2.9.294-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of AOT-biomaps has been flagged as potentially problematic.

@@ -1,34 +1,57 @@
  from AOT_biomaps.Config import config
+ from AOT_biomaps.AOT_Recon.ReconTools import calculate_memory_requirement, check_gpu_memory
+ from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType
+
  import torch
  import numpy as np
  from tqdm import trange
- from AOT_biomaps.AOT_Recon.ReconTools import calculate_memory_requirement, check_gpu_memory
+ import pycuda.driver as drv
+ import torch.cuda
+ import gc
+
+

  def LS(
      SMatrix,
      y,
-     numIterations=5000,
-     alpha=1e-3,
+     numIterations=100,
      isSavingEachIteration=True,
      withTumor=True,
+     alpha=1e-1,
      device=None,
+     use_numba=False,
+     denominator_threshold=1e-6,
      max_saves=5000,
-     show_logs=True
+     show_logs=True,
+     smatrixType=SMatrixType.SELL
  ):
      """
      Least Squares reconstruction using Projected Gradient Descent (PGD) with non-negativity constraint.
      Currently only implements the stable GPU version.
      """
      tumor_str = "WITH" if withTumor else "WITHOUT"
-     # Force GPU usage for now
+     # Auto-select device and method
      if device is None:
          if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
-             raise RuntimeError("CUDA is required for this implementation.")
-             device = torch.device(f"cuda:{config.select_best_gpu()}")
+             device = torch.device(f"cuda:{config.select_best_gpu()}")
+             use_gpu = True
+         else:
+             device = torch.device("cpu")
+             use_gpu = False
+     else:
+         use_gpu = device.type == "cuda"
+     # Dispatch to the appropriate implementation
+     if use_gpu:
+         if smatrixType == SMatrixType.CSR:
+             return _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+         elif smatrixType == SMatrixType.SELL:
+             return _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+         elif smatrixType == SMatrixType.DENSE:
+             return _LS_GPU_stable(SMatrix, y, numIterations, alpha, isSavingEachIteration, tumor_str, max_saves, show_logs=show_logs)
+         else:
+             raise ValueError("Unsupported SMatrixType for GPU LS.")
      else:
-         if device.type != "cuda":
-             raise RuntimeError("Only GPU implementation is available for now.")
-     return _LS_GPU_stable(SMatrix, y, numIterations, alpha, isSavingEachIteration, tumor_str, max_saves, show_logs=show_logs)
+         raise NotImplementedError("Only GPU implementations are currently available for LS.")

  def _LS_GPU_stable(SMatrix, y, numIterations, alpha, isSavingEachIteration, tumor_str, max_saves=5000, show_logs=True):
      """
@@ -104,3 +127,370 @@ def _LS_CPU_opti(*args, **kwargs):

  def _LS_CPU_basic(*args, **kwargs):
      raise NotImplementedError("Only _LS_GPU_stable is implemented for now.")
+
+ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+     """
+     Least Squares (LS) reconstruction via Conjugate Gradient (CG) on the CSR format.
+     Takes the same arguments as the MLEM function, with no Python sub-functions.
+
+     SMatrix: SparseSMatrix_CSR instance (already allocated)
+     y: measured data (1D np.float32 of length TN)
+     """
+     final_result = None
+
+     # Parameters unused by CG but kept for signature compatibility: denominator_threshold, device
+
+     # --- Dot-product logic (inlined) ---
+     def _dot_product_gpu(mod, a_ptr, b_ptr, N_int, stream):
+         block_size = 256
+         grid_size = (N_int + block_size - 1) // block_size
+
+         reduction_host = np.empty(grid_size, dtype=np.float32)
+         reduction_buffer = drv.mem_alloc(reduction_host.nbytes)
+
+         dot_kernel = mod.get_function("dot_product_reduction_kernel")
+
+         dot_kernel(reduction_buffer, a_ptr, b_ptr, np.int32(N_int),
+                    block=(block_size, 1, 1), grid=(grid_size, 1, 1), stream=stream)
+
+         drv.memcpy_dtoh(reduction_host, reduction_buffer)
+         total_dot = np.sum(reduction_host)
+
+         reduction_buffer.free()
+         return total_dot
+     # -----------------------------------------------
+
+     try:
+         if not isinstance(SMatrix, SparseSMatrix_CSR):
+             raise TypeError("SMatrix must be a SparseSMatrix_CSR object")
+
+         if SMatrix.ctx:
+             SMatrix.ctx.push()
+
+         dtype = np.float32
+         TN = SMatrix.N * SMatrix.T
+         ZX = SMatrix.Z * SMatrix.X
+         Z = SMatrix.Z
+         X = SMatrix.X
+         block_size = 256
+         tolerance = 1e-12
+
+         if show_logs:
+             print(f"Executing on GPU device index: {SMatrix.device.primary_context.device.name()}")
+             print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")
+
+         stream = drv.Stream()
+
+         # Fetch the kernels
+         projection_kernel = SMatrix.sparse_mod.get_function('projection_kernel__CSR')
+         backprojection_kernel = SMatrix.sparse_mod.get_function('backprojection_kernel__CSR')
+         axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
+         minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
+
+         # --- Buffer allocation (raw pointers) ---
+         y = y.T.flatten().astype(dtype)
+         y_gpu = drv.mem_alloc(y.nbytes)
+         drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+         theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # lambda
+         drv.memcpy_htod_async(theta_flat_gpu, np.full(ZX, 0.1, dtype=dtype), stream)
+
+         q_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)    # q = A*p
+         r_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)    # r (residual)
+         p_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)    # p (direction)
+         z_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)    # z = A^T A p
+         ATy_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # A^T y (constant)
+
+         # --- CG initialization ---
+
+         # 1. ATy = A^T * y
+         drv.memset_d32_async(ATy_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(ATy_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               y_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 2. q = A * theta_0
+         projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                           theta_flat_gpu, np.int32(TN),
+                           block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 3. r_temp = A^T * q = A^T A theta_0
+         drv.memset_d32_async(r_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(r_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               q_flat_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 4. r_0 = ATy - r_temp (r = ATy + (-1)*r_temp)
+         axpby_kernel(r_flat_gpu, ATy_flat_gpu, r_flat_gpu,
+                      np.float32(1.0), np.float32(-1.0), np.int32(ZX),
+                      block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 5. p_0 = r_0
+         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
+
+         # 6. rho_prev = ||r_0||^2
+         rho_prev = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+         # --- Iteration loop ---
+         saved_theta, saved_indices = [], []
+         if numIterations <= max_saves:
+             save_indices = list(range(numIterations))
+         else:
+             save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+             if save_indices[-1] != numIterations - 1:
+                 save_indices.append(numIterations - 1)
+
+         description = f"AOT-BioMaps -- LS-CG (CSR-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+         for it in iterator:
+             # a. q = A * p
+             projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               p_flat_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # b. z = A^T * q = A^T A p
+             drv.memset_d32_async(z_flat_gpu, 0, ZX, stream)
+             backprojection_kernel(z_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                                   q_flat_gpu, np.int32(TN),
+                                   block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # c. alpha = rho_prev / <p, z>
+             pAp = _dot_product_gpu(SMatrix.sparse_mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+
+             if abs(pAp) < 1e-15: break
+             alpha = rho_prev / pAp
+
+             # d. theta = theta + alpha * p
+             axpby_kernel(theta_flat_gpu, theta_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), alpha, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # e. r = r - alpha * z
+             minus_axpy_kernel(r_flat_gpu, z_flat_gpu, alpha, np.int32(ZX),
+                               block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # f. rho_curr = ||r||^2
+             rho_curr = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+             if rho_curr < tolerance: break
+
+             # g. beta = rho_curr / rho_prev
+             beta = rho_curr / rho_prev
+
+             # h. p = r + beta * p
+             axpby_kernel(p_flat_gpu, r_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), beta, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             rho_prev = rho_curr
+
+             if show_logs and (it % 10 == 0 or it == numIterations - 1):
+                 drv.Context.synchronize()
+
+             if isSavingEachIteration and it in save_indices:
+                 theta_host = np.empty(ZX, dtype=dtype)
+                 drv.memcpy_dtoh(theta_host, theta_flat_gpu)
+                 saved_theta.append(theta_host.reshape(Z, X))
+                 saved_indices.append(it)
+
+         drv.Context.synchronize()
+
+         final_result = np.empty(ZX, dtype=dtype)
+         drv.memcpy_dtoh(final_result, theta_flat_gpu)
+         final_result = final_result.reshape(Z, X)
+
+         # Free buffers
+         y_gpu.free(); q_flat_gpu.free(); r_flat_gpu.free(); p_flat_gpu.free(); z_flat_gpu.free(); theta_flat_gpu.free(); ATy_flat_gpu.free()
+
+         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
+     except Exception as e:
+         print(f"Error in LS_CG_sparseCSR_pycuda: {type(e).__name__}: {e}")
+         gc.collect()
+         return None, None
+
+     finally:
+         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+             SMatrix.ctx.pop()
+
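
Both sparse LS variants run the same conjugate-gradient iteration on the normal equations A^T A theta = A^T y; the CSR and SELL code paths differ only in how the projection (A) and backprojection (A^T) kernels are invoked. As a reference for initialization steps 1-6 and loop steps a-h above, here is a dense NumPy sketch of that iteration (illustration only; the package never forms A densely):

```python
import numpy as np

def cg_normal_equations(A, y, num_iterations=100, tolerance=1e-12):
    """Reference CG on A^T A theta = A^T y, mirroring the GPU loop above."""
    theta = np.full(A.shape[1], 0.1, dtype=np.float32)  # same 0.1 initial image
    r = A.T @ y - A.T @ (A @ theta)      # r_0 = A^T y - A^T A theta_0
    p = r.copy()                         # p_0 = r_0
    rho_prev = float(r @ r)              # ||r_0||^2
    for _ in range(num_iterations):
        z = A.T @ (A @ p)                # z = A^T A p
        pAp = float(p @ z)
        if abs(pAp) < 1e-15:             # breakdown guard
            break
        alpha = rho_prev / pAp
        theta += alpha * p               # theta = theta + alpha * p
        r -= alpha * z                   # r = r - alpha * z
        rho_curr = float(r @ r)
        if rho_curr < tolerance:         # converged
            break
        p = r + (rho_curr / rho_prev) * p  # p = r + beta * p
        rho_prev = rho_curr
    return theta
```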
+ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+     """
+     Least Squares (LS) reconstruction via Conjugate Gradient (CG) on the SELL-C-sigma format.
+     Takes the same arguments as the MLEM function, with no Python sub-functions.
+
+     SMatrix: SparseSMatrix_SELL instance (already allocated)
+     y: measured data (1D np.float32 of length TN)
+     """
+     final_result = None
+
+     # --- Dot-product logic (inlined) ---
+     def _dot_product_gpu(mod, a_ptr, b_ptr, N_int, stream):
+         block_size = 256
+         grid_size = (N_int + block_size - 1) // block_size
+
+         reduction_host = np.empty(grid_size, dtype=np.float32)
+         reduction_buffer = drv.mem_alloc(reduction_host.nbytes)
+
+         dot_kernel = mod.get_function("dot_product_reduction_kernel")
+
+         dot_kernel(reduction_buffer, a_ptr, b_ptr, np.int32(N_int),
+                    block=(block_size, 1, 1), grid=(grid_size, 1, 1), stream=stream)
+
+         drv.memcpy_dtoh(reduction_host, reduction_buffer)
+         total_dot = np.sum(reduction_host)
+
+         reduction_buffer.free()
+         return total_dot
+     # -----------------------------------------------
+
+     try:
+         if not isinstance(SMatrix, SparseSMatrix_SELL):
+             raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
+         if SMatrix.sell_values_gpu is None:
+             raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")
+
+         if SMatrix.ctx:
+             SMatrix.ctx.push()
+
+         dtype = np.float32
+         TN = int(SMatrix.N * SMatrix.T)
+         ZX = int(SMatrix.Z * SMatrix.X)
+         Z = SMatrix.Z
+         X = SMatrix.X
+         block_size = 256
+         tolerance = 1e-12
+
+         # SELL kernels and parameters
+         projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+         backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+         axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
+         minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
+         slice_height = np.int32(SMatrix.slice_height)
+         grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
+
+         stream = drv.Stream()
+
+         # Buffer allocation
+         y = y.T.flatten().astype(dtype)
+         y_gpu = drv.mem_alloc(y.nbytes)
+         drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+         theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         drv.memcpy_htod_async(theta_flat_gpu, np.full(ZX, 0.1, dtype=dtype), stream)
+
+         q_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+         r_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         p_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         z_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         ATy_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+
+         # --- CG initialization ---
+
+         # 1. ATy = A^T * y
+         drv.memset_d32_async(ATy_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               y_gpu, ATy_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 2. q = A * theta_0
+         projection_kernel(q_flat_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                           theta_flat_gpu, np.int32(TN), slice_height,
+                           block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 3. r_temp = A^T * q = A^T A theta_0
+         drv.memset_d32_async(r_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               q_flat_gpu, r_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 4. r_0 = ATy - r_temp
+         axpby_kernel(r_flat_gpu, ATy_flat_gpu, r_flat_gpu,
+                      np.float32(1.0), np.float32(-1.0), np.int32(ZX),
+                      block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 5. p_0 = r_0
+         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
+
+         # 6. rho_prev = ||r_0||^2
+         rho_prev = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+         # --- Iteration loop ---
+         saved_theta, saved_indices = [], []
+         if numIterations <= max_saves:
+             save_indices = list(range(numIterations))
+         else:
+             save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+             if save_indices[-1] != numIterations - 1:
+                 save_indices.append(numIterations - 1)
+
+         description = f"AOT-BioMaps -- LS-CG (SELL-c-σ-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+         for it in iterator:
+             # a. q = A * p
+             projection_kernel(q_flat_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               p_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+             # b. z = A^T * q = A^T A p
+             drv.memset_d32_async(z_flat_gpu, 0, ZX, stream)
+             backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                                   q_flat_gpu, z_flat_gpu, np.int32(TN), slice_height,
+                                   block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+             # c. alpha = rho_prev / <p, z>
+             pAp = _dot_product_gpu(SMatrix.sparse_mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+
+             if abs(pAp) < 1e-15: break
+             alpha = rho_prev / pAp
+
+             # d. theta = theta + alpha * p
+             axpby_kernel(theta_flat_gpu, theta_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), alpha, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # e. r = r - alpha * z
+             minus_axpy_kernel(r_flat_gpu, z_flat_gpu, alpha, np.int32(ZX),
+                               block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # f. rho_curr = ||r||^2
+             rho_curr = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+             if rho_curr < tolerance: break
+
+             # g. beta = rho_curr / rho_prev
+             beta = rho_curr / rho_prev
+
+             # h. p = r + beta * p
+             axpby_kernel(p_flat_gpu, r_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), beta, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             rho_prev = rho_curr
+
+             stream.synchronize()
+             if isSavingEachIteration and it in save_indices:
+                 out = np.empty(ZX, dtype=dtype)
+                 drv.memcpy_dtoh(out, theta_flat_gpu)
+                 saved_theta.append(out.reshape((Z, X)))
+                 saved_indices.append(it)
+
+         # final copy
+         res = np.empty(ZX, dtype=np.float32)
+         drv.memcpy_dtoh(res, theta_flat_gpu)
+         final_result = res.reshape((Z, X))
+
+         # free temporaries
+         y_gpu.free(); q_flat_gpu.free(); r_flat_gpu.free(); p_flat_gpu.free(); z_flat_gpu.free(); theta_flat_gpu.free(); ATy_flat_gpu.free()
+
+         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
+     except Exception as e:
+         print(f"Error in LS_CG_sparseSELL_pycuda: {type(e).__name__}: {e}")
+         gc.collect()
+         return None, None
+
+     finally:
+         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+             SMatrix.ctx.pop()
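
Both CG loops (and the MLEM variants below) share the same snapshot bookkeeping: at most roughly `max_saves` intermediate images are kept, by striding over iterations and always retaining the last one. Extracted as a standalone sketch for clarity:

```python
def plan_save_indices(num_iterations, max_saves):
    # Mirrors the save_indices logic above: keep every iterate when it fits,
    # otherwise subsample with a fixed stride and force-include the final one.
    if num_iterations <= max_saves:
        return list(range(num_iterations))
    indices = list(range(0, num_iterations, max(1, num_iterations // max_saves)))
    if indices[-1] != num_iterations - 1:
        indices.append(num_iterations - 1)
    return indices
```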
@@ -26,7 +26,6 @@ def MLEM(
      max_saves=5000,
      show_logs=True,
      smatrixType=SMatrixType.SELL,
-     Z=350,
  ):
      """
      Unified MLEM algorithm for Acousto-Optic Tomography.
@@ -59,7 +58,7 @@ def MLEM(
      # Dispatch to the appropriate implementation
      if use_gpu:
          if smatrixType == SMatrixType.CSR:
-             return MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, Z, show_logs)
+             return MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
          elif smatrixType == SMatrixType.SELL:
              return MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
          elif smatrixType == SMatrixType.DENSE:
@@ -229,7 +228,7 @@ def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
          print(f"Error in optimized CPU MLEM: {type(e).__name__}: {e}")
          return None, None

- def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs=True):
+ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
      """
      SMatrix: instance of SparseMatrixGPU (already allocated)
      y: measured data (1D np.float32 of length TN)
@@ -237,25 +236,39 @@ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumo
      Assumptions:
      - SMatrix.values_gpu and SMatrix.col_ind_gpu and SMatrix.row_ptr_gpu are device pointers
      - SMatrix.norm_factor_inv_gpu exists
+     - SMatrix.ctx is the PyCUDA context for the target GPU.
      """
+
+     # We use a final_result placeholder to ensure it's defined outside the try block
+     final_result = None
+
      try:
          if not isinstance(SMatrix, SparseSMatrix_CSR):
              raise TypeError("SMatrix must be a SparseSMatrix_CSR object")
+
+         # --- CONTEXT FIX: Push the context associated with SMatrix ---
+         # This ensures all subsequent PyCUDA operations use the correct GPU/context.
+         if SMatrix.ctx:
+             SMatrix.ctx.push()
+         # -----------------------------------------------------------
+
          dtype = np.float32
         TN = SMatrix.N * SMatrix.T
          ZX = SMatrix.Z * SMatrix.X
-         if Z is None:
-             Z = SMatrix.Z
+         # Ensure Z and X are correctly defined for reshaping
+         Z = SMatrix.Z
          X = SMatrix.X

          if show_logs:
+             # We assume SMatrix was initialized using the correct device index.
+             print(f"Executing on GPU device index: {SMatrix.device.primary_context.device.name()}")
              print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")

-         # Use existing context from SMatrix
          # streams
          stream = drv.Stream()

          # allocate device buffers
+         y = y.T.flatten().astype(np.float32)
          y_gpu = drv.mem_alloc(y.nbytes)
          drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)

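The "CONTEXT FIX" blocks introduced in this hunk pair every `ctx.push()` with a `ctx.pop()` in a `finally` clause (added in the hunks further down), so an exception cannot leave the PyCUDA context stack unbalanced. Reduced to a hypothetical helper for illustration, assuming `SMatrix.ctx` is a PyCUDA context object as the docstring states:

```python
def run_in_smatrix_context(SMatrix, gpu_work):
    # gpu_work: a zero-argument callable performing the PyCUDA operations.
    try:
        if SMatrix.ctx:
            SMatrix.ctx.push()   # make the context owning SMatrix's buffers current
        return gpu_work()
    finally:
        if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
            SMatrix.ctx.pop()    # rebalance the context stack, even on error
```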
@@ -269,12 +282,11 @@ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumo
          e_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
          c_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)

-         mlem_mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
-         projection_kernel = mlem_mod.get_function('projection_kernel__CSR')
-         backprojection_kernel = mlem_mod.get_function('backprojection_kernel__CSR')
-         ratio_kernel = mlem_mod.get_function('ratio_kernel')
-         update_kernel = mlem_mod.get_function('update_theta_kernel')
-
+         # Assuming the cubin file is found globally or managed by the caller
+         projection_kernel = SMatrix.sparse_mod.get_function('projection_kernel__CSR')
+         backprojection_kernel = SMatrix.sparse_mod.get_function('backprojection_kernel__CSR')
+         ratio_kernel = SMatrix.sparse_mod.get_function('ratio_kernel')
+         update_kernel = SMatrix.sparse_mod.get_function('update_theta_kernel')
          block_size = 256

          saved_theta, saved_indices = [], []
@@ -296,7 +308,7 @@ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumo

              # ratio: e = y / max(q, threshold)
              ratio_kernel(e_flat_gpu, y_gpu, q_flat_gpu, np.float32(denominator_threshold), np.int32(TN),
-                 block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+                          block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)

              # backprojection: c = A^T * e
              drv.memset_d32_async(c_flat_gpu, 0, ZX, stream)
@@ -319,45 +331,60 @@ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumo

          drv.Context.synchronize()

-         result = np.empty(ZX, dtype=dtype)
-         drv.memcpy_dtoh(result, theta_flat_gpu)
-         result = result.reshape(Z, X)
+         final_result = np.empty(ZX, dtype=dtype)
+         drv.memcpy_dtoh(final_result, theta_flat_gpu)
+         final_result = final_result.reshape(Z, X)

          # free local allocations
          y_gpu.free(); q_flat_gpu.free(); e_flat_gpu.free(); c_flat_gpu.free(); theta_flat_gpu.free()

-         return (saved_theta, saved_indices) if isSavingEachIteration else (result, None)
+         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)

      except Exception as e:
          print(f"Error in MLEM_sparseCSR_pycuda: {type(e).__name__}: {e}")
          gc.collect()
          return None, None
+
+     finally:
+         # --- CONTEXT FIX: Pop the context ---
+         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+             SMatrix.ctx.pop()
+         # ------------------------------------

- def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs=True):
+ def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
      """
      MLEM using SELL-C-σ kernels already present on device.
      y must be float32 length TN.
      """
+     final_result = None
+
      try:
          # check if SMatrix is SparseSMatrix_SELL object
          if not isinstance(SMatrix, SparseSMatrix_SELL):
              raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
          if SMatrix.sell_values_gpu is None:
              raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")
+
+         # --- CONTEXT FIX: Push the context associated with SMatrix ---
+         # This ensures all subsequent PyCUDA operations use the correct GPU/context.
+         if SMatrix.ctx:
+             SMatrix.ctx.push()
+         # -----------------------------------------------------------
+
          TN = int(SMatrix.N * SMatrix.T)
          ZX = int(SMatrix.Z * SMatrix.X)
          dtype = np.float32
          block_size = 256

-         mod = SMatrix.sparse_mod
-         proj = mod.get_function("projection_kernel__SELL")
-         backproj = mod.get_function("backprojection_kernel__SELL")
-         ratio = mod.get_function("ratio_kernel")
-         update = mod.get_function("update_theta_kernel")
+         proj = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+         backproj = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+         ratio = SMatrix.sparse_mod.get_function("ratio_kernel")
+         update = SMatrix.sparse_mod.get_function("update_theta_kernel")

          stream = drv.Stream()

          # device buffers
+         y = y.T.flatten().astype(np.float32)
          y_gpu = drv.mem_alloc(y.nbytes)
          drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)

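Read together, the kernel sequence in these MLEM loops (projection, ratio with a clamped denominator, backprojection, multiplicative update against the precomputed `norm_factor_inv_gpu`) is the classic MLEM update theta <- theta * (A^T(y / max(A theta, eps))) / (A^T 1). A dense NumPy sketch under that reading, for reference only (the released code keeps A in CSR/SELL form on the GPU):

```python
import numpy as np

def mlem_reference(A, y, num_iterations=100, denominator_threshold=1e-6):
    theta = np.full(A.shape[1], 0.1, dtype=np.float32)
    norm_inv = 1.0 / np.maximum(A.sum(axis=0), denominator_threshold)  # 1 / A^T 1
    for _ in range(num_iterations):
        q = A @ theta                                  # projection: q = A theta
        e = y / np.maximum(q, denominator_threshold)   # ratio with clamped denominator
        c = A.T @ e                                    # backprojection: c = A^T e
        theta *= c * norm_inv                          # multiplicative update
    return theta
```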
@@ -420,9 +447,17 @@ def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tum

          # free temporaries
          y_gpu.free(); q_gpu.free(); e_gpu.free(); c_gpu.free(); theta_gpu.free()
-         return (saved_theta, saved_indices) if isSavingEachIteration else (res.reshape((SMatrix.Z, SMatrix.X)), None)
+
+         final_result = res.reshape((SMatrix.Z, SMatrix.X))
+         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
      except Exception as e:
          print(f"Error in MLEM_sparseSELL_pycuda: {type(e).__name__}: {e}")
          gc.collect()
          return None, None
-
+
+     finally:
+         # --- CONTEXT FIX: Pop the context ---
+         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+             SMatrix.ctx.pop()
+         # ------------------------------------