AOT-biomaps 2.9.279__py3-none-any.whl → 2.9.300__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of AOT-biomaps might be problematic.

@@ -23,8 +23,7 @@ def LS(
     denominator_threshold=1e-6,
     max_saves=5000,
     show_logs=True,
-    smatrixType=SMatrixType.SELL,
-    Z=350,
+    smatrixType=SMatrixType.SELL
 ):
     """
     Least Squares reconstruction using Projected Gradient Descent (PGD) with non-negativity constraint.
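For context on the docstring above: least squares by projected gradient descent alternates a gradient step on 0.5 * ||A @ theta - y||^2 with clipping at zero. Below is a minimal NumPy sketch of that update on synthetic data; nothing in it comes from the package, and the GPU paths in this diff actually run conjugate-gradient kernels instead.

    import numpy as np

    rng = np.random.default_rng(0)
    A = rng.random((64, 32)).astype(np.float32)       # stand-in for the system matrix
    y = rng.random(64).astype(np.float32)             # stand-in for the measured data
    theta = np.zeros(32, dtype=np.float32)

    step = 1.0 / np.linalg.norm(A, 2) ** 2            # step <= 1 / ||A||^2 keeps PGD stable
    for _ in range(500):
        grad = A.T @ (A @ theta - y)                  # gradient of 0.5 * ||A @ theta - y||^2
        theta = np.maximum(theta - step * grad, 0.0)  # project onto the non-negative orthant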
@@ -44,7 +43,7 @@ def LS(
     # Dispatch to the appropriate implementation
     if use_gpu:
         if smatrixType == SMatrixType.CSR:
-            return _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, Z, show_logs)
+            return _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.SELL:
             return _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.DENSE:
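The caller-visible effect of this hunk and the previous one: `Z` is removed from the `LS` signature and is no longer forwarded to the CSR path; the solvers read the grid size from `SMatrix.Z` instead. A hypothetical call, with the leading parameter names inferred from the dispatch lines above and values purely illustrative:

    theta, saved = LS(
        SMatrix, y,                    # SMatrix now carries Z and X itself
        numIterations=200,
        isSavingEachIteration=False,
        smatrixType=SMatrixType.SELL,  # Z=350 is no longer accepted here
    )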
@@ -181,13 +180,12 @@ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tu
         print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")
 
         stream = drv.Stream()
-        mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
 
         # Retrieve the kernels
-        projection_kernel = mod.get_function('projection_kernel__CSR')
-        backprojection_kernel = mod.get_function('backprojection_kernel__CSR')
-        axpby_kernel = mod.get_function("vector_axpby_kernel")
-        minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
+        projection_kernel = SMatrix.sparse_mod.get_function('projection_kernel__CSR')
+        backprojection_kernel = SMatrix.sparse_mod.get_function('backprojection_kernel__CSR')
+        axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
+        minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
 
         # --- Buffer allocation (raw pointers) ---
         y = y.T.flatten().astype(dtype)
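The switch from `drv.module_from_file(...)` to `SMatrix.sparse_mod` means the compiled kernel module is loaded once and reused across solver calls, inside the matrix's own CUDA context, instead of being re-read from `AOT_biomaps_kernels.cubin` on every reconstruction. A sketch of the assumed pattern — the attribute name `sparse_mod` comes from the diff, but the lazy-loading details here are illustrative, not the package's actual code:

    import pycuda.driver as drv

    class SparseSMatrixSketch:
        def __init__(self, ctx):
            self.ctx = ctx           # PyCUDA context the buffers live in
            self._mod = None

        @property
        def sparse_mod(self):
            # Load the cubin once, inside the owning context, then cache it.
            if self._mod is None:
                self.ctx.push()
                try:
                    self._mod = drv.module_from_file("AOT_biomaps_kernels.cubin")
                finally:
                    self.ctx.pop()
            return self._mod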
@@ -231,7 +229,7 @@ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tu
         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
 
         # 6. rho_prev = ||r_0||^2
-        rho_prev = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+        rho_prev = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
 
         # --- Iterative loop ---
         saved_theta, saved_indices = [], []
@@ -258,7 +256,7 @@ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tu
                 block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
 
             # c. alpha = rho_prev / <p, z>
-            pAp = _dot_product_gpu(mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+            pAp = _dot_product_gpu(SMatrix.sparse_mod, p_flat_gpu, z_flat_gpu, ZX, stream)
 
             if abs(pAp) < 1e-15: break
             alpha = rho_prev / pAp
@@ -273,7 +271,7 @@ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tu
                 block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
 
             # f. rho_curr = ||r||^2
-            rho_curr = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+            rho_curr = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
 
             if rho_curr < tolerance: break
 
@@ -364,11 +362,10 @@ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, t
         tolerance = 1e-12
 
         # Access the SELL parameters
-        mod = SMatrix.sparse_mod
-        projection_kernel = mod.get_function("projection_kernel__SELL")
-        backprojection_kernel = mod.get_function("backprojection_kernel__SELL")
-        axpby_kernel = mod.get_function("vector_axpby_kernel")
-        minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
+        projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+        backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+        axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
+        minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
         slice_height = np.int32(SMatrix.slice_height)
         grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
 
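For readers unfamiliar with the storage format named here: SELL-C-σ packs rows into slices of `slice_height` rows, pads each slice to its longest row, and stores values column-major within a slice so that consecutive GPU threads read contiguous memory. A NumPy reference of the matrix-vector product, under the assumption that `slice_ptr` holds each slice's element offset and `slice_len` its padded width, with the σ row-sorting step omitted:

    import numpy as np

    def sell_spmv(values, col_inds, slice_ptr, slice_len, x, n_rows, C):
        """out = A @ x for a SELL-C matrix; padding entries must hold value 0."""
        out = np.zeros(n_rows, dtype=values.dtype)
        for s in range((n_rows + C - 1) // C):
            base, width = slice_ptr[s], slice_len[s]
            for r in range(min(C, n_rows - s * C)):       # row inside the slice
                acc = 0.0
                for j in range(width):                    # padded row width of this slice
                    idx = base + j * C + r                # column-major within the slice
                    acc += values[idx] * x[col_inds[idx]]
                out[s * C + r] = acc
        return out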
@@ -416,7 +413,7 @@ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, t
         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
 
         # 6. rho_prev = ||r_0||^2
-        rho_prev = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+        rho_prev = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
 
         # --- Iterative loop ---
         saved_theta, saved_indices = [], []
@@ -443,7 +440,7 @@ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, t
                 block=(block_size, 1, 1), grid=grid_rows, stream=stream)
 
             # c. alpha = rho_prev / <p, z>
-            pAp = _dot_product_gpu(mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+            pAp = _dot_product_gpu(SMatrix.sparse_mod, p_flat_gpu, z_flat_gpu, ZX, stream)
 
             if abs(pAp) < 1e-15: break
             alpha = rho_prev / pAp
@@ -458,7 +455,7 @@ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, t
                 block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
 
             # f. rho_curr = ||r||^2
-            rho_curr = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+            rho_curr = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
 
             if rho_curr < tolerance: break
 
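The steps labelled a-f in these hunks are one iteration of conjugate gradient on the normal equations (A^T A) theta = A^T y, which is what makes this a least-squares solver: z = A^T A p is computed as a projection followed by a backprojection, and rho/alpha/beta follow the standard recurrence. A compact NumPy reference of the same loop, with the 1e-15 and 1e-12 thresholds as in the diff (the dense A here is only a stand-in):

    import numpy as np

    def cg_normal_equations(A, y, num_iters=100, tol=1e-12):
        theta = np.zeros(A.shape[1], dtype=A.dtype)
        r = A.T @ y - A.T @ (A @ theta)   # residual of the normal equations
        p = r.copy()
        rho_prev = float(r @ r)
        for _ in range(num_iters):
            z = A.T @ (A @ p)             # projection then backprojection: z = A^T A p
            pAp = float(p @ z)
            if abs(pAp) < 1e-15:
                break
            alpha = rho_prev / pAp        # c. alpha = rho_prev / <p, z>
            theta += alpha * p
            r -= alpha * z
            rho_curr = float(r @ r)       # f. rho_curr = ||r||^2
            if rho_curr < tol:
                break
            p = r + (rho_curr / rho_prev) * p
            rho_prev = rho_curr
        return theta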
@@ -26,7 +26,6 @@ def MLEM(
     max_saves=5000,
     show_logs=True,
     smatrixType=SMatrixType.SELL,
-    Z=350,
 ):
     """
     Unified MLEM algorithm for Acousto-Optic Tomography.
@@ -59,11 +58,11 @@ def MLEM(
     # Dispatch to the appropriate implementation
     if use_gpu:
         if smatrixType == SMatrixType.CSR:
-            return MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, Z, show_logs)
+            return MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.SELL:
-            return MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+            return MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.DENSE:
-            return _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+            return _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
         else:
             raise ValueError("Unsupported SMatrixType for GPU MLEM.")
     else:
@@ -229,49 +228,60 @@ def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
         print(f"Error in optimized CPU MLEM: {type(e).__name__}: {e}")
         return None, None
 
-def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+def MLEM_sparseCSR_pycuda(
+    SMatrix,
+    y,
+    numIterations,
+    isSavingEachIteration,
+    tumor_str,
+    max_saves,
+    denominator_threshold,
+    show_logs=True,
+):
     """
-    SMatrix: instance of SparseMatrixGPU (already allocated)
-    y: measured data (1D np.float32 of length TN)
-
-    Assumptions:
-    - SMatrix.values_gpu and SMatrix.col_ind_gpu and SMatrix.row_ptr_gpu are device pointers
-    - SMatrix.norm_factor_inv_gpu exists
-    - SMatrix.ctx is the PyCUDA context for the target GPU.
+    Robust MLEM implementation for CSR SMatrix using PyCUDA kernels.
+    Expects SMatrix to be SparseSMatrix_CSR with attributes:
+    - values_gpu, col_ind_gpu, row_ptr_gpu (device pointers)
+    - norm_factor_inv_gpu (device pointer)
+    - sparse_mod (loaded module with kernels)
+    - ctx (PyCUDA context)
+    Returns (saved_theta_list, saved_indices) if isSavingEachIteration else (final_theta, None)
     """
-
-    # We use a final_result placeholder to ensure it's defined outside the try block
     final_result = None
-
+
+    # Local holders to free in finally
+    y_gpu = q_flat_gpu = e_flat_gpu = c_flat_gpu = theta_flat_gpu = None
+
     try:
         if not isinstance(SMatrix, SparseSMatrix_CSR):
             raise TypeError("SMatrix must be a SparseSMatrix_CSR object")
 
-        # --- CONTEXT FIX: Push the context associated with SMatrix ---
-        # This ensures all subsequent PyCUDA operations use the correct GPU/context.
-        if SMatrix.ctx:
+        # push context (if provided)
+        popped_ctx = False
+        if getattr(SMatrix, "ctx", None):
             SMatrix.ctx.push()
-        # -----------------------------------------------------------
+            popped_ctx = True
 
         dtype = np.float32
-        TN = SMatrix.N * SMatrix.T
-        ZX = SMatrix.Z * SMatrix.X
-        # Ensure Z and X are correctly defined for reshaping
-        Z = SMatrix.Z
-        X = SMatrix.X
-
-        if show_logs:
-            # We assume SMatrix was initialized using the correct device index.
-            print(f"Executing on GPU device index: {SMatrix.device.primary_context.device.name()}")
-            print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")
-
-        # streams
+        TN = int(SMatrix.N * SMatrix.T)
+        ZX = int(SMatrix.Z * SMatrix.X)
+        Z = int(SMatrix.Z)
+        X = int(SMatrix.X)
+
+        # Make sure required GPU pointers exist
+        if getattr(SMatrix, "values_gpu", None) is None or getattr(SMatrix, "col_ind_gpu", None) is None or getattr(SMatrix, "row_ptr_gpu", None) is None:
+            raise RuntimeError("SMatrix is missing GPU buffers (values_gpu / col_ind_gpu / row_ptr_gpu)")
+
+        if getattr(SMatrix, "norm_factor_inv_gpu", None) is None:
+            raise RuntimeError("SMatrix.norm_factor_inv_gpu not available on GPU")
+
+        # stream for async operations
         stream = drv.Stream()
 
-        # allocate device buffers
-        y = y.T.flatten().astype(np.float32)
-        y_gpu = drv.mem_alloc(y.nbytes)
-        drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+        # prepare device buffers
+        y_arr = np.ascontiguousarray(y.T.flatten().astype(np.float32))
+        y_gpu = drv.mem_alloc(y_arr.nbytes)
+        drv.memcpy_htod_async(y_gpu, y_arr, stream)
 
         theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
         initial_theta = np.full(ZX, 0.1, dtype=dtype)
@@ -283,62 +293,111 @@ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumo
         e_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
         c_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
 
-        # Assuming the cubin file is found globally or managed by the caller
-        mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
-        projection_kernel = mod.get_function('projection_kernel__CSR')
-        backprojection_kernel = mod.get_function('backprojection_kernel__CSR')
-        ratio_kernel = mod.get_function('ratio_kernel')
-        update_kernel = mod.get_function('update_theta_kernel')
+        # Ensure kernels exist
+        projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__CSR")
+        backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__CSR")
+        ratio_kernel = SMatrix.sparse_mod.get_function("ratio_kernel")
+        update_kernel = SMatrix.sparse_mod.get_function("update_theta_kernel")
         block_size = 256
 
-        saved_theta, saved_indices = [], []
+        # prepare save indices once
        if numIterations <= max_saves:
             save_indices = list(range(numIterations))
         else:
-            save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+            step = max(1, numIterations // max_saves)
+            save_indices = list(range(0, numIterations, step))
             if save_indices[-1] != numIterations - 1:
                 save_indices.append(numIterations - 1)
 
+        saved_theta = []
+        saved_indices = []
+
         description = f"AOT-BioMaps -- ML-EM (CSR-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+        # grid sizes
+        grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
+        grid_cols = ((ZX + block_size - 1) // block_size, 1, 1)
+
         for it in iterator:
             # projection: q = A * theta
-            projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
-                              theta_flat_gpu, np.int32(TN),
-                              block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1),
-                              stream=stream)
+            projection_kernel(
+                q_flat_gpu,
+                SMatrix.values_gpu,
+                SMatrix.row_ptr_gpu,
+                SMatrix.col_ind_gpu,
+                theta_flat_gpu,
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )
 
             # ratio: e = y / max(q, threshold)
-            ratio_kernel(e_flat_gpu, y_gpu, q_flat_gpu, np.float32(denominator_threshold), np.int32(TN),
-                         block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
-
-            # backprojection: c = A^T * e
+            ratio_kernel(
+                e_flat_gpu,
+                y_gpu,
+                q_flat_gpu,
+                np.float32(denominator_threshold),
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )
+
+            # backprojection: c = A^T * e (zero c first)
             drv.memset_d32_async(c_flat_gpu, 0, ZX, stream)
-            backprojection_kernel(c_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
-                                  e_flat_gpu, np.int32(TN),
-                                  block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+            backprojection_kernel(
+                c_flat_gpu,
+                SMatrix.values_gpu,
+                SMatrix.row_ptr_gpu,
+                SMatrix.col_ind_gpu,
+                e_flat_gpu,
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )
 
             # update: theta *= norm_factor_inv * c
-            update_kernel(theta_flat_gpu, c_flat_gpu, norm_factor_inv_gpu, np.int32(ZX),
-                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
-
+            update_kernel(
+                theta_flat_gpu,
+                c_flat_gpu,
+                norm_factor_inv_gpu,
+                np.int32(ZX),
+                block=(block_size, 1, 1),
+                grid=grid_cols,
+                stream=stream,
+            )
+
+            # periodic synchronization for stability / logging
             if show_logs and (it % 10 == 0 or it == numIterations - 1):
-                drv.Context.synchronize()
+                stream.synchronize()
 
+            # save snapshot if required
             if isSavingEachIteration and it in save_indices:
+                # ensure kernels finished
+                stream.synchronize()
                 theta_host = np.empty(ZX, dtype=dtype)
                 drv.memcpy_dtoh(theta_host, theta_flat_gpu)
                 saved_theta.append(theta_host.reshape(Z, X))
-                saved_indices.append(it)
-
-        drv.Context.synchronize()
-
-        final_result = np.empty(ZX, dtype=dtype)
-        drv.memcpy_dtoh(final_result, theta_flat_gpu)
-        final_result = final_result.reshape(Z, X)
-
-        # free local allocations
-        y_gpu.free(); q_flat_gpu.free(); e_flat_gpu.free(); c_flat_gpu.free(); theta_flat_gpu.free()
+                saved_indices.append(int(it))
+
+        # make sure everything finished
+        stream.synchronize()
+        final_theta_host = np.empty(ZX, dtype=dtype)
+        drv.memcpy_dtoh(final_theta_host, theta_flat_gpu)
+        final_result = final_theta_host.reshape(Z, X)
+
+        # free local allocations (will also be freed in finally if exception)
+        try:
+            y_gpu.free()
+            q_flat_gpu.free()
+            e_flat_gpu.free()
+            c_flat_gpu.free()
+            theta_flat_gpu.free()
+        except Exception:
+            pass
 
         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
 
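The loop body above is the standard MLEM multiplicative update, theta <- theta * (A^T (y / max(A theta, threshold))) / (A^T 1), with the sensitivity term 1 / (A^T 1) precomputed as norm_factor_inv. A dense NumPy reference of the same iteration on a synthetic matrix — only the variable names mirror the diff:

    import numpy as np

    rng = np.random.default_rng(0)
    A = rng.random((64, 32)).astype(np.float32)                # stand-in system matrix (TN x ZX)
    y = rng.random(64).astype(np.float32)                      # measured data
    theta = np.full(32, 0.1, dtype=np.float32)                 # same 0.1 init as the GPU code
    denominator_threshold = 1e-6
    norm_factor_inv = 1.0 / np.maximum(A.sum(axis=0), 1e-12)   # 1 / (A^T 1)

    for _ in range(100):
        q = A @ theta                                          # projection
        e = y / np.maximum(q, denominator_threshold)           # guarded ratio
        c = A.T @ e                                            # backprojection
        theta *= norm_factor_inv * c                           # multiplicative update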
@@ -346,47 +405,64 @@ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumo
         print(f"Error in MLEM_sparseCSR_pycuda: {type(e).__name__}: {e}")
         gc.collect()
         return None, None
-
-    finally:
-        # --- CONTEXT FIX: Pop the context ---
-        if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
-            SMatrix.ctx.pop()
-        # ------------------------------------
 
-def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+    finally:
+        # free buffers if still allocated
+        for buf in ("y_gpu", "q_flat_gpu", "e_flat_gpu", "c_flat_gpu", "theta_flat_gpu"):
+            try:
+                val = locals().get(buf, None)
+                if val is not None:
+                    val.free()
+            except Exception:
+                pass
+        # pop context safely
+        try:
+            if SMatrix and hasattr(SMatrix, "ctx") and SMatrix.ctx and popped_ctx:
+                SMatrix.ctx.pop()
+        except Exception:
+            pass
+
+def MLEM_sparseSELL_pycuda(
+    SMatrix,
+    y,
+    numIterations,
+    isSavingEachIteration,
+    tumor_str,
+    max_saves,
+    denominator_threshold,
+    show_logs=True,
+):
     """
     MLEM using SELL-C-σ kernels already present on device.
     y must be float32 length TN.
+
+    Clean version: diagnostics removed.
     """
     final_result = None
 
     try:
-        # check if SMatrix is SparseSMatrix_SELL object
         if not isinstance(SMatrix, SparseSMatrix_SELL):
             raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
         if SMatrix.sell_values_gpu is None:
             raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")
-
-        # --- CONTEXT FIX: Push the context associated with SMatrix ---
-        # This ensures all subsequent PyCUDA operations use the correct GPU/context.
+
+        # Context
         if SMatrix.ctx:
             SMatrix.ctx.push()
-        # -----------------------------------------------------------
 
         TN = int(SMatrix.N * SMatrix.T)
         ZX = int(SMatrix.Z * SMatrix.X)
         dtype = np.float32
         block_size = 256
 
-        mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
-        proj = mod.get_function("projection_kernel__SELL")
-        backproj = mod.get_function("backprojection_kernel__SELL")
-        ratio = mod.get_function("ratio_kernel")
-        update = mod.get_function("update_theta_kernel")
+        proj = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+        backproj = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+        ratio = SMatrix.sparse_mod.get_function("ratio_kernel")
+        update = SMatrix.sparse_mod.get_function("update_theta_kernel")
 
         stream = drv.Stream()
 
-        # device buffers
+        # Device buffers
         y = y.T.flatten().astype(np.float32)
         y_gpu = drv.mem_alloc(y.nbytes)
         drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
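The `popped_ctx` flag introduced in the CSR path, together with the guarded pop in both `finally` blocks, implements the usual PyCUDA rule: pop only a context you actually pushed, even if an exception fires in between. A minimal sketch of the same discipline as a hypothetical helper, not part of the package:

    from contextlib import contextmanager

    @contextmanager
    def pushed(ctx):
        pushed_here = False
        try:
            if ctx is not None:
                ctx.push()          # enter the GPU context that owns the buffers
                pushed_here = True
            yield
        finally:
            if pushed_here:
                ctx.pop()           # pop only what this block pushed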
@@ -405,6 +481,7 @@ def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tum
         grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
         grid_cols = ((ZX + block_size - 1) // block_size, 1, 1)
 
+        # Prepare save indices
         saved_theta, saved_indices = [], []
         if numIterations <= max_saves:
             save_indices = list(range(numIterations))
@@ -415,52 +492,59 @@ def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tum
 
         description = f"AOT-BioMaps -- ML-EM (SELL-c-σ-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+        # --- MLEM Loop ---
         for it in iterator:
-            # projection
-            proj(q_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, slice_ptr_gpu, slice_len_gpu,
-                 theta_gpu, np.int32(TN), slice_height,
-                 block=(block_size,1,1), grid=grid_rows, stream=stream)
 
-            # ratio
+            proj(q_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu,
+                 slice_ptr_gpu, slice_len_gpu,
+                 theta_gpu, np.int32(TN), slice_height,
+                 block=(block_size,1,1), grid=grid_rows, stream=stream)
+
             ratio(e_gpu, y_gpu, q_gpu, np.float32(denominator_threshold), np.int32(TN),
-                    block=(block_size,1,1), grid=grid_rows, stream=stream)
+                  block=(block_size,1,1), grid=grid_rows, stream=stream)
 
-            # zero c
             drv.memset_d32_async(c_gpu, 0, ZX, stream)
 
-            # backprojection accumulate
-            backproj(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, slice_ptr_gpu, slice_len_gpu,
-                     e_gpu, c_gpu, np.int32(TN), slice_height,
-                     block=(block_size,1,1), grid=grid_rows, stream=stream)
+            backproj(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu,
+                     slice_ptr_gpu, slice_len_gpu,
+                     e_gpu, c_gpu, np.int32(TN), slice_height,
+                     block=(block_size,1,1), grid=grid_rows, stream=stream)
 
-            # update
             update(theta_gpu, c_gpu, SMatrix.norm_factor_inv_gpu, np.int32(ZX),
-                     block=(block_size,1,1), grid=grid_cols, stream=stream)
+                   block=(block_size,1,1), grid=grid_cols, stream=stream)
 
-            stream.synchronize()
             if isSavingEachIteration and it in save_indices:
                 out = np.empty(ZX, dtype=np.float32)
                 drv.memcpy_dtoh(out, theta_gpu)
                 saved_theta.append(out.reshape((SMatrix.Z, SMatrix.X)))
                 saved_indices.append(it)
 
-        # final copy
+        stream.synchronize()
         res = np.empty(ZX, dtype=np.float32)
         drv.memcpy_dtoh(res, theta_gpu)
 
-        # free temporaries
-        y_gpu.free(); q_gpu.free(); e_gpu.free(); c_gpu.free(); theta_gpu.free()
-
+        # free
+        try:
+            y_gpu.free()
+            q_gpu.free()
+            e_gpu.free()
+            c_gpu.free()
+            theta_gpu.free()
+        except Exception:
+            pass
+
         final_result = res.reshape((SMatrix.Z, SMatrix.X))
         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
-
+
     except Exception as e:
         print(f"Error in MLEM_sparseSELL_pycuda: {type(e).__name__}: {e}")
         gc.collect()
         return None, None
-
+
     finally:
-        # --- CONTEXT FIX: Pop the context ---
         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
-            SMatrix.ctx.pop()
-        # ------------------------------------
+            try:
+                SMatrix.ctx.pop()
+            except Exception:
+                pass