AOT-biomaps 2.9.291.tar.gz → 2.9.312.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of AOT-biomaps might be problematic.
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +2 -3
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +189 -103
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +2 -2
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +8 -15
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +79 -47
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AlgebraicRecon.py +2 -8
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/__init__.py +22 -1
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps.egg-info/PKG-INFO +1 -1
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/PKG-INFO +1 -1
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/setup.py +22 -1
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Acoustic/AcousticEnums.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Acoustic/AcousticTools.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Acoustic/FocusedWave.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Acoustic/IrregularWave.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Acoustic/PlaneWave.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Acoustic/StructuredWave.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Acoustic/__init__.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Experiment/Focus.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Experiment/Tomography.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Experiment/__init__.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Experiment/_mainExperiment.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Optic/Absorber.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Optic/Laser.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Optic/OpticEnums.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Optic/__init__.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Optic/_mainOptic.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_Optimizers/__init__.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/Huber.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/Quadratic.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/__init__.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/AnalyticRecon.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/BayesianRecon.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/DeepLearningRecon.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/PrimalDualRecon.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/ReconEnums.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/ReconTools.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/__init__.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/AOT_Recon/_mainRecon.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/Config.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps/Settings.py +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps.egg-info/SOURCES.txt +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps.egg-info/dependency_links.txt +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps.egg-info/requires.txt +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/AOT_biomaps.egg-info/top_level.txt +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/README.md +0 -0
- {aot_biomaps-2.9.291 → aot_biomaps-2.9.312}/setup.cfg +0 -0
AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py

@@ -23,8 +23,7 @@ def LS(
     denominator_threshold=1e-6,
     max_saves=5000,
     show_logs=True,
-    smatrixType=SMatrixType.SELL
-    Z=350,
+    smatrixType=SMatrixType.SELL
 ):
     """
     Least Squares reconstruction using Projected Gradient Descent (PGD) with non-negativity constraint.

@@ -44,7 +43,7 @@ def LS(
     # Dispatch to the appropriate implementation
     if use_gpu:
         if smatrixType == SMatrixType.CSR:
-            return _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold,
+            return _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.SELL:
            return _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.DENSE:
AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py

@@ -26,7 +26,6 @@ def MLEM(
     max_saves=5000,
     show_logs=True,
     smatrixType=SMatrixType.SELL,
-    Z=350,
 ):
     """
     Unified MLEM algorithm for Acousto-Optic Tomography.

@@ -59,11 +58,11 @@ def MLEM(
     # Dispatch to the appropriate implementation
     if use_gpu:
         if smatrixType == SMatrixType.CSR:
-            return MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
+            return MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.SELL:
-            return MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
+            return MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.DENSE:
-            return _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
+            return _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
         else:
             raise ValueError("Unsupported SMatrixType for GPU MLEM.")
     else:
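For orientation, every branch of the dispatcher above implements the same multiplicative MLEM update, theta ← theta · (Aᵀ(y / max(A·theta, eps))) / (Aᵀ·1); only the storage format of A (dense, CSR, SELL-C-σ) and the execution backend differ. A minimal dense NumPy sketch of that iteration follows; it is not taken from the package, and the shapes, names and values are purely illustrative:

# Reference MLEM iteration (dense NumPy sketch, illustrative only -- not package code).
import numpy as np

rng = np.random.default_rng(0)
A = rng.random((500, 64)).astype(np.float32)            # system matrix, shape (TN, ZX)
theta_true = rng.random(64).astype(np.float32)
y = A @ theta_true                                       # noiseless measurements

eps = 1e-6                                               # plays the role of denominator_threshold
norm_factor_inv = 1.0 / np.maximum(A.sum(axis=0), eps)   # 1 / (A^T * 1), the sensitivity image

theta = np.full(64, 0.1, dtype=np.float32)               # same 0.1 initialization as the GPU paths
for _ in range(200):
    q = A @ theta                                        # projection
    e = y / np.maximum(q, eps)                           # ratio with denominator clamp
    c = A.T @ e                                          # backprojection
    theta = theta * norm_factor_inv * c                  # multiplicative update

print(float(np.linalg.norm(theta - theta_true) / np.linalg.norm(theta_true)))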
@@ -229,49 +228,60 @@ def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
         print(f"Error in optimized CPU MLEM: {type(e).__name__}: {e}")
         return None, None

-def MLEM_sparseCSR_pycuda(
+def MLEM_sparseCSR_pycuda(
+    SMatrix,
+    y,
+    numIterations,
+    isSavingEachIteration,
+    tumor_str,
+    max_saves,
+    denominator_threshold,
+    show_logs=True,
+):
     """
-    (old docstring, truncated in the diff view)
+    Robust MLEM implementation for CSR SMatrix using PyCUDA kernels.
+    Expects SMatrix to be SparseSMatrix_CSR with attributes:
+    - values_gpu, col_ind_gpu, row_ptr_gpu (device pointers)
+    - norm_factor_inv_gpu (device pointer)
+    - sparse_mod (loaded module with kernels)
+    - ctx (PyCUDA context)
+    Returns (saved_theta_list, saved_indices) if isSavingEachIteration else (final_theta, None)
     """
-    # We use a final_result placeholder to ensure it's defined outside the try block
     final_result = None
+
+    # Local holders to free in finally
+    y_gpu = q_flat_gpu = e_flat_gpu = c_flat_gpu = theta_flat_gpu = None
+
     try:
         if not isinstance(SMatrix, SparseSMatrix_CSR):
             raise TypeError("SMatrix must be a SparseSMatrix_CSR object")

-        if SMatrix
+        # push context (if provided)
+        popped_ctx = False
+        if getattr(SMatrix, "ctx", None):
             SMatrix.ctx.push()
+            popped_ctx = True

         dtype = np.float32
-        TN = SMatrix.N * SMatrix.T
-        ZX = SMatrix.Z * SMatrix.X
-        if
+        TN = int(SMatrix.N * SMatrix.T)
+        ZX = int(SMatrix.Z * SMatrix.X)
+        Z = int(SMatrix.Z)
+        X = int(SMatrix.X)
+
+        # Make sure required GPU pointers exist
+        if getattr(SMatrix, "values_gpu", None) is None or getattr(SMatrix, "col_ind_gpu", None) is None or getattr(SMatrix, "row_ptr_gpu", None) is None:
+            raise RuntimeError("SMatrix is missing GPU buffers (values_gpu / col_ind_gpu / row_ptr_gpu)")
+
+        if getattr(SMatrix, "norm_factor_inv_gpu", None) is None:
+            raise RuntimeError("SMatrix.norm_factor_inv_gpu not available on GPU")
+
+        # stream for async operations
         stream = drv.Stream()

-        y_gpu = drv.mem_alloc(
-        drv.memcpy_htod_async(y_gpu,
+        # prepare device buffers
+        y_arr = np.ascontiguousarray(y.T.flatten().astype(np.float32))
+        y_gpu = drv.mem_alloc(y_arr.nbytes)
+        drv.memcpy_htod_async(y_gpu, y_arr, stream)

         theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
         initial_theta = np.full(ZX, 0.1, dtype=dtype)

@@ -283,61 +293,111 @@ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumo
         e_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
         c_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)

-        projection_kernel = SMatrix.sparse_mod.get_function(
-        backprojection_kernel = SMatrix.sparse_mod.get_function(
-        ratio_kernel = SMatrix.sparse_mod.get_function(
-        update_kernel = SMatrix.sparse_mod.get_function(
+        # Ensure kernels exist
+        projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__CSR")
+        backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__CSR")
+        ratio_kernel = SMatrix.sparse_mod.get_function("ratio_kernel")
+        update_kernel = SMatrix.sparse_mod.get_function("update_theta_kernel")
         block_size = 256

+        # prepare save indices once
         if numIterations <= max_saves:
             save_indices = list(range(numIterations))
         else:
+            step = max(1, numIterations // max_saves)
+            save_indices = list(range(0, numIterations, step))
             if save_indices[-1] != numIterations - 1:
                 save_indices.append(numIterations - 1)

+        saved_theta = []
+        saved_indices = []
+
         description = f"AOT-BioMaps -- ML-EM (CSR-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+        # grid sizes
+        grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
+        grid_cols = ((ZX + block_size - 1) // block_size, 1, 1)
+
         for it in iterator:
             # projection: q = A * theta
-            projection_kernel(
+            projection_kernel(
+                q_flat_gpu,
+                SMatrix.values_gpu,
+                SMatrix.row_ptr_gpu,
+                SMatrix.col_ind_gpu,
+                theta_flat_gpu,
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )

             # ratio: e = y / max(q, threshold)
-            ratio_kernel(
+            ratio_kernel(
+                e_flat_gpu,
+                y_gpu,
+                q_flat_gpu,
+                np.float32(denominator_threshold),
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )
+
+            # backprojection: c = A^T * e (zero c first)
             drv.memset_d32_async(c_flat_gpu, 0, ZX, stream)
-            backprojection_kernel(
+            backprojection_kernel(
+                c_flat_gpu,
+                SMatrix.values_gpu,
+                SMatrix.row_ptr_gpu,
+                SMatrix.col_ind_gpu,
+                e_flat_gpu,
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )

             # update: theta *= norm_factor_inv * c
-            update_kernel(
+            update_kernel(
+                theta_flat_gpu,
+                c_flat_gpu,
+                norm_factor_inv_gpu,
+                np.int32(ZX),
+                block=(block_size, 1, 1),
+                grid=grid_cols,
+                stream=stream,
+            )
+
+            # periodic synchronization for stability / logging
             if show_logs and (it % 10 == 0 or it == numIterations - 1):
+                stream.synchronize()

+            # save snapshot if required
             if isSavingEachIteration and it in save_indices:
+                # ensure kernels finished
+                stream.synchronize()
                 theta_host = np.empty(ZX, dtype=dtype)
                 drv.memcpy_dtoh(theta_host, theta_flat_gpu)
                 saved_theta.append(theta_host.reshape(Z, X))
-                saved_indices.append(it)
-        drv.memcpy_dtoh(
-        final_result =
-        # free local allocations
+                saved_indices.append(int(it))
+
+        # make sure everything finished
+        stream.synchronize()
+        final_theta_host = np.empty(ZX, dtype=dtype)
+        drv.memcpy_dtoh(final_theta_host, theta_flat_gpu)
+        final_result = final_theta_host.reshape(Z, X)
+
+        # free local allocations (will also be freed in finally if exception)
+        try:
+            y_gpu.free()
+            q_flat_gpu.free()
+            e_flat_gpu.free()
+            c_flat_gpu.free()
+            theta_flat_gpu.free()
+        except Exception:
+            pass

         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)

@@ -345,32 +405,50 @@ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumo
         print(f"Error in MLEM_sparseCSR_pycuda: {type(e).__name__}: {e}")
         gc.collect()
         return None, None

-    finally:
-        # --- CONTEXT FIX: Pop the context ---
-        if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
-            SMatrix.ctx.pop()
-        # ------------------------------------
+    finally:
+        # free buffers if still allocated
+        for buf in ("y_gpu", "q_flat_gpu", "e_flat_gpu", "c_flat_gpu", "theta_flat_gpu"):
+            try:
+                val = locals().get(buf, None)
+                if val is not None:
+                    val.free()
+            except Exception:
+                pass
+        # pop context safely
+        try:
+            if SMatrix and hasattr(SMatrix, "ctx") and SMatrix.ctx and popped_ctx:
+                SMatrix.ctx.pop()
+        except Exception:
+            pass
+
+def MLEM_sparseSELL_pycuda(
+    SMatrix,
+    y,
+    numIterations,
+    isSavingEachIteration,
+    tumor_str,
+    max_saves,
+    denominator_threshold,
+    show_logs=True,
+):
     """
     MLEM using SELL-C-σ kernels already present on device.
     y must be float32 length TN.
+
+    Clean version: diagnostics removed.
     """
     final_result = None

     try:
-        # check if SMatrix is SparseSMatrix_SELL object
         if not isinstance(SMatrix, SparseSMatrix_SELL):
             raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
         if SMatrix.sell_values_gpu is None:
             raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")

-        # This ensures all subsequent PyCUDA operations use the correct GPU/context.
+        # Context
         if SMatrix.ctx:
             SMatrix.ctx.push()
-        # -----------------------------------------------------------

         TN = int(SMatrix.N * SMatrix.T)
         ZX = int(SMatrix.Z * SMatrix.X)

@@ -384,7 +462,7 @@ def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tum
         stream = drv.Stream()

-        #
+        # Device buffers
         y = y.T.flatten().astype(np.float32)
         y_gpu = drv.mem_alloc(y.nbytes)
         drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)

@@ -403,6 +481,7 @@ def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tum
         grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
         grid_cols = ((ZX + block_size - 1) // block_size, 1, 1)

+        # Prepare save indices
         saved_theta, saved_indices = [], []
         if numIterations <= max_saves:
             save_indices = list(range(numIterations))

@@ -413,52 +492,59 @@ def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tum
         description = f"AOT-BioMaps -- ML-EM (SELL-c-σ-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+        # --- MLEM Loop ---
         for it in iterator:
-            # projection
-            proj(q_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, slice_ptr_gpu, slice_len_gpu,
-                theta_gpu, np.int32(TN), slice_height,
-                block=(block_size,1,1), grid=grid_rows, stream=stream)
+            proj(q_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu,
+                 slice_ptr_gpu, slice_len_gpu,
+                 theta_gpu, np.int32(TN), slice_height,
+                 block=(block_size,1,1), grid=grid_rows, stream=stream)

             ratio(e_gpu, y_gpu, q_gpu, np.float32(denominator_threshold), np.int32(TN),
-
+                  block=(block_size,1,1), grid=grid_rows, stream=stream)

-            # zero c
             drv.memset_d32_async(c_gpu, 0, ZX, stream)

+            backproj(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu,
+                     slice_ptr_gpu, slice_len_gpu,
+                     e_gpu, c_gpu, np.int32(TN), slice_height,
+                     block=(block_size,1,1), grid=grid_rows, stream=stream)

-            # update
             update(theta_gpu, c_gpu, SMatrix.norm_factor_inv_gpu, np.int32(ZX),
-
+                   block=(block_size,1,1), grid=grid_cols, stream=stream)

-            stream.synchronize()
             if isSavingEachIteration and it in save_indices:
                 out = np.empty(ZX, dtype=np.float32)
                 drv.memcpy_dtoh(out, theta_gpu)
                 saved_theta.append(out.reshape((SMatrix.Z, SMatrix.X)))
                 saved_indices.append(it)

+        stream.synchronize()
         res = np.empty(ZX, dtype=np.float32)
         drv.memcpy_dtoh(res, theta_gpu)

         # free
+        try:
+            y_gpu.free()
+            q_gpu.free()
+            e_gpu.free()
+            c_gpu.free()
+            theta_gpu.free()
+        except Exception:
+            pass

         final_result = res.reshape((SMatrix.Z, SMatrix.X))
         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)

     except Exception as e:
         print(f"Error in MLEM_sparseSELL_pycuda: {type(e).__name__}: {e}")
         gc.collect()
         return None, None

     finally:
-        # --- CONTEXT FIX: Pop the context ---
         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
-            SMatrix.ctx.pop()
+            try:
+                SMatrix.ctx.pop()
+            except Exception:
+                pass
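Both GPU paths above cap how many intermediate reconstructions are kept by subsampling the iteration indices once before the loop. The selection logic is reproduced here standalone (same arithmetic as in the hunks; the example values are illustrative):

# Illustration of the save-index subsampling used by the MLEM GPU paths.
numIterations, max_saves = 1000, 7

if numIterations <= max_saves:
    save_indices = list(range(numIterations))
else:
    step = max(1, numIterations // max_saves)
    save_indices = list(range(0, numIterations, step))
    if save_indices[-1] != numIterations - 1:
        save_indices.append(numIterations - 1)   # always keep the final iterate

print(save_indices)   # [0, 142, 284, 426, 568, 710, 852, 994, 999]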
AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py

@@ -51,7 +51,7 @@ def CP_TV(
         if smatrixType == SMatrixType.CSR:
             raise NotImplementedError("GPU Chambolle Pock (LS-TV) with CSR not implemented.")
         elif smatrixType == SMatrixType.SELL:
-            return
+            return CP_TV_Tikhonov_sparseSELL_pycuda(SMatrix, y, alpha, beta, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs, k_security, use_power_method, auto_alpha_gamma, apply_positivity_clamp, tikhonov_as_gradient, use_laplacian, laplacian_beta_scale)
         elif smatrixType == SMatrixType.DENSE:
             return CP_TV_dense(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
         else:

@@ -223,7 +223,7 @@ def CP_TV_dense(
     else:
         return (x.reshape(Z, X) * (norm_y / norm_A)).cpu().numpy(), None

-def
+def CP_TV_Tikhonov_sparseSELL_pycuda(
     SMatrix,
     y,
     alpha=None,  # TV regularization parameter (if None, alpha is auto-scaled)
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py

@@ -224,27 +224,20 @@ class SparseSMatrix_CSR:
     def getMatrixSize(self):
         """
         Returns the total size of the CSR matrix in GB (summing the GPU memory).
+        Uses the stored size attributes to work around the AttributeError raised by DeviceAllocation.
         """
+        # Note: the caller must ensure that self.row_ptr exists before this call.
         if self.row_ptr is None:
             return {"error": "La matrice sparse n'est pas encore allouée."}

         total_bytes = 0
-
-        # GPU memory (row_ptr_gpu, col_ind_gpu, values_gpu, norm_factor_inv_gpu)
-        if hasattr(self, 'row_ptr_gpu') and self.row_ptr_gpu:
-            total_bytes += self.row_ptr_gpu.size
-        if hasattr(self, 'col_ind_gpu') and self.col_ind_gpu:
-            total_bytes += self.col_ind_gpu.size
-        if hasattr(self, 'values_gpu') and self.values_gpu:
-            total_bytes += self.values_gpu.size
-        if hasattr(self, 'norm_factor_inv_gpu') and self.norm_factor_inv_gpu:
-            total_bytes += self.norm_factor_inv_gpu.size
-
-        # NOTE: previous versions used the .size of the DeviceAllocation object,
-        # which was problematic. If the error shows up again here, the size in
-        # bytes will have to be stored, as was done for SELL.
-        # For now, we keep the original CSR getMatrixSize method.

+        # Sum of the stored sizes (computed and assigned in allocate and compute_norm_factor_from_csr)
+        total_bytes += getattr(self, 'row_ptr_gpu_size', 0)
+        total_bytes += getattr(self, 'col_ind_gpu_size', 0)
+        total_bytes += getattr(self, 'values_gpu_size', 0)
+        total_bytes += getattr(self, 'norm_factor_inv_gpu_size', 0)
+
         return total_bytes / (1024**3)

     def free(self):
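The rewritten getMatrixSize() relies on byte counts recorded at allocation time because pycuda's DeviceAllocation handle does not expose the size of the allocation. A minimal sketch of that bookkeeping pattern follows; it assumes a CUDA-capable device and pycuda, and the variable names are illustrative rather than the class's actual attributes:

# Sketch: record allocation sizes alongside the DeviceAllocation handles,
# since pycuda.driver.DeviceAllocation does not carry a .size attribute.
import numpy as np
import pycuda.autoinit  # noqa: F401  (creates a context; requires a CUDA GPU)
import pycuda.driver as drv

values = np.random.rand(1_000_000).astype(np.float32)

values_gpu_size = values.nbytes            # remember the size in bytes ourselves
values_gpu = drv.mem_alloc(values_gpu_size)
drv.memcpy_htod(values_gpu, values)

total_bytes = values_gpu_size              # sum the recorded sizes, not handle attributes
print(f"GPU footprint: {total_bytes / 1024**3:.6f} GB")
values_gpu.free()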
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py

@@ -92,13 +92,11 @@ class SparseSMatrix_SELL:
     def allocate(self):
         """
         Build SELL-C-σ directly from manip AcousticFields in streaming blocks.
-
+        Corrected: per-block row_nnz copy, zeroing of host block, proper sync.
         """
         if self.sparse_mod is None:
             raise RuntimeError("CUDA module not loaded. Check compilation.")

-        # NOTE: the kernel names (count_nnz_rows_kernel, fill_kernel__SELL) are used
-        # because they are the ones present in the working class.
         count_kernel = self.sparse_mod.get_function("count_nnz_rows_kernel")
         fill_kernel = self.sparse_mod.get_function("fill_kernel__SELL")

@@ -106,34 +104,34 @@ class SparseSMatrix_SELL:
         num_cols = int(self.Z * self.X)
         C = int(self.slice_height)

-        # host temporary block
         br = int(self.block_rows)
-        bytes_per_elem = np.dtype(np.float32).itemsize
         dense_host = np.empty((br, num_cols), dtype=np.float32)

-        # Allocation
+        # Allocation dense buffer on device (size = br * num_cols)
         dense_gpu_size = dense_host.nbytes
         dense_gpu = drv.mem_alloc(dense_gpu_size)

-        # 1) count nnz per row (
+        # 1) count nnz per row (per block)
         row_nnz = np.zeros(num_rows, dtype=np.int32)
         row_nnz_gpu_block_size = br * np.dtype(np.int32).itemsize
         row_nnz_gpu_block = drv.mem_alloc(row_nnz_gpu_block_size)

-        block =
+        block = 128
         for b in trange(0, num_rows, br, desc="Count NNZ per row"):
             R = min(br, num_rows - b)
-            #
+            # zero the host block to avoid garbage in tail when R < br
+            dense_host.fill(0.0)
             for i in range(R):
                 rg = b + i
                 n_idx = rg // self.T
                 t_idx = rg % self.T
                 dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
-            # copy
+            # copy whole buffer (safe because we zeroed tail)
             drv.memcpy_htod(dense_gpu, dense_host)
             grid = ((R + block - 1) // block, 1, 1)
             count_kernel(dense_gpu, row_nnz_gpu_block, np.int32(R), np.int32(num_cols), np.float32(self.relative_threshold),
-
+                         block=(block,1,1), grid=grid)
+            drv.Context.synchronize()
             tmp = np.empty(R, dtype=np.int32)
             drv.memcpy_dtoh(tmp, row_nnz_gpu_block)
             row_nnz[b:b+R] = tmp

@@ -148,7 +146,6 @@ class SparseSMatrix_SELL:
             r0 = s * C
             r1 = min(num_rows, r0 + C)
             slice_len[s] = int(np.max(row_nnz[r0:r1])) if (r1>r0) else 0
-        # slice_ptr (int64)
         slice_ptr = np.zeros(num_slices + 1, dtype=np.int64)
         for s in range(num_slices):
             slice_ptr[s+1] = slice_ptr[s] + (slice_len[s] * C)

@@ -160,9 +157,14 @@ class SparseSMatrix_SELL:
         self.sell_values_gpu_size = total_storage * np.dtype(np.float32).itemsize
         self.sell_colinds_gpu_size = total_storage * np.dtype(np.uint32).itemsize

+        # allocate and optionally zero them
         self.sell_values_gpu = drv.mem_alloc(self.sell_values_gpu_size)
+        # It's good practice to zero the values buffer to avoid leftover memory
+        drv.memset_d32(self.sell_values_gpu, 0, total_storage)
+
         self.sell_colinds_gpu = drv.mem_alloc(self.sell_colinds_gpu_size)
-
+        drv.memset_d32(self.sell_colinds_gpu, 0, total_storage)
+
         # allocate slice metadata on device
         self.slice_ptr = slice_ptr
         self.slice_len = slice_len

@@ -177,29 +179,28 @@ class SparseSMatrix_SELL:
         drv.memcpy_htod(self.slice_len_gpu, self.slice_len)

         # 3) fill SELL arrays by streaming blocks again (use GPU fill kernel)
-        # reuse dense_host and allocate new dense_gpu
         dense_host = np.empty((br, num_cols), dtype=np.float32)
+        dense_gpu = drv.mem_alloc(dense_host.nbytes)

-        # we also need row_nnz on device per-block; supply global row_nnz on host but the kernel recomputes threshold
-        row_nnz_host_gpu_size = br * np.dtype(np.int32).itemsize
-        row_nnz_host_gpu = drv.mem_alloc(row_nnz_host_gpu_size)
+        # For per-block row_nnz pointer we allocate a buffer of max block size once, then reuse
+        row_nnz_host_gpu = drv.mem_alloc(br * np.dtype(np.int32).itemsize)

         for b in trange(0, num_rows, br, desc="Fill SELL"):
             R = min(br, num_rows - b)
+            dense_host.fill(0.0)
             for i in range(R):
                 rg = b + i
                 n_idx = rg // self.T
                 t_idx = rg % self.T
                 dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+            # copy host block
             drv.memcpy_htod(dense_gpu, dense_host)
-            #
+            # copy corresponding row_nnz slice (only R entries)
+            drv.memcpy_htod(row_nnz_host_gpu, row_nnz[b:b+R])
+
             grid = ((R + block - 1) // block, 1, 1)
             fill_kernel(dense_gpu,
-
+                        row_nnz_host_gpu,
                         self.slice_ptr_gpu,
                         self.slice_len_gpu,
                         self.sell_colinds_gpu,

@@ -210,12 +211,14 @@ class SparseSMatrix_SELL:
                         np.int32(C),
                         np.float32(self.relative_threshold),
                         block=(block,1,1), grid=grid)
+            drv.Context.synchronize()
+
         dense_gpu.free()
         row_nnz_host_gpu.free()

         # 4) compute norm_factor_inv via GPU accumulate (col sums)
         self.compute_norm_factor()

     def apply_apodization_gpu(self, window_vector_gpu):
         """
         Applies the apodization window directly on self.sell_values_gpu
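allocate() therefore produces, for each slice of C consecutive rows, a width slice_len[s] equal to the largest row non-zero count in the slice and an offset slice_ptr[s+1] = slice_ptr[s] + slice_len[s] * C, with the values/colinds arrays padded to that width. A small self-contained NumPy sketch of the same layout follows (CPU only; the column-major ordering inside each slice is the usual SELL-C-σ convention and is an assumption here, since the actual packing is done by fill_kernel__SELL on the device):

# SELL-C-sigma layout sketch (host-side, NumPy only; illustrative).
import numpy as np

dense = np.array([[5, 0, 0, 1],
                  [0, 2, 0, 0],
                  [7, 0, 3, 0],
                  [0, 0, 0, 4]], dtype=np.float32)
num_rows, num_cols = dense.shape
C = 2                                                    # slice height (slice_height)
num_slices = (num_rows + C - 1) // C

row_nnz = (np.abs(dense) > 0).sum(axis=1).astype(np.int32)

slice_len = np.zeros(num_slices, dtype=np.int32)         # widest row of each slice
for s in range(num_slices):
    r0, r1 = s * C, min(num_rows, s * C + C)
    slice_len[s] = row_nnz[r0:r1].max() if r1 > r0 else 0

slice_ptr = np.zeros(num_slices + 1, dtype=np.int64)      # start offset of each slice
for s in range(num_slices):
    slice_ptr[s + 1] = slice_ptr[s] + slice_len[s] * C

total_storage = int(slice_ptr[-1])                        # includes zero padding
values = np.zeros(total_storage, dtype=np.float32)
colinds = np.zeros(total_storage, dtype=np.uint32)

for r in range(num_rows):
    s, r_in_slice = r // C, r % C
    cols = np.nonzero(dense[r])[0]
    for j, c in enumerate(cols):
        pos = slice_ptr[s] + j * C + r_in_slice            # column-major within the slice
        values[pos] = dense[r, c]
        colinds[pos] = c

# quick check: a SELL-format SpMV reproduces dense @ x
x = np.arange(1, num_cols + 1, dtype=np.float32)
y = np.zeros(num_rows, dtype=np.float32)
for r in range(num_rows):
    s, r_in_slice = r // C, r % C
    for j in range(slice_len[s]):
        pos = slice_ptr[s] + j * C + r_in_slice
        y[r] += values[pos] * x[colinds[pos]]
assert np.allclose(y, dense @ x)

print(slice_len, slice_ptr, total_storage)                 # [2 2] [0 4 8] 8
print(values)                                              # zeros mark the SELL padding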
@@ -234,7 +237,7 @@ class SparseSMatrix_SELL:
         )

         # total_storage includes the non-zero elements and the SELL padding.
-        threads =
+        threads = 128
         blocks = (self.total_storage + threads - 1) // threads

         # Kernel launch: it works on total_storage elements.
@@ -248,43 +251,72 @@ class SparseSMatrix_SELL:
         )
         drv.Context.synchronize()
         print("✅ Multiplication par le fenêtrage effectuée in-place sur GPU (SELL-C-σ).")

     def compute_norm_factor(self):
         """
-        (old docstring, truncated in the diff view)
+        Compute the TRUE MLEM normalization norm_factor_inv = 1 / (A^T * 1)
+        by performing a SELL backprojection of a vector of ones.
+        This is the ONLY correct normalization for MLEM.
         """
-        if self.total_storage == 0:
-            raise RuntimeError("sell not built")
         ZX = int(self.Z * self.X)
+        TN = int(self.T * self.N)

-        drv.memset_d32(col_sum_gpu, 0, ZX)
+        # Allocate device vector of ones (projections)
+        ones_gpu = drv.mem_alloc(TN * np.dtype(np.float32).itemsize)
+        drv.memset_d32(ones_gpu, 0x3f800000, TN)  # 1.0f bit pattern

+        # Allocate output for backprojection (ZX pixels)
+        c_gpu = drv.mem_alloc(ZX * np.dtype(np.float32).itemsize)
+        drv.memset_d32(c_gpu, 0, ZX)
+
+        # Get SELL backprojection kernel
+        try:
+            bp_kernel = self.sparse_mod.get_function("backprojection_kernel__SELL")
+        except Exception as e:
+            raise RuntimeError("Missing kernel backprojection_kernel__SELL in the cubin") from e
+
+        threads = 256
+        blocks = (TN + threads - 1) // threads
+
+        # Launch GPU backprojection
+        bp_kernel(
+            self.sell_values_gpu,
+            self.sell_colinds_gpu,
+            self.slice_ptr_gpu,
+            self.slice_len_gpu,
+            ones_gpu,
+            c_gpu,
+            np.int32(TN),
+            # np.int32(ZX),
+            np.int32(self.slice_height),
+            # np.int64(self.total_storage),
+            block=(threads, 1, 1),  # uses the new thread count
+            grid=(blocks, 1, 1)
+        )
         drv.Context.synchronize()

-        drv.memcpy_dtoh(
+        # Copy back to host
+        c_host = np.empty(ZX, dtype=np.float32)
+        drv.memcpy_dtoh(c_host, c_gpu)
+        ones_gpu.free()
+        c_gpu.free()
+
+        # Avoid divide-by-zero
+        c_host = np.maximum(c_host, 1e-6)
+
+        # Compute inverse (stored for use in MLEM)
+        self.norm_factor_inv = (1.0 / c_host).astype(np.float32)

-        self.norm_factor_inv = (1.0 / norm).astype(np.float32)
+        # Upload to GPU
         if self.norm_factor_inv_gpu is not None:
             self.norm_factor_inv_gpu.free()

         self.norm_factor_inv_gpu_size = self.norm_factor_inv.nbytes
         self.norm_factor_inv_gpu = drv.mem_alloc(self.norm_factor_inv_gpu_size)
         drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)

+        print("✓ Normalization (A^T*1) computed for MLEM.")
+
     def compute_density(self):
         """
         Returns only the density of the SELL-C-σ matrix.
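The new compute_norm_factor() obtains the MLEM sensitivity image Aᵀ·1 by backprojecting a vector of ones and stores its inverse; the memset pattern 0x3f800000 is simply the IEEE-754 bit pattern of float32 1.0. A CPU cross-check of the same quantity with SciPy on a CSR matrix (illustrative, not package code; matrix sizes are arbitrary):

# CPU cross-check of norm_factor_inv = 1 / (A^T * 1) using SciPy.
import numpy as np
from scipy.sparse import random as sparse_random

TN, ZX = 2000, 400
A = sparse_random(TN, ZX, density=0.05, format="csr", dtype=np.float32, random_state=0)

ones = np.ones(TN, dtype=np.float32)        # what memset_d32(ones_gpu, 0x3f800000, TN) builds on device
col_sums = A.T @ ones                       # equivalently np.asarray(A.sum(axis=0)).ravel()

col_sums = np.maximum(col_sums, 1e-6)       # same divide-by-zero guard as the GPU path
norm_factor_inv = (1.0 / col_sums).astype(np.float32)

print(norm_factor_inv.shape, norm_factor_inv.dtype)   # (400,) float32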
AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin

Binary file (contents not shown).
AOT_biomaps/AOT_Recon/AlgebraicRecon.py

@@ -45,8 +45,6 @@ class AlgebraicRecon(Recon):

         self.sparseThreshold = sparseThreshold

-        self.Z_dim = None # Used for sparse matrix reconstruction
-
         if self.numIterations <= 0:
             raise ValueError("Number of iterations must be greater than 0.")
         if self.numSubsets <= 0:

@@ -760,7 +758,6 @@ class AlgebraicRecon(Recon):
                 max_saves=self.maxSaves,
                 show_logs=show_logs,
                 smatrixType=self.smatrixType,
-                Z=self.Z_dim
             )
         else:
             self.reconLaser, self.indices = MLEM(SMatrix=self.SMatrix,

@@ -774,7 +771,6 @@ class AlgebraicRecon(Recon):
                 max_saves=self.maxSaves,
                 show_logs=show_logs,
                 smatrixType=self.smatrixType,
-                Z=self.Z_dim
             )
         elif self.optimizer.value == OptimizerType.LS.value:
             if self.alpha is None:

@@ -790,8 +786,7 @@ class AlgebraicRecon(Recon):
                 denominator_threshold=self.denominatorThreshold,
                 max_saves=self.maxSaves,
                 show_logs=show_logs,
-                smatrixType=self.smatrixType
-                Z=self.Z_dim
+                smatrixType=self.smatrixType
             )
         else:
             self.reconLaser, self.indices = LS(SMatrix=self.SMatrix,

@@ -805,8 +800,7 @@ class AlgebraicRecon(Recon):
                 denominator_threshold=self.denominatorThreshold,
                 max_saves=self.maxSaves,
                 show_logs=show_logs,
-                smatrixType=self.smatrixType
-                Z=self.Z_dim
+                smatrixType=self.smatrixType
             )
         else:
             raise ValueError(f"Only MLEM and LS are supported for simple algebraic reconstruction. {self.optimizer.value} need Bayesian reconstruction")
AOT_biomaps/__init__.py

@@ -85,7 +85,7 @@ from .AOT_Recon.AOT_PotentialFunctions.RelativeDifferences import *
 from .Config import config
 from .Settings import *

-__version__ = '2.9.291'
+__version__ = '2.9.312'
 __process__ = config.get_process()

 def initialize(process=None):
@@ -135,6 +135,27 @@ def initialize(process=None):
 (this hunk only adds 21 blank lines inside initialize(); no functional change)
setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

 setup(
     name='AOT_biomaps',
-    version='2.9.291',
+    version='2.9.312',
     packages=find_packages(),
     include_package_data=True,
@@ -293,6 +293,27 @@ setup(
 (this hunk only adds 21 blank lines inside the setup() block; no functional change)

All remaining files listed above are renamed only (path prefix aot_biomaps-2.9.291 → aot_biomaps-2.9.312) with no content changes.