AOT-biomaps 2.9.279__py3-none-any.whl → 2.9.300__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of AOT-biomaps might be problematic.

@@ -1,8 +1,10 @@
1
- from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star
1
+ from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star, _call_axpby, _call_minus_axpy, compute_TV_cpu, power_method_estimate_L__SELL, calculate_memory_requirement, check_gpu_memory
2
2
  from AOT_biomaps.Config import config
3
- from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType
3
+ from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType, SMatrixType
4
4
  import torch
5
5
  from tqdm import trange
6
+ import numpy as np
7
+ import pycuda.driver as drv
6
8
 
7
9
  '''
8
10
  This module implements Primal-Dual Hybrid Gradient (PDHG) methods for solving inverse problems in Acousto-Optic Tomography.
@@ -11,6 +13,103 @@ The methods can run on both CPU and GPU, with configurations set in the AOT_biom
11
13
  '''
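For orientation, every reconstruction routine below follows the standard Chambolle-Pock primal-dual scheme for a saddle-point problem min_x max_p <Kx, p> - F*(p) + G(x); in textbook form (paraphrased here, not quoted from the package) the updates read

    p^{k+1}       = prox_{\sigma F^*}(p^k + \sigma K \bar{x}^k)
    x^{k+1}       = prox_{\tau G}(x^k - \tau K^{\top} p^{k+1})
    \bar{x}^{k+1} = x^{k+1} + \theta\,(x^{k+1} - x^k)

with step sizes satisfying \tau \sigma \|K\|^2 \le 1. The parameters theta, tau and sigma used throughout this module play exactly these roles.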
12
14
 
13
15
  def CP_TV(
16
+ SMatrix,
17
+ y,
18
+ alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
19
+ beta=1e-4, # Tikhonov regularization parameter
20
+ theta=1.0,
21
+ numIterations=5000,
22
+ isSavingEachIteration=True,
23
+ L=None,
24
+ withTumor=True,
25
+ device=None,
26
+ max_saves=5000,
27
+ show_logs=True,
28
+ smatrixType=SMatrixType.SELL,
29
+ k_security=0.8,
30
+ use_power_method=True,
31
+ auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
32
+ apply_positivity_clamp=True,
33
+ tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
34
+ use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
35
+ laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
36
+ ):
37
+ # try:
38
+ tumor_str = "WITH" if withTumor else "WITHOUT"
39
+ # Auto-select device and method
40
+ if device is None:
41
+ if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
42
+ device = torch.device(f"cuda:{config.select_best_gpu()}")
43
+ use_gpu = True
44
+ else:
45
+ device = torch.device("cpu")
46
+ use_gpu = False
47
+ else:
48
+ use_gpu = device.type == "cuda"
49
+ # Dispatch to the appropriate implementation
50
+ if use_gpu:
51
+ if smatrixType == SMatrixType.CSR:
52
+ raise NotImplementedError("GPU Chambolle Pock (LS-TV) with CSR not implemented.")
53
+ elif smatrixType == SMatrixType.SELL:
54
+ return CP_TV_Tikhonov_sparseSELL_pycuda(SMatrix, y, alpha,beta, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs, k_security, use_power_method, auto_alpha_gamma, apply_positivity_clamp, tikhonov_as_gradient, use_laplacian, laplacian_beta_scale)
55
+ elif smatrixType == SMatrixType.DENSE:
56
+ return CP_TV_dense(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
57
+ else:
58
+ raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-TV).")
59
+ else:
60
+ raise NotImplementedError("CPU Chambolle Pock (LS-TV) not implemented.")
61
+
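As a usage illustration of the dispatcher above, a call could look like the sketch below; SMatrix and y are placeholders for a previously built system matrix and the measured acousto-optic signals, and the keyword values simply echo the defaults of the signature:

# illustrative sketch only -- SMatrix and y are assumed to have been built elsewhere
recon_frames, saved_iters = CP_TV(
    SMatrix, y,
    alpha=None,                    # auto-scaled as gamma * data_term / tv_term
    beta=1e-4,                     # Tikhonov weight
    numIterations=2000,
    smatrixType=SMatrixType.SELL,  # dispatches to the SELL/pycuda path above
    show_logs=True,
)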
62
+ def CP_KL(
63
+ SMatrix,
64
+ y,
65
+ alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
66
+ beta=1e-4, # Tikhonov regularization parameter
67
+ theta=1.0,
68
+ numIterations=5000,
69
+ isSavingEachIteration=True,
70
+ L=None,
71
+ withTumor=True,
72
+ device=None,
73
+ max_saves=5000,
74
+ show_logs=True,
75
+ smatrixType=SMatrixType.SELL,
76
+ k_security=0.8,
77
+ use_power_method=True,
78
+ auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
79
+ apply_positivity_clamp=True,
80
+ tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
81
+ use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
82
+ laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
83
+ ):
84
+ # try:
85
+ tumor_str = "WITH" if withTumor else "WITHOUT"
86
+ # Auto-select device and method
87
+ if device is None:
88
+ if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
89
+ device = torch.device(f"cuda:{config.select_best_gpu()}")
90
+ use_gpu = True
91
+ else:
92
+ device = torch.device("cpu")
93
+ use_gpu = False
94
+ else:
95
+ use_gpu = device.type == "cuda"
96
+ # Dispatch to the appropriate implementation
97
+ if use_gpu:
98
+ if smatrixType == SMatrixType.CSR:
99
+ raise NotImplementedError("GPU Chambolle Pock (LS-KL) with CSR not implemented.")
100
+ elif smatrixType == SMatrixType.SELL:
101
+ raise NotImplementedError("GPU Chambolle Pock (LS-KL) with SELL not implemented.")
102
+ elif smatrixType == SMatrixType.DENSE:
103
+ return CP_KL(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
104
+ else:
105
+ raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-KL).")
106
+ else:
107
+ raise NotImplementedError("CPU Chambolle Pock (LS-KL) not implemented.")
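For reference, the KL data-fidelity targeted by this entry point is the usual Poisson-noise divergence (standard definition, not quoted from the package):

    D_{KL}(y \,\|\, Ax) = \sum_i \big[ (Ax)_i - y_i + y_i \log\tfrac{y_i}{(Ax)_i} \big],

which replaces the least-squares data term used by CP_TV, with the same alpha/beta regularization parameters.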
108
+
109
+
110
+
111
+
112
+ def CP_TV_dense(
14
113
  SMatrix,
15
114
  y,
16
115
  alpha=1e-1,
@@ -21,6 +120,7 @@ def CP_TV(
21
120
  withTumor=True,
22
121
  device=None,
23
122
  max_saves=5000,
123
+ show_logs=True,
24
124
  ):
25
125
  """
26
126
  Chambolle-Pock algorithm for Total Variation (TV) regularization.
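(Here TV denotes the discrete total variation; given the per-pixel proj_l2 dual projection, this is presumably the isotropic form TV(x) = \sum_{i,j} \|(\nabla x)_{i,j}\|_2.)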
@@ -92,10 +192,10 @@ def CP_TV(
92
192
  # Description for progress bar
93
193
  tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
94
194
  device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
95
- description = f"AOT-BioMaps -- Primal/Dual Reconstruction (TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
195
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
96
196
 
97
- # Main loop
98
- for iteration in trange(numIterations, desc=description):
197
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
198
+ for it in iterator:
99
199
  # Update p (TV proximal step)
100
200
  grad_x = gradient(x_tilde.reshape(Z, X))
101
201
  p = proj_l2(p + sigma * grad_x, alpha)
@@ -113,9 +213,9 @@ def CP_TV(
113
213
  x_tilde = x + theta * (x - x_old)
114
214
 
115
215
  # Save intermediate result if needed
116
- if isSavingEachIteration and iteration in save_indices:
216
+ if isSavingEachIteration and it in save_indices:
117
217
  I_reconMatrix.append(x.reshape(Z, X).clone() * (norm_y / norm_A))
118
- saved_indices.append(iteration)
218
+ saved_indices.append(it)
119
219
 
120
220
  # Return results
121
221
  if isSavingEachIteration:
@@ -123,6 +223,337 @@ def CP_TV(
123
223
  else:
124
224
  return (x.reshape(Z, X) * (norm_y / norm_A)).cpu().numpy(), None
125
225
 
226
+ def CP_TV_Tikhonov_sparseSELL_pycuda(
227
+ SMatrix,
228
+ y,
229
+ alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
230
+ beta=1e-4, # Tikhonov regularization parameter
231
+ theta=1.0,
232
+ numIterations=2000,
233
+ isSavingEachIteration=True,
234
+ L=None,
235
+ tumor_str="",
236
+ device=None,
237
+ max_saves=2000,
238
+ show_logs=True,
239
+ k_security=0.8,
240
+ use_power_method=True,
241
+ auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
242
+ apply_positivity_clamp=True,
243
+ tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
244
+ use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
245
+ laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
246
+ ):
247
+
248
+ """
249
+ CP-TV + Tikhonov + Laplacian (Hessian scalar) penalty integrated.
250
+ Returns (I_reconMatrix, saved_indices) if isSavingEachIteration else (x_final, None).
251
+ """
252
+ # ----- begin main -----
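Loosely speaking (the exact weighting of the Laplacian term depends on laplacian_beta_scale and on which kernels are available), the iterations implemented here target a composite objective of the form

    \min_x \; \tfrac{1}{2}\|Ax - y\|_2^2 + \alpha\,\mathrm{TV}(x) + \beta\,\|x\|_2^2 + \beta_{\mathrm{lap}}\,\mathcal{R}(\Delta x),

where \mathcal{R} is a quadratic penalty on the discrete Laplacian \Delta x. The step sizes are set to \tau = \sigma = k_{security}/L with L = \sqrt{\|A\|^2 + 8}; the constant 8 is the usual upper bound on the squared operator norm of the 2-D forward-difference gradient.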
253
+ if SMatrix.ctx:
254
+ SMatrix.ctx.push()
255
+
256
+ # prepare variables
257
+ dtype = np.float32
258
+ TN = int(SMatrix.N * SMatrix.T)
259
+ ZX = int(SMatrix.Z * SMatrix.X)
260
+ Z, X = SMatrix.Z, SMatrix.X
261
+ block_size = 256
262
+
263
+ # existing kernels
264
+ projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
265
+ backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
266
+ axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
267
+ minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
268
+ gradient_kernel = SMatrix.sparse_mod.get_function("gradient_kernel")
269
+ divergence_kernel = SMatrix.sparse_mod.get_function("divergence_kernel")
270
+ proj_tv_kernel = SMatrix.sparse_mod.get_function("proj_tv_kernel")
271
+
272
+ # optional kernels (laplacian & clamp)
273
+ has_laplacian = False
274
+ has_clamp_kernel = False
275
+ try:
276
+ laplacian_kernel = SMatrix.sparse_mod.get_function("laplacian_kernel")
277
+ laplacian_adj_kernel = SMatrix.sparse_mod.get_function("laplacian_adj_kernel")
278
+ has_laplacian = True
279
+ except Exception:
280
+ has_laplacian = False
281
+
282
+ try:
283
+ clamp_positive_kernel = SMatrix.sparse_mod.get_function("clamp_positive_kernel")
284
+ has_clamp_kernel = True
285
+ except Exception:
286
+ has_clamp_kernel = False
287
+
288
+ stream = drv.Stream()
289
+
290
+ # estimate L operator norm if needed
291
+ if use_power_method or L is None:
292
+ L_LS_sq = power_method_estimate_L__SELL(SMatrix, stream, n_it=20, block_size=block_size)
293
+ L_nabla_sq = 8.0
294
+ L_op_norm = np.sqrt(L_LS_sq + L_nabla_sq)
295
+ if L_op_norm < 1e-6:
296
+ L_op_norm = 1.0
297
+ else:
298
+ L_op_norm = L
299
+
300
+ tau = np.float32(k_security / L_op_norm)
301
+ sigma = np.float32(k_security / L_op_norm)
302
+
303
+ # prepare y and normalization
304
+ y = y.T.astype(dtype).reshape(-1)
305
+ maxy = float(np.max(np.abs(y))) if y.size > 0 else 0.0
306
+ if maxy > 0:
307
+ y_normed = (y / maxy).copy()
308
+ else:
309
+ y_normed = y.copy()
310
+
311
+ # GPU allocations
312
+ bufs = []
313
+ y_gpu = drv.mem_alloc(y_normed.nbytes); bufs.append(y_gpu)
314
+ drv.memcpy_htod_async(y_gpu, y_normed.T.flatten(), stream)
315
+
316
+ x_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_gpu)
317
+ drv.memset_d32_async(x_gpu, 0, ZX, stream)
318
+ x_old_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_old_gpu)
319
+ x_tilde_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_tilde_gpu)
320
+ drv.memcpy_dtod_async(x_tilde_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
321
+
322
+ p_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(p_gpu)
323
+ q_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(q_gpu)
324
+ drv.memset_d32_async(p_gpu, 0, 2 * ZX, stream)
325
+ drv.memset_d32_async(q_gpu, 0, TN, stream)
326
+
327
+ grad_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(grad_gpu)
328
+ div_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(div_gpu)
329
+ Ax_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(Ax_gpu)
330
+ ATq_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(ATq_gpu)
331
+ zero_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(zero_gpu)
332
+ drv.memset_d32_async(zero_gpu, 0, ZX, stream)
333
+
334
+ # Laplacian buffers (if enabled and kernel available)
335
+ use_lap = use_laplacian and has_laplacian and (beta > 0)
336
+ if use_lap:
337
+ lap_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(lap_gpu)
338
+ r_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(r_gpu)
339
+ drv.memset_d32_async(r_gpu, 0, ZX, stream)
340
+ # scalar beta for laplacian (allow separate scale)
341
+ beta_lap = float(beta) * float(laplacian_beta_scale)
342
+ inv_1_plus_sigma_beta = np.float32(1.0 / (1.0 + float(sigma) * beta_lap))
343
+
344
+ # host buffers for logs
345
+ x_host = np.empty(ZX, dtype=dtype)
346
+ Ax_host = np.empty(TN, dtype=dtype)
347
+ q_host = np.empty(TN, dtype=dtype)
348
+ p_host = np.empty(2 * ZX, dtype=dtype)
349
+ ATq_host = np.empty(ZX, dtype=dtype)
350
+
351
+ # compute initial backprojection for auto-alpha
352
+ drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
353
+ backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
354
+ y_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
355
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
356
+ stream.synchronize()
357
+ drv.memcpy_dtoh(x_host, ATq_gpu)
358
+
359
+ # auto alpha if requested
360
+ if alpha is None:
361
+ drv.memcpy_htod_async(x_gpu, x_host, stream)
362
+ projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
363
+ x_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
364
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
365
+ stream.synchronize()
366
+ drv.memcpy_dtoh(Ax_host, Ax_gpu)
367
+ resid = Ax_host - y_normed[:TN]
368
+ data_term = 0.5 * float(np.dot(resid, resid))
369
+ tv_term = float(compute_TV_cpu(x_host, Z, X)) + 1e-12
370
+ alpha = float(auto_alpha_gamma * data_term / tv_term)
371
+ if show_logs:
372
+ print(f"[auto-alpha] data_term={data_term:.6e}, tv_term={tv_term:.6e}, alpha_set={alpha:.6e}")
373
+
374
+ # tikhonov prox multiplicative scale
375
+ if tikhonov_as_gradient:
376
+ tikh_scale = None
377
+ else:
378
+ tikh_scale = np.float32(1.0 / (1.0 + 2.0 * tau * beta)) if beta > 0 else np.float32(1.0)
379
+
380
+ # saving policy
381
+ if numIterations <= max_saves:
382
+ save_indices_all = list(range(0, numIterations + 1))
383
+ else:
384
+ step = max(1, numIterations // max_saves)
385
+ save_indices_all = list(range(0, numIterations + 1, step))
386
+
387
+ device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
388
+ if show_logs:
389
+ if (alpha is None or alpha == 0) and (beta is None or beta == 0):
390
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} lap_enabled={use_lap}")
391
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS) -- {tumor_str} -- {device_str}"
392
+ if alpha is None or alpha == 0:
393
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} beta={beta:.4e} lap_enabled={use_lap}")
394
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-Tikhonov) -- {tumor_str} -- {device_str}"
395
+ elif beta is None or beta == 0:
396
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
397
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) -- {tumor_str} -- {device_str}"
398
+ else:
399
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
400
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV-Tikhonov) -- {tumor_str} -- {device_str}"
401
+
402
+ I_reconMatrix = []
403
+ saved_indices = []
404
+ if isSavingEachIteration and 0 in save_indices_all:
405
+ drv.memcpy_dtoh(x_host, x_gpu)
406
+ x0 = x_host.reshape((Z, X)).copy()
407
+ if maxy > 0:
408
+ x0 *= maxy
409
+ I_reconMatrix.append(x0)
410
+ saved_indices.append(0)
411
+
412
+ # main loop
413
+ try:
414
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
415
+ for it in iterator:
416
+ # 1) dual p update (TV)
417
+ gradient_kernel(grad_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
418
+ block=(block_size, 1, 1),
419
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
420
+ stream=stream)
421
+ _call_axpby(axpby_kernel, p_gpu, p_gpu, grad_gpu, 1.0, sigma, 2 * ZX, stream, block_size)
422
+ proj_tv_kernel(p_gpu, np.float32(alpha), np.int32(ZX),
423
+ block=(block_size, 1, 1),
424
+ grid=((ZX + block_size - 1) // block_size, 1, 1),
425
+ stream=stream)
426
+
427
+ # 2) dual q update (data fidelity)
428
+ projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
429
+ x_tilde_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
430
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
431
+ _call_axpby(axpby_kernel, Ax_gpu, Ax_gpu, y_gpu, 1.0, -1.0, TN, stream, block_size)
432
+ _call_axpby(axpby_kernel, q_gpu, q_gpu, Ax_gpu, 1.0 / (1.0 + sigma), sigma / (1.0 + sigma), TN, stream, block_size)
433
+
434
+ # optional Laplacian dual update
435
+ if use_lap:
436
+ # compute Laplacian of x_tilde -> lap_gpu
437
+ laplacian_kernel(lap_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
438
+ block=(block_size, 1, 1),
439
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
440
+ stream=stream)
441
+ # r = r + sigma * lap
442
+ _call_axpby(axpby_kernel, r_gpu, r_gpu, lap_gpu, 1.0, sigma, ZX, stream, block_size)
443
+ # r = r / (1 + sigma * beta_lap)
444
+ _call_axpby(axpby_kernel, r_gpu, r_gpu, zero_gpu, inv_1_plus_sigma_beta, 0.0, ZX, stream, block_size)
445
+
446
+ # 3) primal x update
447
+ drv.memcpy_dtod_async(x_old_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
448
+ divergence_kernel(div_gpu, p_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
449
+ block=(block_size, 1, 1),
450
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
451
+ stream=stream)
452
+ drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
453
+ backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
454
+ q_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
455
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
456
+ # ATq - div
457
+ _call_minus_axpy(minus_axpy_kernel, ATq_gpu, div_gpu, 1.0, ZX, stream, block_size)
458
+
459
+ # if laplacian is used, add H^T r into ATq
460
+ if use_lap:
461
+ # compute laplacian_adj_kernel(temp, r)
462
+ # reuse grad_gpu as a temporary buffer (its TV-step gradient is not needed again until the next iteration)
463
+ laplacian_adj_kernel(grad_gpu, r_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
464
+ block=(block_size, 1, 1),
465
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
466
+ stream=stream)
467
+ # ATq_gpu += temp (grad_gpu)
468
+ _call_axpby(axpby_kernel, ATq_gpu, ATq_gpu, grad_gpu, 1.0, 1.0, ZX, stream, block_size)
469
+
470
+ # x = x_old - tau * ATq_buffer
471
+ _call_minus_axpy(minus_axpy_kernel, x_gpu, ATq_gpu, tau, ZX, stream, block_size)
472
+
473
+ # Tikhonov
474
+ if beta > 0:
475
+ if tikhonov_as_gradient:
476
+ mul = 1.0 - 2.0 * float(tau) * float(beta)
477
+ if mul <= 0.0:
478
+ # fall back to the stable multiplicative prox scaling
479
+ fallback_scale = np.float32(1.0 / (1.0 + 2.0 * float(tau) * float(beta)))
480
+ _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, fallback_scale, 0.0, ZX, stream, block_size)
481
+ else:
482
+ # x *= mul => implemented as axpby: out = 1* x + (mul-1)*x
483
+ _call_axpby(axpby_kernel, x_gpu, x_gpu, x_gpu, 1.0, np.float32(mul - 1.0), ZX, stream, block_size)
484
+ else:
485
+ _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, tikh_scale, np.float32(0.0), ZX, stream, block_size)
486
+
487
+ # positivity clamp (prefer GPU kernel if available)
488
+ if apply_positivity_clamp:
489
+ if has_clamp_kernel:
490
+ # in-place clamp on GPU
491
+ clamp_positive_kernel(x_gpu, np.int32(ZX),
492
+ block=(block_size, 1, 1),
493
+ grid=((ZX + block_size - 1) // block_size, 1, 1),
494
+ stream=stream)
495
+ else:
496
+ # fallback CPU roundtrip (slower)
497
+ stream.synchronize()
498
+ drv.memcpy_dtoh(x_host, x_gpu)
499
+ np.maximum(x_host, 0.0, out=x_host)
500
+ drv.memcpy_htod_async(x_gpu, x_host, stream)
501
+
502
+ # extrapolation
503
+ _call_axpby(axpby_kernel, x_tilde_gpu, x_gpu, x_old_gpu, np.float32(1.0 + theta), np.float32(-theta), ZX, stream, block_size)
504
+
505
+ # saves
506
+ if isSavingEachIteration and (it + 1) in save_indices_all:
507
+ stream.synchronize()
508
+ drv.memcpy_dtoh(x_host, x_gpu)
509
+ x_saved = x_host.reshape((Z, X)).copy()
510
+ if maxy > 0:
511
+ x_saved *= maxy
512
+ I_reconMatrix.append(x_saved)
513
+ saved_indices.append(it + 1)
514
+
515
+ stream.synchronize()
516
+ drv.memcpy_dtoh(x_host, x_gpu)
517
+ x_final = x_host.reshape((Z, X)).copy()
518
+ if maxy > 0:
519
+ x_final *= maxy
520
+ if isSavingEachIteration and len(I_reconMatrix):
521
+ for i in range(len(I_reconMatrix)):
522
+ I_reconMatrix[i] *= maxy
523
+
524
+ # free buffers
525
+ for buff in bufs:
526
+ try:
527
+ buff.free()
528
+ except:
529
+ pass
530
+
531
+ if SMatrix.ctx:
532
+ SMatrix.ctx.pop()
533
+
534
+ if isSavingEachIteration:
535
+ return I_reconMatrix, saved_indices
536
+ else:
537
+ return x_final, None
538
+
539
+ except Exception as e:
540
+ # cleanup robustly
541
+ print("Error in CP_TV_Tikhonov+Lap (robust):", e)
542
+ try:
543
+ for buff in bufs:
544
+ try:
545
+ buff.free()
546
+ except:
547
+ pass
548
+ except:
549
+ pass
550
+ try:
551
+ if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
552
+ SMatrix.ctx.pop()
553
+ except:
554
+ pass
555
+ raise
556
+
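The two Tikhonov modes above reduce to simple rescalings of the primal iterate:

    x \leftarrow (1 - 2\tau\beta)\,x        (tikhonov_as_gradient=True, explicit gradient step)
    x \leftarrow x / (1 + 2\tau\beta)        (default proximal step, unconditionally stable)

which is why the gradient mode falls back to the proximal scaling whenever 1 - 2\tau\beta \le 0.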
126
557
 
127
558
  def CP_KL(
128
559
  SMatrix,
@@ -132,9 +563,10 @@ def CP_KL(
132
563
  numIterations=5000,
133
564
  isSavingEachIteration=True,
134
565
  L=None,
135
- withTumor=True,
566
+ tumor_str="",
136
567
  device=None,
137
568
  max_saves=5000,
569
+ show_logs=True,
138
570
  ):
139
571
  """
140
572
  Chambolle-Pock algorithm for Kullback-Leibler (KL) divergence regularization.
@@ -193,12 +625,11 @@ def CP_KL(
193
625
  saved_indices = [0]
194
626
 
195
627
  # Description for progress bar
196
- tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
197
628
  device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
198
629
  description = f"AOT-BioMaps -- Primal/Dual Reconstruction (KL) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
199
630
 
200
- # Main loop
201
- for iteration in trange(numIterations, desc=description):
631
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
632
+ for iteration in iterator:
202
633
  # Update q (proximal step for F*)
203
634
  q = prox_F_star(q + sigma * P(x_tilde) - sigma * y_flat, sigma, y_flat)
204
635
 
@@ -224,27 +224,20 @@ class SparseSMatrix_CSR:
224
224
  def getMatrixSize(self):
225
225
  """
226
226
  Returns the total size of the CSR matrix in GB (summing the GPU memory).
227
+ Uses the stored size attributes to work around the AttributeError raised by DeviceAllocation.
227
228
  """
229
+ # Note: the caller must ensure that self.row_ptr exists before this call.
228
230
  if self.row_ptr is None:
229
231
  return {"error": "The sparse matrix has not been allocated yet."}
230
232
 
231
233
  total_bytes = 0
232
-
233
- # GPU memory (row_ptr_gpu, col_ind_gpu, values_gpu, norm_factor_inv_gpu)
234
- if hasattr(self, 'row_ptr_gpu') and self.row_ptr_gpu:
235
- total_bytes += self.row_ptr_gpu.size
236
- if hasattr(self, 'col_ind_gpu') and self.col_ind_gpu:
237
- total_bytes += self.col_ind_gpu.size
238
- if hasattr(self, 'values_gpu') and self.values_gpu:
239
- total_bytes += self.values_gpu.size
240
- if hasattr(self, 'norm_factor_inv_gpu') and self.norm_factor_inv_gpu:
241
- total_bytes += self.norm_factor_inv_gpu.size
242
-
243
- # NOTE: Previous versions used the .size attribute of the DeviceAllocation object,
244
- # which was problematic. If the error reappears here, the size in bytes will have to
245
- # be stored, as was done for SELL.
246
- # For now, the original CSR getMatrixSize method is kept.
247
234
 
235
+ # Sum of the stored sizes (computed and assigned in allocate and compute_norm_factor_from_csr)
236
+ total_bytes += getattr(self, 'row_ptr_gpu_size', 0)
237
+ total_bytes += getattr(self, 'col_ind_gpu_size', 0)
238
+ total_bytes += getattr(self, 'values_gpu_size', 0)
239
+ total_bytes += getattr(self, 'norm_factor_inv_gpu_size', 0)
240
+
248
241
  return total_bytes / (1024**3)
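The fix above works by recording the byte count of every buffer next to its allocation, since pycuda's DeviceAllocation objects do not expose the size of the underlying memory. A minimal sketch of that pattern (the helper below is hypothetical, not part of the package; only the *_gpu / *_gpu_size attribute naming mirrors the code above):

import numpy as np
import pycuda.driver as drv  # assumes a CUDA context is already active (e.g. via pycuda.autoinit)

def _alloc_and_track(obj, name, host_array):
    # hypothetical helper: allocate device memory, upload the data, remember its size in bytes
    buf = drv.mem_alloc(host_array.nbytes)
    drv.memcpy_htod(buf, host_array)
    setattr(obj, name + "_gpu", buf)
    setattr(obj, name + "_gpu_size", host_array.nbytes)  # later summed by getMatrixSize
    return buf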
249
242
 
250
243
  def free(self):
@@ -92,13 +92,11 @@ class SparseSMatrix_SELL:
92
92
  def allocate(self):
93
93
  """
94
94
  Build SELL-C-σ directly from manip AcousticFields in streaming blocks.
95
- NOTE: This is the logic of allocate_sell_c_sigma_direct from the working class.
95
+ Corrected: per-block row_nnz copy, zeroing of host block, proper sync.
96
96
  """
97
97
  if self.sparse_mod is None:
98
98
  raise RuntimeError("CUDA module not loaded. Check compilation.")
99
99
 
100
- # NOTE: Les noms de kernel (count_nnz_rows_kernel, fill_kernel__SELL) sont utilisés
101
- # car ils sont présents dans la classe fonctionnelle.
102
100
  count_kernel = self.sparse_mod.get_function("count_nnz_rows_kernel")
103
101
  fill_kernel = self.sparse_mod.get_function("fill_kernel__SELL")
104
102
 
@@ -106,16 +104,14 @@ class SparseSMatrix_SELL:
106
104
  num_cols = int(self.Z * self.X)
107
105
  C = int(self.slice_height)
108
106
 
109
- # host temporary block
110
107
  br = int(self.block_rows)
111
- bytes_per_elem = np.dtype(np.float32).itemsize
112
108
  dense_host = np.empty((br, num_cols), dtype=np.float32)
113
109
 
114
- # Allocation 1: Dense block GPU memory
110
+ # Allocate the dense block buffer on the device (br * num_cols float32 values)
115
111
  dense_gpu_size = dense_host.nbytes
116
112
  dense_gpu = drv.mem_alloc(dense_gpu_size)
117
113
 
118
- # 1) count nnz per row (on host via small blocks with GPU kernel)
114
+ # 1) count nnz per row (per block)
119
115
  row_nnz = np.zeros(num_rows, dtype=np.int32)
120
116
  row_nnz_gpu_block_size = br * np.dtype(np.int32).itemsize
121
117
  row_nnz_gpu_block = drv.mem_alloc(row_nnz_gpu_block_size)
@@ -123,17 +119,19 @@ class SparseSMatrix_SELL:
123
119
  block = 256
124
120
  for b in trange(0, num_rows, br, desc="Count NNZ per row"):
125
121
  R = min(br, num_rows - b)
126
- # fill dense_host
122
+ # zero the host block to avoid garbage in tail when R < br
123
+ dense_host.fill(0.0)
127
124
  for i in range(R):
128
125
  rg = b + i
129
126
  n_idx = rg // self.T
130
127
  t_idx = rg % self.T
131
128
  dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
132
- # copy only R rows
129
+ # copy whole buffer (safe because we zeroed tail)
133
130
  drv.memcpy_htod(dense_gpu, dense_host)
134
131
  grid = ((R + block - 1) // block, 1, 1)
135
132
  count_kernel(dense_gpu, row_nnz_gpu_block, np.int32(R), np.int32(num_cols), np.float32(self.relative_threshold),
136
- block=(block,1,1), grid=grid)
133
+ block=(block,1,1), grid=grid)
134
+ drv.Context.synchronize()
137
135
  tmp = np.empty(R, dtype=np.int32)
138
136
  drv.memcpy_dtoh(tmp, row_nnz_gpu_block)
139
137
  row_nnz[b:b+R] = tmp
@@ -148,7 +146,6 @@ class SparseSMatrix_SELL:
148
146
  r0 = s * C
149
147
  r1 = min(num_rows, r0 + C)
150
148
  slice_len[s] = int(np.max(row_nnz[r0:r1])) if (r1>r0) else 0
151
- # slice_ptr (int64)
152
149
  slice_ptr = np.zeros(num_slices + 1, dtype=np.int64)
153
150
  for s in range(num_slices):
154
151
  slice_ptr[s+1] = slice_ptr[s] + (slice_len[s] * C)
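The slice bookkeeping just above is easy to reproduce in isolation. A toy example with invented row counts (C rows per slice, each slice padded to its longest row; slice_ptr[-1] is the total padded storage from which the values/colinds buffers are sized further down):

import numpy as np

C = 4                                                          # slice height
row_nnz = np.array([3, 1, 2, 0, 5, 4, 4, 1], dtype=np.int32)   # toy per-row nnz counts
num_slices = (len(row_nnz) + C - 1) // C

slice_len = np.array([int(row_nnz[s*C:(s+1)*C].max()) for s in range(num_slices)], dtype=np.int32)
slice_ptr = np.zeros(num_slices + 1, dtype=np.int64)
for s in range(num_slices):
    slice_ptr[s+1] = slice_ptr[s] + slice_len[s] * C           # padded storage per slice

print(slice_len, slice_ptr)   # [3 5] [ 0 12 32] -> 32 padded entries in values/colinds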
@@ -160,9 +157,14 @@ class SparseSMatrix_SELL:
160
157
  self.sell_values_gpu_size = total_storage * np.dtype(np.float32).itemsize
161
158
  self.sell_colinds_gpu_size = total_storage * np.dtype(np.uint32).itemsize
162
159
 
160
+ # allocate and optionally zero them
163
161
  self.sell_values_gpu = drv.mem_alloc(self.sell_values_gpu_size)
162
+ # Zero the values buffer so padded entries never contain leftover device data
163
+ drv.memset_d32(self.sell_values_gpu, 0, total_storage)
164
+
164
165
  self.sell_colinds_gpu = drv.mem_alloc(self.sell_colinds_gpu_size)
165
-
166
+ drv.memset_d32(self.sell_colinds_gpu, 0, total_storage)
167
+
166
168
  # allocate slice metadata on device
167
169
  self.slice_ptr = slice_ptr
168
170
  self.slice_len = slice_len
@@ -177,29 +179,28 @@ class SparseSMatrix_SELL:
177
179
  drv.memcpy_htod(self.slice_len_gpu, self.slice_len)
178
180
 
179
181
  # 3) fill SELL arrays by streaming blocks again (use GPU fill kernel)
180
- # reuse dense_host and allocate new dense_gpu
181
182
  dense_host = np.empty((br, num_cols), dtype=np.float32)
183
+ dense_gpu = drv.mem_alloc(dense_host.nbytes)
182
184
 
183
- dense_gpu_2_size = dense_host.nbytes
184
- dense_gpu = drv.mem_alloc(dense_gpu_2_size)
185
-
186
- # we also need row_nnz on device per-block; supply global row_nnz on host but the kernel recomputes threshold
187
- row_nnz_host_gpu_size = br * np.dtype(np.int32).itemsize
188
- row_nnz_host_gpu = drv.mem_alloc(row_nnz_host_gpu_size)
185
+ # For per-block row_nnz pointer we allocate a buffer of max block size once, then reuse
186
+ row_nnz_host_gpu = drv.mem_alloc(br * np.dtype(np.int32).itemsize)
189
187
 
190
188
  for b in trange(0, num_rows, br, desc="Fill SELL"):
191
189
  R = min(br, num_rows - b)
190
+ dense_host.fill(0.0)
192
191
  for i in range(R):
193
192
  rg = b + i
194
193
  n_idx = rg // self.T
195
194
  t_idx = rg % self.T
196
195
  dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
196
+ # copy host block
197
197
  drv.memcpy_htod(dense_gpu, dense_host)
198
- # We pass a dummy row_nnz pointer (not used in this kernel; left for API)
199
- # Kernel expects rows_in_block, rows_global_offset to know where to write.
198
+ # copy corresponding row_nnz slice (only R entries)
199
+ drv.memcpy_htod(row_nnz_host_gpu, row_nnz[b:b+R])
200
+
200
201
  grid = ((R + block - 1) // block, 1, 1)
201
202
  fill_kernel(dense_gpu,
202
- np.intp(0), # placeholder for row_nnz pointer (not used)
203
+ row_nnz_host_gpu,
203
204
  self.slice_ptr_gpu,
204
205
  self.slice_len_gpu,
205
206
  self.sell_colinds_gpu,
@@ -210,12 +211,14 @@ class SparseSMatrix_SELL:
210
211
  np.int32(C),
211
212
  np.float32(self.relative_threshold),
212
213
  block=(block,1,1), grid=grid)
214
+ drv.Context.synchronize()
215
+
213
216
  dense_gpu.free()
214
217
  row_nnz_host_gpu.free()
215
218
 
216
219
  # 4) compute norm_factor_inv via GPU accumulate (col sums)
217
220
  self.compute_norm_factor()
218
-
221
+
219
222
  def apply_apodization_gpu(self, window_vector_gpu):
220
223
  """
221
224
  Apply the apodization window directly to self.sell_values_gpu