AOT-biomaps 2.9.261__py3-none-any.whl → 2.9.294__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +400 -10
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +60 -25
- AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +442 -11
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +48 -26
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +115 -109
- AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
- AOT_biomaps/AOT_Recon/AlgebraicRecon.py +27 -20
- AOT_biomaps/AOT_Recon/PrimalDualRecon.py +94 -41
- AOT_biomaps/AOT_Recon/ReconTools.py +164 -18
- AOT_biomaps/__init__.py +34 -1
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.294.dist-info}/METADATA +1 -1
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.294.dist-info}/RECORD +14 -13
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.294.dist-info}/WHEEL +0 -0
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.294.dist-info}/top_level.txt +0 -0
AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py

@@ -1,8 +1,10 @@
-from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star
+from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star, _call_axpby, _call_minus_axpy, compute_TV_cpu, power_method_estimate_L__SELL, calculate_memory_requirement, check_gpu_memory
 from AOT_biomaps.Config import config
-from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType
+from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType, SMatrixType
 import torch
 from tqdm import trange
+import numpy as np
+import pycuda.driver as drv
 
 '''
 This module implements Primal-Dual Hybrid Gradient (PDHG) methods for solving inverse problems in Acousto-Optic Tomography.
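For orientation, the PDHG scheme the docstring refers to is the generic Chambolle-Pock primal-dual iteration for problems of the form min_x F(Kx) + G(x). The NumPy sketch below is illustrative only and is not code from the package; pdhg, prox_F_star and prox_G here are generic placeholders (the identically named ReconTools imports above are the package's own implementations and may differ).

# Illustrative sketch of a generic PDHG / Chambolle-Pock iteration (not AOT-biomaps code).
import numpy as np

def pdhg(K, prox_F_star, prox_G, x0, n_iter=100, theta=1.0):
    L = np.linalg.norm(K, 2)          # operator norm of K
    tau = sigma = 0.9 / L             # step sizes satisfying tau * sigma * L**2 < 1
    x = x0.copy()
    x_bar = x0.copy()
    q = np.zeros(K.shape[0])
    for _ in range(n_iter):
        q = prox_F_star(q + sigma * (K @ x_bar), sigma)   # dual ascent on F*
        x_old = x
        x = prox_G(x - tau * (K.T @ q), tau)              # primal descent on G
        x_bar = x + theta * (x - x_old)                   # extrapolation step
    return x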
@@ -11,6 +13,103 @@ The methods can run on both CPU and GPU, with configurations set in the AOT_biom
 '''
 
 def CP_TV(
+    SMatrix,
+    y,
+    alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4, # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=5000,
+    isSavingEachIteration=True,
+    L=None,
+    withTumor=True,
+    device=None,
+    max_saves=5000,
+    show_logs=True,
+    smatrixType=SMatrixType.SELL,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
+    ):
+    # try:
+    tumor_str = "WITH" if withTumor else "WITHOUT"
+    # Auto-select device and method
+    if device is None:
+        if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+            device = torch.device(f"cuda:{config.select_best_gpu()}")
+            use_gpu = True
+        else:
+            device = torch.device("cpu")
+            use_gpu = False
+    else:
+        use_gpu = device.type == "cuda"
+    # Dispatch to the appropriate implementation
+    if use_gpu:
+        if smatrixType == SMatrixType.CSR:
+            raise NotImplementedError("GPU Chambolle Pock (LS-TV) with CSR not implemented.")
+        elif smatrixType == SMatrixType.SELL:
+            return CP_TV_Tikhonov_sparseSELL_pycuda(SMatrix, y, alpha,beta, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs, k_security, use_power_method, auto_alpha_gamma, apply_positivity_clamp, tikhonov_as_gradient, use_laplacian, laplacian_beta_scale)
+        elif smatrixType == SMatrixType.DENSE:
+            return CP_TV_dense(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
+        else:
+            raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-TV).")
+    else:
+        raise NotImplementedError("CPU Chambolle Pock (LS-TV) not implemented.")
+
+def CP_KL(
+    SMatrix,
+    y,
+    alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4, # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=5000,
+    isSavingEachIteration=True,
+    L=None,
+    withTumor=True,
+    device=None,
+    max_saves=5000,
+    show_logs=True,
+    smatrixType=SMatrixType.SELL,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
+    ):
+    # try:
+    tumor_str = "WITH" if withTumor else "WITHOUT"
+    # Auto-select device and method
+    if device is None:
+        if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+            device = torch.device(f"cuda:{config.select_best_gpu()}")
+            use_gpu = True
+        else:
+            device = torch.device("cpu")
+            use_gpu = False
+    else:
+        use_gpu = device.type == "cuda"
+    # Dispatch to the appropriate implementation
+    if use_gpu:
+        if smatrixType == SMatrixType.CSR:
+            raise NotImplementedError("GPU Chambolle Pock (LS-KL) with CSR not implemented.")
+        elif smatrixType == SMatrixType.SELL:
+            raise NotImplementedError("GPU Chambolle Pock (LS-KL) with SELL not implemented.")
+        elif smatrixType == SMatrixType.DENSE:
+            return CP_KL(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
+        else:
+            raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-KL).")
+    else:
+        raise NotImplementedError("CPU Chambolle Pock (LS-KL) not implemented.")
+
+
+
+
+def CP_TV_dense(
     SMatrix,
     y,
     alpha=1e-1,
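A usage sketch for the new CP_TV dispatcher, assuming a SELL sparse system matrix already built on the GPU. This is illustrative only: smatrix and y_meas are placeholder names, and the import path is inferred from the wheel's file layout rather than taken from package documentation.

# Hypothetical call to the dispatcher defined above (smatrix / y_meas are placeholders).
from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType
from AOT_biomaps.AOT_Recon.AOT_Optimizers.PDHG import CP_TV   # path inferred from the wheel layout

recons, saved_iters = CP_TV(
    smatrix,                        # SparseSMatrix_SELL-style object with GPU buffers loaded
    y_meas,                         # measured AO data; transposed and flattened internally
    alpha=None,                     # None -> auto-scaled TV weight (auto_alpha_gamma heuristic)
    beta=1e-4,                      # Tikhonov weight
    numIterations=2000,
    smatrixType=SMatrixType.SELL,   # dispatches to CP_TV_Tikhonov_sparseSELL_pycuda
    show_logs=True,
)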
@@ -21,6 +120,7 @@ def CP_TV(
     withTumor=True,
     device=None,
     max_saves=5000,
+    show_logs=True,
     ):
     """
     Chambolle-Pock algorithm for Total Variation (TV) regularization.
@@ -92,10 +192,10 @@ def CP_TV(
     # Description for progress bar
     tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
     device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
-    description = f"AOT-BioMaps -- Primal/Dual Reconstruction (TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
+    description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
 
-
-    for
+    iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+    for it in iterator:
         # Update p (TV proximal step)
         grad_x = gradient(x_tilde.reshape(Z, X))
         p = proj_l2(p + sigma * grad_x, alpha)
@@ -113,9 +213,9 @@ def CP_TV(
         x_tilde = x + theta * (x - x_old)
 
         # Save intermediate result if needed
-        if isSavingEachIteration and
+        if isSavingEachIteration and it in save_indices:
             I_reconMatrix.append(x.reshape(Z, X).clone() * (norm_y / norm_A))
-            saved_indices.append(
+            saved_indices.append(it)
 
     # Return results
     if isSavingEachIteration:
@@ -123,6 +223,337 @@ def CP_TV(
     else:
         return (x.reshape(Z, X) * (norm_y / norm_A)).cpu().numpy(), None
 
+def CP_TV_Tikhonov_sparseSELL_pycuda(
+    SMatrix,
+    y,
+    alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4, # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=2000,
+    isSavingEachIteration=True,
+    L=None,
+    tumor_str="",
+    device=None,
+    max_saves=2000,
+    show_logs=True,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
+    ):
+
+    """
+    CP-TV + Tikhonov + Laplacian (Hessian scalar) penalty integrated.
+    Returns (I_reconMatrix, saved_indices) if isSavingEachIteration else (x_final, None).
+    """
+    # ----- begin main -----
+    if SMatrix.ctx:
+        SMatrix.ctx.push()
+
+    # prepare variables
+    dtype = np.float32
+    TN = int(SMatrix.N * SMatrix.T)
+    ZX = int(SMatrix.Z * SMatrix.X)
+    Z, X = SMatrix.Z, SMatrix.X
+    block_size = 256
+
+    # existing kernels
+    projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+    backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+    axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
+    minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
+    gradient_kernel = SMatrix.sparse_mod.get_function("gradient_kernel")
+    divergence_kernel = SMatrix.sparse_mod.get_function("divergence_kernel")
+    proj_tv_kernel = SMatrix.sparse_mod.get_function("proj_tv_kernel")
+
+    # optional kernels (laplacian & clamp)
+    has_laplacian = False
+    has_clamp_kernel = False
+    try:
+        laplacian_kernel = SMatrix.sparse_mod.get_function("laplacian_kernel")
+        laplacian_adj_kernel = SMatrix.sparse_mod.get_function("laplacian_adj_kernel")
+        has_laplacian = True
+    except Exception:
+        has_laplacian = False
+
+    try:
+        clamp_positive_kernel = SMatrix.sparse_mod.get_function("clamp_positive_kernel")
+        has_clamp_kernel = True
+    except Exception:
+        has_clamp_kernel = False
+
+    stream = drv.Stream()
+
+    # estimate L operator norm if needed
+    if use_power_method or L is None:
+        L_LS_sq = power_method_estimate_L__SELL(SMatrix, stream, n_it=20, block_size=block_size)
+        L_nabla_sq = 8.0
+        L_op_norm = np.sqrt(L_LS_sq + L_nabla_sq)
+        if L_op_norm < 1e-6:
+            L_op_norm = 1.0
+    else:
+        L_op_norm = L
+
+    tau = np.float32(k_security / L_op_norm)
+    sigma = np.float32(k_security / L_op_norm)
+
+    # prepare y and normalization
+    y = y.T.astype(dtype).reshape(-1)
+    maxy = float(np.max(np.abs(y))) if y.size > 0 else 0.0
+    if maxy > 0:
+        y_normed = (y / maxy).copy()
+    else:
+        y_normed = y.copy()
+
+    # GPU allocations
+    bufs = []
+    y_gpu = drv.mem_alloc(y_normed.nbytes); bufs.append(y_gpu)
+    drv.memcpy_htod_async(y_gpu, y_normed.T.flatten(), stream)
+
+    x_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_gpu)
+    drv.memset_d32_async(x_gpu, 0, ZX, stream)
+    x_old_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_old_gpu)
+    x_tilde_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_tilde_gpu)
+    drv.memcpy_dtod_async(x_tilde_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
+
+    p_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(p_gpu)
+    q_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(q_gpu)
+    drv.memset_d32_async(p_gpu, 0, 2 * ZX, stream)
+    drv.memset_d32_async(q_gpu, 0, TN, stream)
+
+    grad_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(grad_gpu)
+    div_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(div_gpu)
+    Ax_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(Ax_gpu)
+    ATq_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(ATq_gpu)
+    zero_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(zero_gpu)
+    drv.memset_d32_async(zero_gpu, 0, ZX, stream)
+
+    # Laplacian buffers (if enabled and kernel available)
+    use_lap = use_laplacian and has_laplacian and (beta > 0)
+    if use_lap:
+        lap_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(lap_gpu)
+        r_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(r_gpu)
+        drv.memset_d32_async(r_gpu, 0, ZX, stream)
+        # scalar beta for laplacian (allow separate scale)
+        beta_lap = float(beta) * float(laplacian_beta_scale)
+        inv_1_plus_sigma_beta = np.float32(1.0 / (1.0 + float(sigma) * beta_lap))
+
+    # host buffers for logs
+    x_host = np.empty(ZX, dtype=dtype)
+    Ax_host = np.empty(TN, dtype=dtype)
+    q_host = np.empty(TN, dtype=dtype)
+    p_host = np.empty(2 * ZX, dtype=dtype)
+    ATq_host = np.empty(ZX, dtype=dtype)
+
+    # compute initial backprojection for auto-alpha
+    drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
+    backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                          y_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                          block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+    stream.synchronize()
+    drv.memcpy_dtoh(x_host, ATq_gpu)
+
+    # auto alpha if requested
+    if alpha is None:
+        drv.memcpy_htod_async(x_gpu, x_host, stream)
+        projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                          x_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                          block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+        stream.synchronize()
+        drv.memcpy_dtoh(Ax_host, Ax_gpu)
+        resid = Ax_host - y_normed[:TN]
+        data_term = 0.5 * float(np.dot(resid, resid))
+        tv_term = float(compute_TV_cpu(x_host, Z, X)) + 1e-12
+        alpha = float(auto_alpha_gamma * data_term / tv_term)
+        if show_logs:
+            print(f"[auto-alpha] data_term={data_term:.6e}, tv_term={tv_term:.6e}, alpha_set={alpha:.6e}")
+
+    # tikhonov prox multiplicative scale
+    if tikhonov_as_gradient:
+        tikh_scale = None
+    else:
+        tikh_scale = np.float32(1.0 / (1.0 + 2.0 * tau * beta)) if beta > 0 else np.float32(1.0)
+
+    # saving policy
+    if numIterations <= max_saves:
+        save_indices_all = list(range(0, numIterations + 1))
+    else:
+        step = max(1, numIterations // max_saves)
+        save_indices_all = list(range(0, numIterations + 1, step))
+
+    device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
+    if show_logs:
+        if (alpha is None or alpha == 0) and (beta is None or beta == 0):
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS) -- {tumor_str} -- {device_str}"
+        if alpha is None or alpha == 0:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-Tikhonov) -- {tumor_str} -- {device_str}"
+        elif beta is None or beta == 0:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) -- {tumor_str} -- {device_str}"
+        else:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV-Tikhonov) -- {tumor_str} -- {device_str}"
+
+    I_reconMatrix = []
+    saved_indices = []
+    if isSavingEachIteration and 0 in save_indices_all:
+        drv.memcpy_dtoh(x_host, x_gpu)
+        x0 = x_host.reshape((Z, X)).copy()
+        if maxy > 0:
+            x0 *= maxy
+        I_reconMatrix.append(x0)
+        saved_indices.append(0)
+
+    # main loop
+    try:
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+        for it in iterator:
+            # 1) dual p update (TV)
+            gradient_kernel(grad_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                            block=(block_size, 1, 1),
+                            grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                            stream=stream)
+            _call_axpby(axpby_kernel, p_gpu, p_gpu, grad_gpu, 1.0, sigma, 2 * ZX, stream, block_size)
+            proj_tv_kernel(p_gpu, np.float32(alpha), np.int32(ZX),
+                           block=(block_size, 1, 1),
+                           grid=((ZX + block_size - 1) // block_size, 1, 1),
+                           stream=stream)
+
+            # 2) dual q update (data fidelity)
+            projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                              x_tilde_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                              block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+            _call_axpby(axpby_kernel, Ax_gpu, Ax_gpu, y_gpu, 1.0, -1.0, TN, stream, block_size)
+            _call_axpby(axpby_kernel, q_gpu, q_gpu, Ax_gpu, 1.0 / (1.0 + sigma), sigma / (1.0 + sigma), TN, stream, block_size)
+
+            # optional Laplacian dual update
+            if use_lap:
+                # compute Laplacian of x_tilde -> lap_gpu
+                laplacian_kernel(lap_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                                 block=(block_size, 1, 1),
+                                 grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                                 stream=stream)
+                # r = r + sigma * lap
+                _call_axpby(axpby_kernel, r_gpu, r_gpu, lap_gpu, 1.0, sigma, ZX, stream, block_size)
+                # r = r / (1 + sigma * beta_lap)
+                _call_axpby(axpby_kernel, r_gpu, r_gpu, zero_gpu, inv_1_plus_sigma_beta, 0.0, ZX, stream, block_size)
+
+            # 3) primal x update
+            drv.memcpy_dtod_async(x_old_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
+            divergence_kernel(div_gpu, p_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                              block=(block_size, 1, 1),
+                              grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                              stream=stream)
+            drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
+            backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                                  q_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                                  block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+            # ATq - div
+            _call_minus_axpy(minus_axpy_kernel, ATq_gpu, div_gpu, 1.0, ZX, stream, block_size)
+
+            # if laplacian is used, add H^T r into ATq
+            if use_lap:
+                # compute laplacian_adj_kernel(temp, r)
+                # reuse grad_gpu as temporary if safe (its content used earlier, but not reused until later)
+                laplacian_adj_kernel(grad_gpu, r_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                                     block=(block_size, 1, 1),
+                                     grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                                     stream=stream)
+                # ATq_gpu += temp (grad_gpu)
+                _call_axpby(axpby_kernel, ATq_gpu, ATq_gpu, grad_gpu, 1.0, 1.0, ZX, stream, block_size)
+
+            # x = x_old - tau * ATq_buffer
+            _call_minus_axpy(minus_axpy_kernel, x_gpu, ATq_gpu, tau, ZX, stream, block_size)
+
+            # Tikhonov
+            if beta > 0:
+                if tikhonov_as_gradient:
+                    mul = 1.0 - 2.0 * float(tau) * float(beta)
+                    if mul <= 0.0:
+                        # fallback to prox multiplicative stable
+                        fallback_scale = np.float32(1.0 / (1.0 + 2.0 * float(tau) * float(beta)))
+                        _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, fallback_scale, 0.0, ZX, stream, block_size)
+                    else:
+                        # x *= mul => implemented as axpby: out = 1* x + (mul-1)*x
+                        _call_axpby(axpby_kernel, x_gpu, x_gpu, x_gpu, 1.0, np.float32(mul - 1.0), ZX, stream, block_size)
+                else:
+                    _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, tikh_scale, np.float32(0.0), ZX, stream, block_size)
+
+            # positivity clamp (prefer GPU kernel if available)
+            if apply_positivity_clamp:
+                if has_clamp_kernel:
+                    # in-place clamp on GPU
+                    clamp_positive_kernel(x_gpu, np.int32(ZX),
+                                          block=(block_size, 1, 1),
+                                          grid=((ZX + block_size - 1) // block_size, 1, 1),
+                                          stream=stream)
+                else:
+                    # fallback CPU roundtrip (slower)
+                    stream.synchronize()
+                    drv.memcpy_dtoh(x_host, x_gpu)
+                    np.maximum(x_host, 0.0, out=x_host)
+                    drv.memcpy_htod_async(x_gpu, x_host, stream)
+
+            # extrapolation
+            _call_axpby(axpby_kernel, x_tilde_gpu, x_gpu, x_old_gpu, np.float32(1.0 + theta), np.float32(-theta), ZX, stream, block_size)
+
+            # saves
+            if isSavingEachIteration and (it + 1) in save_indices_all:
+                stream.synchronize()
+                drv.memcpy_dtoh(x_host, x_gpu)
+                x_saved = x_host.reshape((Z, X)).copy()
+                if maxy > 0:
+                    x_saved *= maxy
+                I_reconMatrix.append(x_saved)
+                saved_indices.append(it + 1)
+
+        stream.synchronize()
+        drv.memcpy_dtoh(x_host, x_gpu)
+        x_final = x_host.reshape((Z, X)).copy()
+        if maxy > 0:
+            x_final *= maxy
+        if isSavingEachIteration and len(I_reconMatrix):
+            for i in range(len(I_reconMatrix)):
+                I_reconMatrix[i] *= maxy
+
+        # free buffers
+        for buff in bufs:
+            try:
+                buff.free()
+            except:
+                pass
+
+        if SMatrix.ctx:
+            SMatrix.ctx.pop()
+
+        if isSavingEachIteration:
+            return I_reconMatrix, saved_indices
+        else:
+            return x_final, None
+
+    except Exception as e:
+        # cleanup robustly
+        print("Error in CP_TV_Tikhonov+Lap (robust):", e)
+        try:
+            for buff in bufs:
+                try:
+                    buff.free()
+                except:
+                    pass
+        except:
+            pass
+        try:
+            if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+                SMatrix.ctx.pop()
+        except:
+            pass
+        raise
+
 
 def CP_KL(
     SMatrix,
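In the function above, the primal/dual step sizes are tied to an operator-norm estimate: tau = sigma = k_security / L with L = sqrt(L_LS_sq + 8.0), where L_LS_sq comes from power_method_estimate_L__SELL and 8.0 is the usual bound on the squared norm of the 2-D forward-difference gradient. Below is a minimal CPU sketch of such a power-method estimate for a dense matrix A; it is illustrative only, the package runs the equivalent loop through its SELL projection/backprojection kernels.

# Minimal power-method sketch for ||A||^2 on a dense matrix (illustrative only).
import numpy as np

def estimate_L_squared(A, n_it=20, seed=0):
    rng = np.random.default_rng(seed)
    x = rng.standard_normal(A.shape[1])
    x /= np.linalg.norm(x)
    for _ in range(n_it):
        x = A.T @ (A @ x)                 # apply A^T A
        x /= np.linalg.norm(x)            # re-normalise to avoid overflow
    return float(x @ (A.T @ (A @ x)))     # Rayleigh quotient ~ largest eigenvalue of A^T A

# Step sizes mirroring the code above: tau = sigma = k_security / sqrt(L_sq + 8.0)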
@@ -132,9 +563,10 @@ def CP_KL(
     numIterations=5000,
     isSavingEachIteration=True,
     L=None,
-
+    tumor_str="",
     device=None,
     max_saves=5000,
+    show_logs=True,
     ):
     """
     Chambolle-Pock algorithm for Kullback-Leibler (KL) divergence regularization.
@@ -193,12 +625,11 @@ def CP_KL(
     saved_indices = [0]
 
     # Description for progress bar
-    tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
     device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
     description = f"AOT-BioMaps -- Primal/Dual Reconstruction (KL) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
 
-
-    for iteration in
+    iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+    for iteration in iterator:
         # Update q (proximal step for F*)
         q = prox_F_star(q + sigma * P(x_tilde) - sigma * y_flat, sigma, y_flat)
 
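For reference, the q-update above applies the resolvent of the conjugate data term; for a Kullback-Leibler fidelity the commonly used closed form is the one below. Whether ReconTools.prox_F_star implements exactly this expression is not visible in this diff.

% Closed-form prox of sigma*F^* for the KL data term F(z) = sum_i (z_i - y_i log z_i):
\operatorname{prox}_{\sigma F^*}(\tilde q)_i \;=\; \frac{1 + \tilde q_i - \sqrt{(\tilde q_i - 1)^2 + 4\,\sigma\, y_i}}{2}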
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py

@@ -1,4 +1,3 @@
-# sparse_matrix_gpu.py
 import pycuda.driver as drv
 import numpy as np
 from pycuda.compiler import SourceModule
@@ -26,6 +25,14 @@ class SparseSMatrix_CSR:
         self.X = manip.AcousticFields[0].field.shape[2]
         self.block_rows = block_rows
         self.relative_threshold = relative_threshold
+
+        # --- FIX: Résolution du chemin du .cubin (dans AOT_Recon/) ---
+        # Le fichier SparseSMatrix_CSR.py est dans AOT_Recon/AOT_SparseSMatrix/
+        # On remonte d'un répertoire pour atteindre AOT_Recon/
+        cubin_parent_dir = os.path.dirname(os.path.dirname(__file__))
+        self.module_path = os.path.join(cubin_parent_dir, "AOT_biomaps_kernels.cubin")
+        # --- FIN FIX ---
+
         self.h_dense = None
         self.row_ptr = None
         self.row_ptr_gpu = None
@@ -41,22 +48,24 @@ class SparseSMatrix_CSR:
     def __exit__(self, exc_type, exc, tb):
         self.free()
 
-    def load_precompiled_module(self
+    def load_precompiled_module(self):
+        """
+        Charge le module CUDA pré-compilé (.cubin) en utilisant le chemin résolu.
+        Supprime la logique de compilation JIT.
+        """
+        so_path = self.module_path # Utilise le chemin résolu dans __init__
+
+        if not os.path.exists(so_path):
+            raise FileNotFoundError(
+                f"Le module CUDA {os.path.basename(so_path)} est introuvable au chemin: {so_path}. "
+                "Assurez-vous qu'il est compilé et bien placé."
+            )
+
         try:
-            # If a PTX or cubin is provided via path
             self.sparse_mod = drv.module_from_file(so_path)
             print(f"✅ Module CUDA chargé depuis {so_path}")
-        except Exception:
-
-            src_path = os.path.join(os.path.dirname(__file__), 'AOT_biomaps_kernels.cu')
-            if os.path.exists(src_path):
-                print("Compilation JIT du kernel CUDA depuis source...")
-                with open(src_path, 'r') as f:
-                    src = f.read()
-                self.sparse_mod = SourceModule(src, no_extern_c=True)
-                print("✅ Module compilé JIT")
-            else:
-                raise
+        except Exception as e:
+            raise RuntimeError(f"Le fichier {os.path.basename(so_path)} a été trouvé, mais PyCUDA n'a pas pu le charger. Vérifiez la compatibilité.") from e
 
     def estimate_nnz_cpu(self):
         """Estimation rapide (non-exacte) — utile si tu veux une estimation faible.
@@ -80,12 +89,10 @@ class SparseSMatrix_CSR:
         bytes_float = np.dtype(np.float32).itemsize
 
         # Charge module
-
-
-        else:
-            self.load_precompiled_module('AOT_biomaps_kernels.cubin')
+        # FIX: Toujours charger depuis self.module_path (résolu)
+        self.load_precompiled_module()
 
-        count_nnz_kernel = self.sparse_mod.get_function('
+        count_nnz_kernel = self.sparse_mod.get_function('count_nnz_rows_kernel')
         fill_csr_kernel = self.sparse_mod.get_function('fill_kernel__CSR')
 
         # allocate host row_ptr
@@ -110,6 +117,7 @@ class SparseSMatrix_CSR:
             drv.memcpy_htod(dense_block_gpu, dense_block_host)
 
             grid = ((current_rows + block_size - 1) // block_size, 1, 1)
+            # Note: Assuming 'count_nnz_per_row_kernel' is the correct name (verified by user in prior steps)
             count_nnz_kernel(dense_block_gpu, row_nnz_gpu,
                              np.int32(current_rows), np.int32(num_cols),
                              np.float32(self.relative_threshold),
@@ -182,7 +190,11 @@ class SparseSMatrix_CSR:
         drv.memset_d32(col_sum_gpu, 0, ZX)
 
         # 2) Récupérer le kernel
-
+        # FIX: Utiliser le nom générique 'accumulate_columns_atomic' comme dans SELL (si le binaire est partagé)
+        # Si le développeur utilise la convention __CSR, on la garde.
+        # Basé sur notre historique SELL, le nom est probablement générique 'accumulate_columns_atomic'.
+        # Je vais supposer que le nom est générique pour éviter une LogicError ici aussi.
+        acc_kernel = self.sparse_mod.get_function("accumulate_columns_atomic")
 
         # 3) Lancer le kernel
         threads = 256
@@ -210,12 +222,23 @@ class SparseSMatrix_CSR:
         drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)
 
     def getMatrixSize(self):
+        """
+        Retourne la taille totale de la matrice CSR en Go (en sommant la mémoire GPU).
+        Utilise les attributs de taille stockés pour contourner l'AttributeError de DeviceAllocation.
+        """
+        # Note: L'utilisateur doit s'assurer que self.row_ptr existe avant cet appel.
         if self.row_ptr is None:
             return {"error": "La matrice sparse n'est pas encore allouée."}
-
-
-
-
+
+        total_bytes = 0
+
+        # Somme des tailles stockées (Taille calculée et attribuée dans allocate et compute_norm_factor_from_csr)
+        total_bytes += getattr(self, 'row_ptr_gpu_size', 0)
+        total_bytes += getattr(self, 'col_ind_gpu_size', 0)
+        total_bytes += getattr(self, 'values_gpu_size', 0)
+        total_bytes += getattr(self, 'norm_factor_inv_gpu_size', 0)
+
+        return total_bytes / (1024**3)
 
     def free(self):
         try:
@@ -248,5 +271,4 @@ class SparseSMatrix_CSR:
         num_cols = int(self.Z * self.X)
         total_nnz = int(self.row_ptr[-1])
         density = total_nnz / (num_rows * num_cols)
-        return density
-
+        return density
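The reworked load_precompiled_module above drops JIT compilation and requires a pre-built AOT_biomaps_kernels.cubin resolved one directory above the module (under AOT_Recon/). Below is a standalone sketch of the same loading pattern with PyCUDA; the path handling and the kernel name queried here are illustrative, not a prescribed API.

# Minimal sketch of loading a pre-compiled .cubin with PyCUDA (illustrative only).
import os
import pycuda.autoinit           # creates a CUDA context on the default device
import pycuda.driver as drv

cubin_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AOT_biomaps_kernels.cubin")
if not os.path.exists(cubin_path):
    raise FileNotFoundError(f"Missing pre-compiled CUDA module: {cubin_path}")

module = drv.module_from_file(cubin_path)                 # fails if the cubin targets another GPU arch
kernel = module.get_function("projection_kernel__SELL")   # kernel name as used in PDHG.py above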