PyPI - AOT-biomaps - Versions diffs - 2.9.186__py3-none-any.whl → 2.9.294__py3-none-any.whl - Mend

AOT-biomaps 2.9.186py3-none-any.whl → 2.9.294py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of AOT-biomaps might be problematic. Click here for more details.

Files changed (28) hide show

AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +11 -6
AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
AOT_biomaps/AOT_Experiment/_mainExperiment.py +95 -55
AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +48 -13
AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +406 -13
AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +118 -38
AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +303 -102
AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +443 -12
AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +274 -0
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +328 -0
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
AOT_biomaps/AOT_Recon/AlgebraicRecon.py +243 -113
AOT_biomaps/AOT_Recon/AnalyticRecon.py +26 -41
AOT_biomaps/AOT_Recon/BayesianRecon.py +81 -146
AOT_biomaps/AOT_Recon/PrimalDualRecon.py +157 -94
AOT_biomaps/AOT_Recon/ReconEnums.py +27 -2
AOT_biomaps/AOT_Recon/ReconTools.py +229 -12
AOT_biomaps/AOT_Recon/__init__.py +1 -0
AOT_biomaps/AOT_Recon/_mainRecon.py +60 -53
AOT_biomaps/__init__.py +4 -69
{aot_biomaps-2.9.186.dist-info → aot_biomaps-2.9.294.dist-info}/METADATA +2 -1
aot_biomaps-2.9.294.dist-info/RECORD +47 -0
aot_biomaps-2.9.186.dist-info/RECORD +0 -43
{aot_biomaps-2.9.186.dist-info → aot_biomaps-2.9.294.dist-info}/WHEEL +0 -0
{aot_biomaps-2.9.186.dist-info → aot_biomaps-2.9.294.dist-info}/top_level.txt +0 -0

AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star
+from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star, _call_axpby, _call_minus_axpy, compute_TV_cpu, power_method_estimate_L__SELL, calculate_memory_requirement, check_gpu_memory
 from AOT_biomaps.Config import config
-from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType
+from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType, SMatrixType
 import torch
 from tqdm import trange
+import numpy as np
+import pycuda.driver as drv
 '''
 This module implements Primal-Dual Hybrid Gradient (PDHG) methods for solving inverse problems in Acousto-Optic Tomography.
@@ -11,6 +13,103 @@ The methods can run on both CPU and GPU, with configurations set in the AOT_biom
 '''
 def CP_TV(
+    SMatrix,
+    y,
+    alpha=None,               # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4,              # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=5000,
+    isSavingEachIteration=True,
+    L=None,
+    withTumor=True,
+    device=None,
+    max_saves=5000,
+    show_logs=True,
+    smatrixType=SMatrixType.SELL,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05,    # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False,  # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True,         # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
+):
+    # try:
+    tumor_str = "WITH" if withTumor else "WITHOUT"
+    # Auto-select device and method
+    if device is None:
+        if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+            device = torch.device(f"cuda:{config.select_best_gpu()}")
+            use_gpu = True
+        else:
+            device = torch.device("cpu")
+            use_gpu = False
+    else:
+        use_gpu = device.type == "cuda"
+    # Dispatch to the appropriate implementation
+    if use_gpu:
+            if smatrixType == SMatrixType.CSR:
+                raise NotImplementedError("GPU Chambolle Pock (LS-TV) with CSR not implemented.")
+            elif smatrixType == SMatrixType.SELL:
+                return CP_TV_Tikhonov_sparseSELL_pycuda(SMatrix, y, alpha,beta, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs, k_security, use_power_method, auto_alpha_gamma, apply_positivity_clamp, tikhonov_as_gradient, use_laplacian, laplacian_beta_scale)
+            elif smatrixType == SMatrixType.DENSE:
+                return CP_TV_dense(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
+            else:
+                raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-TV).")
+    else:
+        raise NotImplementedError("CPU Chambolle Pock (LS-TV) not implemented.")
+def CP_KL(
+    SMatrix,
+    y,
+    alpha=None,               # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4,              # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=5000,
+    isSavingEachIteration=True,
+    L=None,
+    withTumor=True,
+    device=None,
+    max_saves=5000,
+    show_logs=True,
+    smatrixType=SMatrixType.SELL,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05,    # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False,  # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True,         # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
+):
+    # try:
+    tumor_str = "WITH" if withTumor else "WITHOUT"
+    # Auto-select device and method
+    if device is None:
+        if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+            device = torch.device(f"cuda:{config.select_best_gpu()}")
+            use_gpu = True
+        else:
+            device = torch.device("cpu")
+            use_gpu = False
+    else:
+        use_gpu = device.type == "cuda"
+    # Dispatch to the appropriate implementation
+    if use_gpu:
+            if smatrixType == SMatrixType.CSR:
+                raise NotImplementedError("GPU Chambolle Pock (LS-KL) with CSR not implemented.")
+            elif smatrixType == SMatrixType.SELL:
+                raise NotImplementedError("GPU Chambolle Pock (LS-KL) with SELL not implemented.")
+            elif smatrixType == SMatrixType.DENSE:
+                return CP_KL(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
+            else:
+                raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-KL).")
+    else:
+        raise NotImplementedError("CPU Chambolle Pock (LS-KL) not implemented.")
+def CP_TV_dense(
     SMatrix,
     y,
     alpha=1e-1,
@@ -21,13 +120,14 @@ def CP_TV(
     withTumor=True,
     device=None,
     max_saves=5000,
+    show_logs=True,
 ):
     """
     Chambolle-Pock algorithm for Total Variation (TV) regularization.
     Works on both CPU and GPU.
     Args:
         SMatrix: System matrix (shape: T, Z, X, N)
-        y: Measurement data (shape: T, X, N)
+        y: Measurement data (shape: T, N)
         alpha: Regularization parameter for TV
         theta: Relaxation parameter (1.0 for standard Chambolle-Pock)
         numIterations: Number of iterations
@@ -92,10 +192,10 @@ def CP_TV(
     # Description for progress bar
     tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
     device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
-    description = f"AOT-BioMaps -- Primal/Dual Reconstruction (TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
+    description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
-    # Main loop
-    for iteration in trange(numIterations, desc=description):
+    iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+    for it in iterator:
         # Update p (TV proximal step)
         grad_x = gradient(x_tilde.reshape(Z, X))
         p = proj_l2(p + sigma * grad_x, alpha)
@@ -113,9 +213,9 @@ def CP_TV(
         x_tilde = x + theta * (x - x_old)
         # Save intermediate result if needed
-        if isSavingEachIteration and iteration in save_indices:
+        if isSavingEachIteration and it in save_indices:
             I_reconMatrix.append(x.reshape(Z, X).clone() * (norm_y / norm_A))
-            saved_indices.append(iteration)
+            saved_indices.append(it)
     # Return results
     if isSavingEachIteration:
@@ -123,6 +223,337 @@ def CP_TV(
     else:
         return (x.reshape(Z, X) * (norm_y / norm_A)).cpu().numpy(), None
+def CP_TV_Tikhonov_sparseSELL_pycuda(
+    SMatrix,
+    y,
+    alpha=None,               # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4,              # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=2000,
+    isSavingEachIteration=True,
+    L=None,
+    tumor_str="",
+    device=None,
+    max_saves=2000,
+    show_logs=True,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05,    # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False,  # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True,         # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0    # multiply beta for laplacian term if you want separate scaling
+):
+    """
+    CP-TV + Tikhonov + Laplacian (Hessian scalar) penalty integrated.
+    Returns (I_reconMatrix, saved_indices) if isSavingEachIteration else (x_final, None).
+    """
+        # ----- begin main -----
+    if SMatrix.ctx:
+        SMatrix.ctx.push()
+    # prepare variables
+    dtype = np.float32
+    TN = int(SMatrix.N * SMatrix.T)
+    ZX = int(SMatrix.Z * SMatrix.X)
+    Z, X = SMatrix.Z, SMatrix.X
+    block_size = 256
+    # existing kernels
+    projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+    backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+    axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
+    minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
+    gradient_kernel = SMatrix.sparse_mod.get_function("gradient_kernel")
+    divergence_kernel = SMatrix.sparse_mod.get_function("divergence_kernel")
+    proj_tv_kernel = SMatrix.sparse_mod.get_function("proj_tv_kernel")
+    # optional kernels (laplacian & clamp)
+    has_laplacian = False
+    has_clamp_kernel = False
+    try:
+        laplacian_kernel = SMatrix.sparse_mod.get_function("laplacian_kernel")
+        laplacian_adj_kernel = SMatrix.sparse_mod.get_function("laplacian_adj_kernel")
+        has_laplacian = True
+    except Exception:
+        has_laplacian = False
+    try:
+        clamp_positive_kernel = SMatrix.sparse_mod.get_function("clamp_positive_kernel")
+        has_clamp_kernel = True
+    except Exception:
+        has_clamp_kernel = False
+    stream = drv.Stream()
+    # estimate L operator norm if needed
+    if use_power_method or L is None:
+        L_LS_sq = power_method_estimate_L__SELL(SMatrix, stream, n_it=20, block_size=block_size)
+        L_nabla_sq = 8.0
+        L_op_norm = np.sqrt(L_LS_sq + L_nabla_sq)
+        if L_op_norm < 1e-6:
+            L_op_norm = 1.0
+    else:
+        L_op_norm = L
+    tau = np.float32(k_security / L_op_norm)
+    sigma = np.float32(k_security / L_op_norm)
+    # prepare y and normalization
+    y = y.T.astype(dtype).reshape(-1)
+    maxy = float(np.max(np.abs(y))) if y.size > 0 else 0.0
+    if maxy > 0:
+        y_normed = (y / maxy).copy()
+    else:
+        y_normed = y.copy()
+    # GPU allocations
+    bufs = []
+    y_gpu = drv.mem_alloc(y_normed.nbytes); bufs.append(y_gpu)
+    drv.memcpy_htod_async(y_gpu, y_normed.T.flatten(), stream)
+    x_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_gpu)
+    drv.memset_d32_async(x_gpu, 0, ZX, stream)
+    x_old_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_old_gpu)
+    x_tilde_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_tilde_gpu)
+    drv.memcpy_dtod_async(x_tilde_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
+    p_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(p_gpu)
+    q_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(q_gpu)
+    drv.memset_d32_async(p_gpu, 0, 2 * ZX, stream)
+    drv.memset_d32_async(q_gpu, 0, TN, stream)
+    grad_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(grad_gpu)
+    div_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(div_gpu)
+    Ax_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(Ax_gpu)
+    ATq_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(ATq_gpu)
+    zero_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(zero_gpu)
+    drv.memset_d32_async(zero_gpu, 0, ZX, stream)
+    # Laplacian buffers (if enabled and kernel available)
+    use_lap = use_laplacian and has_laplacian and (beta > 0)
+    if use_lap:
+        lap_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(lap_gpu)
+        r_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(r_gpu)
+        drv.memset_d32_async(r_gpu, 0, ZX, stream)
+        # scalar beta for laplacian (allow separate scale)
+        beta_lap = float(beta) * float(laplacian_beta_scale)
+        inv_1_plus_sigma_beta = np.float32(1.0 / (1.0 + float(sigma) * beta_lap))
+    # host buffers for logs
+    x_host = np.empty(ZX, dtype=dtype)
+    Ax_host = np.empty(TN, dtype=dtype)
+    q_host = np.empty(TN, dtype=dtype)
+    p_host = np.empty(2 * ZX, dtype=dtype)
+    ATq_host = np.empty(ZX, dtype=dtype)
+    # compute initial backprojection for auto-alpha
+    drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
+    backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                        y_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                        block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+    stream.synchronize()
+    drv.memcpy_dtoh(x_host, ATq_gpu)
+    # auto alpha if requested
+    if alpha is None:
+        drv.memcpy_htod_async(x_gpu, x_host, stream)
+        projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                        x_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                        block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+        stream.synchronize()
+        drv.memcpy_dtoh(Ax_host, Ax_gpu)
+        resid = Ax_host - y_normed[:TN]
+        data_term = 0.5 * float(np.dot(resid, resid))
+        tv_term = float(compute_TV_cpu(x_host, Z, X)) + 1e-12
+        alpha = float(auto_alpha_gamma * data_term / tv_term)
+        if show_logs:
+            print(f"[auto-alpha] data_term={data_term:.6e}, tv_term={tv_term:.6e}, alpha_set={alpha:.6e}")
+    # tikhonov prox multiplicative scale
+    if tikhonov_as_gradient:
+        tikh_scale = None
+    else:
+        tikh_scale = np.float32(1.0 / (1.0 + 2.0 * tau * beta)) if beta > 0 else np.float32(1.0)
+    # saving policy
+    if numIterations <= max_saves:
+        save_indices_all = list(range(0, numIterations + 1))
+    else:
+        step = max(1, numIterations // max_saves)
+        save_indices_all = list(range(0, numIterations + 1, step))
+    device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
+    if show_logs:
+        if (alpha is None or alpha == 0) and (beta is None or beta == 0):
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS) -- {tumor_str} -- {device_str}"
+        if alpha is None or alpha == 0:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-Tikhonov) -- {tumor_str} -- {device_str}"
+        elif beta is None or beta == 0:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) -- {tumor_str} -- {device_str}"
+        else:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV-Tikhonov) -- {tumor_str} -- {device_str}"
+    I_reconMatrix = []
+    saved_indices = []
+    if isSavingEachIteration and 0 in save_indices_all:
+        drv.memcpy_dtoh(x_host, x_gpu)
+        x0 = x_host.reshape((Z, X)).copy()
+        if maxy > 0:
+            x0 *= maxy
+        I_reconMatrix.append(x0)
+        saved_indices.append(0)
+    # main loop
+    try:
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+        for it in iterator:
+            # 1) dual p update (TV)
+            gradient_kernel(grad_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                            block=(block_size, 1, 1),
+                            grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                            stream=stream)
+            _call_axpby(axpby_kernel, p_gpu, p_gpu, grad_gpu, 1.0, sigma, 2 * ZX, stream, block_size)
+            proj_tv_kernel(p_gpu, np.float32(alpha), np.int32(ZX),
+                            block=(block_size, 1, 1),
+                            grid=((ZX + block_size - 1) // block_size, 1, 1),
+                            stream=stream)
+            # 2) dual q update (data fidelity)
+            projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                            x_tilde_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                            block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+            _call_axpby(axpby_kernel, Ax_gpu, Ax_gpu, y_gpu, 1.0, -1.0, TN, stream, block_size)
+            _call_axpby(axpby_kernel, q_gpu, q_gpu, Ax_gpu, 1.0 / (1.0 + sigma), sigma / (1.0 + sigma), TN, stream, block_size)
+            # optional Laplacian dual update
+            if use_lap:
+                # compute Laplacian of x_tilde -> lap_gpu
+                laplacian_kernel(lap_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                                block=(block_size, 1, 1),
+                                grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                                stream=stream)
+                # r = r + sigma * lap
+                _call_axpby(axpby_kernel, r_gpu, r_gpu, lap_gpu, 1.0, sigma, ZX, stream, block_size)
+                # r = r / (1 + sigma * beta_lap)
+                _call_axpby(axpby_kernel, r_gpu, r_gpu, zero_gpu, inv_1_plus_sigma_beta, 0.0, ZX, stream, block_size)
+            # 3) primal x update
+            drv.memcpy_dtod_async(x_old_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
+            divergence_kernel(div_gpu, p_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                            block=(block_size, 1, 1),
+                            grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                            stream=stream)
+            drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
+            backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                                q_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                                block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+            # ATq - div
+            _call_minus_axpy(minus_axpy_kernel, ATq_gpu, div_gpu, 1.0, ZX, stream, block_size)
+            # if laplacian is used, add H^T r into ATq
+            if use_lap:
+                # compute laplacian_adj_kernel(temp, r)
+                # reuse grad_gpu as temporary if safe (its content used earlier, but not reused until later)
+                laplacian_adj_kernel(grad_gpu, r_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                                    block=(block_size, 1, 1),
+                                    grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                                    stream=stream)
+                # ATq_gpu += temp (grad_gpu)
+                _call_axpby(axpby_kernel, ATq_gpu, ATq_gpu, grad_gpu, 1.0, 1.0, ZX, stream, block_size)
+            # x = x_old - tau * ATq_buffer
+            _call_minus_axpy(minus_axpy_kernel, x_gpu, ATq_gpu, tau, ZX, stream, block_size)
+            # Tikhonov
+            if beta > 0:
+                if tikhonov_as_gradient:
+                    mul = 1.0 - 2.0 * float(tau) * float(beta)
+                    if mul <= 0.0:
+                        # fallback to prox multiplicative stable
+                        fallback_scale = np.float32(1.0 / (1.0 + 2.0 * float(tau) * float(beta)))
+                        _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, fallback_scale, 0.0, ZX, stream, block_size)
+                    else:
+                        # x *= mul  => implemented as axpby: out = 1* x + (mul-1)*x
+                        _call_axpby(axpby_kernel, x_gpu, x_gpu, x_gpu, 1.0, np.float32(mul - 1.0), ZX, stream, block_size)
+                else:
+                    _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, tikh_scale, np.float32(0.0), ZX, stream, block_size)
+            # positivity clamp (prefer GPU kernel if available)
+            if apply_positivity_clamp:
+                if has_clamp_kernel:
+                    # in-place clamp on GPU
+                    clamp_positive_kernel(x_gpu, np.int32(ZX),
+                                        block=(block_size, 1, 1),
+                                        grid=((ZX + block_size - 1) // block_size, 1, 1),
+                                        stream=stream)
+                else:
+                    # fallback CPU roundtrip (slower)
+                    stream.synchronize()
+                    drv.memcpy_dtoh(x_host, x_gpu)
+                    np.maximum(x_host, 0.0, out=x_host)
+                    drv.memcpy_htod_async(x_gpu, x_host, stream)
+            # extrapolation
+            _call_axpby(axpby_kernel, x_tilde_gpu, x_gpu, x_old_gpu, np.float32(1.0 + theta), np.float32(-theta), ZX, stream, block_size)
+            # saves
+            if isSavingEachIteration and (it + 1) in save_indices_all:
+                stream.synchronize()
+                drv.memcpy_dtoh(x_host, x_gpu)
+                x_saved = x_host.reshape((Z, X)).copy()
+                if maxy > 0:
+                    x_saved *= maxy
+                I_reconMatrix.append(x_saved)
+                saved_indices.append(it + 1)
+        stream.synchronize()
+        drv.memcpy_dtoh(x_host, x_gpu)
+        x_final = x_host.reshape((Z, X)).copy()
+        if maxy > 0:
+            x_final *= maxy
+            if isSavingEachIteration and len(I_reconMatrix):
+                for i in range(len(I_reconMatrix)):
+                    I_reconMatrix[i] *= maxy
+        # free buffers
+        for buff in bufs:
+            try:
+                buff.free()
+            except:
+                pass
+        if SMatrix.ctx:
+            SMatrix.ctx.pop()
+        if isSavingEachIteration:
+            return I_reconMatrix, saved_indices
+        else:
+            return x_final, None
+    except Exception as e:
+        # cleanup robustly
+        print("Error in CP_TV_Tikhonov+Lap (robust):", e)
+        try:
+            for buff in bufs:
+                try:
+                    buff.free()
+                except:
+                    pass
+        except:
+            pass
+        try:
+            if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+                SMatrix.ctx.pop()
+        except:
+            pass
+        raise
 def CP_KL(
     SMatrix,
@@ -132,9 +563,10 @@ def CP_KL(
     numIterations=5000,
     isSavingEachIteration=True,
     L=None,
-    withTumor=True,
+    tumor_str="",
     device=None,
     max_saves=5000,
+    show_logs=True,
 ):
     """
     Chambolle-Pock algorithm for Kullback-Leibler (KL) divergence regularization.
@@ -193,12 +625,11 @@ def CP_KL(
     saved_indices = [0]
     # Description for progress bar
-    tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
     device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
     description = f"AOT-BioMaps -- Primal/Dual Reconstruction (KL) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
-    # Main loop
-    for iteration in trange(numIterations, desc=description):
+    iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+    for iteration in iterator:
         # Update q (proximal step for F*)
         q = prox_F_star(q + sigma * P(x_tilde) - sigma * y_flat, sigma, y_flat)

AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py CHANGED Viewed

@@ -9,26 +9,24 @@ def _Omega_RELATIVE_DIFFERENCE_CPU(theta_flat, index, values, gamma):
     theta_k = theta_flat[k_idx]
     diff = theta_k - theta_j
     abs_diff = np.abs(diff)
     denom = theta_k + theta_j + gamma * abs_diff + 1e-8
     num = diff ** 2
+    psi_pair = num / denom
+    psi_pair = values * psi_pair
     # First derivative ∂U/∂θ_j
     dpsi = (2 * diff * denom - num * (1 + gamma * np.sign(diff))) / (denom ** 2)
     grad_pair = values * (-dpsi)  # Note the negative sign: U contains ψ(θ_k, θ_j), seeking ∂/∂θ_j
     # Second derivative ∂²U/∂θ_j² (numerically stable, approximate treatment)
     d2psi = (2 * denom ** 2 - 4 * diff * denom * (1 + gamma * np.sign(diff))
                 + 2 * num * (1 + gamma * np.sign(diff)) ** 2) / (denom ** 3 + 1e-8)
     hess_pair = values * d2psi
     grad_U = np.zeros_like(theta_flat)
     hess_U = np.zeros_like(theta_flat)
     np.add.at(grad_U, j_idx, grad_pair)
     np.add.at(hess_U, j_idx, hess_pair)
-    return grad_U, hess_U
+    # Compute U_value
+    U_value = 0.5 * np.sum(psi_pair)
+    return grad_U, hess_U, U_value
 def _Omega_RELATIVE_DIFFERENCE_GPU(theta_flat, index, values, device, gamma):
     j_idx, k_idx = index
@@ -38,26 +36,24 @@ def _Omega_RELATIVE_DIFFERENCE_GPU(theta_flat, index, values, device, gamma):
     abs_diff = torch.abs(diff)
     denom = theta_k + theta_j + gamma * abs_diff + 1e-8
     num = diff ** 2
+    psi_pair = num / denom
+    psi_pair = values * psi_pair
     # Compute gradient contributions
     dpsi = (2 * diff * denom - num * (1 + gamma * torch.sign(diff))) / (denom ** 2)
     grad_pair = values * (-dpsi)
     # Compute Hessian contributions
     d2psi = (2 * denom ** 2 - 4 * diff * denom * (1 + gamma * torch.sign(diff))
              + 2 * num * (1 + gamma * torch.sign(diff)) ** 2) / (denom ** 3 + 1e-8)
     hess_pair = values * d2psi
     # Initialize gradient and Hessian on the correct device
     grad_U = torch.zeros_like(theta_flat, device=device)
     hess_U = torch.zeros_like(theta_flat, device=device)
     # Accumulate gradient contributions
     grad_U.index_add_(0, j_idx, grad_pair)
     grad_U.index_add_(0, k_idx, -grad_pair)
     # Accumulate Hessian contributions
     hess_U.index_add_(0, j_idx, hess_pair)
     hess_U.index_add_(0, k_idx, hess_pair)
-    return grad_U, hess_U
+    # Compute U_value
+    U_value = 0.5 * psi_pair.sum()
+    return grad_U, hess_U, U_value

AOT-biomaps 2.9.186__py3-none-any.whl → 2.9.294__py3-none-any.whl

Potentially problematic release.

AOT-biomaps 2.9.186py3-none-any.whl → 2.9.294py3-none-any.whl