AOT-biomaps 2.9.261__py3-none-any.whl → 2.9.294__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +400 -10
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +60 -25
- AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +442 -11
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +48 -26
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +115 -109
- AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
- AOT_biomaps/AOT_Recon/AlgebraicRecon.py +27 -20
- AOT_biomaps/AOT_Recon/PrimalDualRecon.py +94 -41
- AOT_biomaps/AOT_Recon/ReconTools.py +164 -18
- AOT_biomaps/__init__.py +34 -1
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.294.dist-info}/METADATA +1 -1
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.294.dist-info}/RECORD +14 -13
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.294.dist-info}/WHEEL +0 -0
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.294.dist-info}/top_level.txt +0 -0
AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py

@@ -1,8 +1,10 @@
-from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star
+from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star, _call_axpby, _call_minus_axpy, compute_TV_cpu, power_method_estimate_L__SELL, calculate_memory_requirement, check_gpu_memory
 from AOT_biomaps.Config import config
-from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType
+from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType, SMatrixType
 import torch
 from tqdm import trange
+import numpy as np
+import pycuda.driver as drv
 
 '''
 This module implements Primal-Dual Hybrid Gradient (PDHG) methods for solving inverse problems in Acousto-Optic Tomography.
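For orientation, the PDHG scheme the docstring refers to is the generic Chambolle-Pock primal-dual iteration for problems of the form min_x F(Kx) + G(x). The NumPy sketch below is illustrative only and is not code from the package; pdhg, prox_F_star and prox_G here are generic placeholders (the identically named ReconTools imports above are the package's own implementations and may differ).

# Illustrative sketch of a generic PDHG / Chambolle-Pock iteration (not AOT-biomaps code).
import numpy as np

def pdhg(K, prox_F_star, prox_G, x0, n_iter=100, theta=1.0):
    L = np.linalg.norm(K, 2)          # operator norm of K
    tau = sigma = 0.9 / L             # step sizes satisfying tau * sigma * L**2 < 1
    x = x0.copy()
    x_bar = x0.copy()
    q = np.zeros(K.shape[0])
    for _ in range(n_iter):
        q = prox_F_star(q + sigma * (K @ x_bar), sigma)   # dual ascent on F*
        x_old = x
        x = prox_G(x - tau * (K.T @ q), tau)              # primal descent on G
        x_bar = x + theta * (x - x_old)                   # extrapolation step
    return x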
@@ -11,6 +13,103 @@ The methods can run on both CPU and GPU, with configurations set in the AOT_biom
 '''
 
 def CP_TV(
+    SMatrix,
+    y,
+    alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4, # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=5000,
+    isSavingEachIteration=True,
+    L=None,
+    withTumor=True,
+    device=None,
+    max_saves=5000,
+    show_logs=True,
+    smatrixType=SMatrixType.SELL,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
+    ):
+    # try:
+    tumor_str = "WITH" if withTumor else "WITHOUT"
+    # Auto-select device and method
+    if device is None:
+        if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+            device = torch.device(f"cuda:{config.select_best_gpu()}")
+            use_gpu = True
+        else:
+            device = torch.device("cpu")
+            use_gpu = False
+    else:
+        use_gpu = device.type == "cuda"
+    # Dispatch to the appropriate implementation
+    if use_gpu:
+        if smatrixType == SMatrixType.CSR:
+            raise NotImplementedError("GPU Chambolle Pock (LS-TV) with CSR not implemented.")
+        elif smatrixType == SMatrixType.SELL:
+            return CP_TV_Tikhonov_sparseSELL_pycuda(SMatrix, y, alpha,beta, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs, k_security, use_power_method, auto_alpha_gamma, apply_positivity_clamp, tikhonov_as_gradient, use_laplacian, laplacian_beta_scale)
+        elif smatrixType == SMatrixType.DENSE:
+            return CP_TV_dense(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
+        else:
+            raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-TV).")
+    else:
+        raise NotImplementedError("CPU Chambolle Pock (LS-TV) not implemented.")
+
+def CP_KL(
+    SMatrix,
+    y,
+    alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4, # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=5000,
+    isSavingEachIteration=True,
+    L=None,
+    withTumor=True,
+    device=None,
+    max_saves=5000,
+    show_logs=True,
+    smatrixType=SMatrixType.SELL,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
+    ):
+    # try:
+    tumor_str = "WITH" if withTumor else "WITHOUT"
+    # Auto-select device and method
+    if device is None:
+        if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+            device = torch.device(f"cuda:{config.select_best_gpu()}")
+            use_gpu = True
+        else:
+            device = torch.device("cpu")
+            use_gpu = False
+    else:
+        use_gpu = device.type == "cuda"
+    # Dispatch to the appropriate implementation
+    if use_gpu:
+        if smatrixType == SMatrixType.CSR:
+            raise NotImplementedError("GPU Chambolle Pock (LS-KL) with CSR not implemented.")
+        elif smatrixType == SMatrixType.SELL:
+            raise NotImplementedError("GPU Chambolle Pock (LS-KL) with SELL not implemented.")
+        elif smatrixType == SMatrixType.DENSE:
+            return CP_KL(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
+        else:
+            raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-KL).")
+    else:
+        raise NotImplementedError("CPU Chambolle Pock (LS-KL) not implemented.")
+
+
+
+
+def CP_TV_dense(
     SMatrix,
     y,
     alpha=1e-1,
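A usage sketch for the new CP_TV dispatcher, assuming a SELL sparse system matrix already built on the GPU. This is illustrative only: smatrix and y_meas are placeholder names, and the import path is inferred from the wheel's file layout rather than taken from package documentation.

# Hypothetical call to the dispatcher defined above (smatrix / y_meas are placeholders).
from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType
from AOT_biomaps.AOT_Recon.AOT_Optimizers.PDHG import CP_TV   # path inferred from the wheel layout

recons, saved_iters = CP_TV(
    smatrix,                        # SparseSMatrix_SELL-style object with GPU buffers loaded
    y_meas,                         # measured AO data; transposed and flattened internally
    alpha=None,                     # None -> auto-scaled TV weight (auto_alpha_gamma heuristic)
    beta=1e-4,                      # Tikhonov weight
    numIterations=2000,
    smatrixType=SMatrixType.SELL,   # dispatches to CP_TV_Tikhonov_sparseSELL_pycuda
    show_logs=True,
)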
@@ -21,6 +120,7 @@ def CP_TV(
     withTumor=True,
     device=None,
     max_saves=5000,
+    show_logs=True,
     ):
     """
     Chambolle-Pock algorithm for Total Variation (TV) regularization.
@@ -92,10 +192,10 @@ def CP_TV(
     # Description for progress bar
     tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
     device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
-    description = f"AOT-BioMaps -- Primal/Dual Reconstruction (TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
+    description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
 
-
-    for
+    iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+    for it in iterator:
         # Update p (TV proximal step)
         grad_x = gradient(x_tilde.reshape(Z, X))
         p = proj_l2(p + sigma * grad_x, alpha)
@@ -113,9 +213,9 @@ def CP_TV(
         x_tilde = x + theta * (x - x_old)
 
         # Save intermediate result if needed
-        if isSavingEachIteration and
+        if isSavingEachIteration and it in save_indices:
             I_reconMatrix.append(x.reshape(Z, X).clone() * (norm_y / norm_A))
-            saved_indices.append(
+            saved_indices.append(it)
 
     # Return results
     if isSavingEachIteration:
@@ -123,6 +223,337 @@ def CP_TV(
     else:
         return (x.reshape(Z, X) * (norm_y / norm_A)).cpu().numpy(), None
 
+def CP_TV_Tikhonov_sparseSELL_pycuda(
+    SMatrix,
+    y,
+    alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4, # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=2000,
+    isSavingEachIteration=True,
+    L=None,
+    tumor_str="",
+    device=None,
+    max_saves=2000,
+    show_logs=True,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
+    ):
+
+    """
+    CP-TV + Tikhonov + Laplacian (Hessian scalar) penalty integrated.
+    Returns (I_reconMatrix, saved_indices) if isSavingEachIteration else (x_final, None).
+    """
+    # ----- begin main -----
+    if SMatrix.ctx:
+        SMatrix.ctx.push()
+
+    # prepare variables
+    dtype = np.float32
+    TN = int(SMatrix.N * SMatrix.T)
+    ZX = int(SMatrix.Z * SMatrix.X)
+    Z, X = SMatrix.Z, SMatrix.X
+    block_size = 256
+
+    # existing kernels
+    projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+    backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+    axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
+    minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
+    gradient_kernel = SMatrix.sparse_mod.get_function("gradient_kernel")
+    divergence_kernel = SMatrix.sparse_mod.get_function("divergence_kernel")
+    proj_tv_kernel = SMatrix.sparse_mod.get_function("proj_tv_kernel")
+
+    # optional kernels (laplacian & clamp)
+    has_laplacian = False
+    has_clamp_kernel = False
+    try:
+        laplacian_kernel = SMatrix.sparse_mod.get_function("laplacian_kernel")
+        laplacian_adj_kernel = SMatrix.sparse_mod.get_function("laplacian_adj_kernel")
+        has_laplacian = True
+    except Exception:
+        has_laplacian = False
+
+    try:
+        clamp_positive_kernel = SMatrix.sparse_mod.get_function("clamp_positive_kernel")
+        has_clamp_kernel = True
+    except Exception:
+        has_clamp_kernel = False
+
+    stream = drv.Stream()
+
+    # estimate L operator norm if needed
+    if use_power_method or L is None:
+        L_LS_sq = power_method_estimate_L__SELL(SMatrix, stream, n_it=20, block_size=block_size)
+        L_nabla_sq = 8.0
+        L_op_norm = np.sqrt(L_LS_sq + L_nabla_sq)
+        if L_op_norm < 1e-6:
+            L_op_norm = 1.0
+    else:
+        L_op_norm = L
+
+    tau = np.float32(k_security / L_op_norm)
+    sigma = np.float32(k_security / L_op_norm)
+
+    # prepare y and normalization
+    y = y.T.astype(dtype).reshape(-1)
+    maxy = float(np.max(np.abs(y))) if y.size > 0 else 0.0
+    if maxy > 0:
+        y_normed = (y / maxy).copy()
+    else:
+        y_normed = y.copy()
+
+    # GPU allocations
+    bufs = []
+    y_gpu = drv.mem_alloc(y_normed.nbytes); bufs.append(y_gpu)
+    drv.memcpy_htod_async(y_gpu, y_normed.T.flatten(), stream)
+
+    x_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_gpu)
+    drv.memset_d32_async(x_gpu, 0, ZX, stream)
+    x_old_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_old_gpu)
+    x_tilde_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_tilde_gpu)
+    drv.memcpy_dtod_async(x_tilde_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
+
+    p_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(p_gpu)
+    q_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(q_gpu)
+    drv.memset_d32_async(p_gpu, 0, 2 * ZX, stream)
+    drv.memset_d32_async(q_gpu, 0, TN, stream)
+
+    grad_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(grad_gpu)
+    div_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(div_gpu)
+    Ax_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(Ax_gpu)
+    ATq_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(ATq_gpu)
+    zero_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(zero_gpu)
+    drv.memset_d32_async(zero_gpu, 0, ZX, stream)
+
+    # Laplacian buffers (if enabled and kernel available)
+    use_lap = use_laplacian and has_laplacian and (beta > 0)
+    if use_lap:
+        lap_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(lap_gpu)
+        r_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(r_gpu)
+        drv.memset_d32_async(r_gpu, 0, ZX, stream)
+        # scalar beta for laplacian (allow separate scale)
+        beta_lap = float(beta) * float(laplacian_beta_scale)
+        inv_1_plus_sigma_beta = np.float32(1.0 / (1.0 + float(sigma) * beta_lap))
+
+    # host buffers for logs
+    x_host = np.empty(ZX, dtype=dtype)
+    Ax_host = np.empty(TN, dtype=dtype)
+    q_host = np.empty(TN, dtype=dtype)
+    p_host = np.empty(2 * ZX, dtype=dtype)
+    ATq_host = np.empty(ZX, dtype=dtype)
+
+    # compute initial backprojection for auto-alpha
+    drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
+    backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                          y_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                          block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+    stream.synchronize()
+    drv.memcpy_dtoh(x_host, ATq_gpu)
+
+    # auto alpha if requested
+    if alpha is None:
+        drv.memcpy_htod_async(x_gpu, x_host, stream)
+        projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                          x_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                          block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+        stream.synchronize()
+        drv.memcpy_dtoh(Ax_host, Ax_gpu)
+        resid = Ax_host - y_normed[:TN]
+        data_term = 0.5 * float(np.dot(resid, resid))
+        tv_term = float(compute_TV_cpu(x_host, Z, X)) + 1e-12
+        alpha = float(auto_alpha_gamma * data_term / tv_term)
+        if show_logs:
+            print(f"[auto-alpha] data_term={data_term:.6e}, tv_term={tv_term:.6e}, alpha_set={alpha:.6e}")
+
+    # tikhonov prox multiplicative scale
+    if tikhonov_as_gradient:
+        tikh_scale = None
+    else:
+        tikh_scale = np.float32(1.0 / (1.0 + 2.0 * tau * beta)) if beta > 0 else np.float32(1.0)
+
+    # saving policy
+    if numIterations <= max_saves:
+        save_indices_all = list(range(0, numIterations + 1))
+    else:
+        step = max(1, numIterations // max_saves)
+        save_indices_all = list(range(0, numIterations + 1, step))
+
+    device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
+    if show_logs:
+        if (alpha is None or alpha == 0) and (beta is None or beta == 0):
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS) -- {tumor_str} -- {device_str}"
+        if alpha is None or alpha == 0:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-Tikhonov) -- {tumor_str} -- {device_str}"
+        elif beta is None or beta == 0:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) -- {tumor_str} -- {device_str}"
+        else:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV-Tikhonov) -- {tumor_str} -- {device_str}"
+
+    I_reconMatrix = []
+    saved_indices = []
+    if isSavingEachIteration and 0 in save_indices_all:
+        drv.memcpy_dtoh(x_host, x_gpu)
+        x0 = x_host.reshape((Z, X)).copy()
+        if maxy > 0:
+            x0 *= maxy
+        I_reconMatrix.append(x0)
+        saved_indices.append(0)
+
+    # main loop
+    try:
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+        for it in iterator:
+            # 1) dual p update (TV)
+            gradient_kernel(grad_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                            block=(block_size, 1, 1),
+                            grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                            stream=stream)
+            _call_axpby(axpby_kernel, p_gpu, p_gpu, grad_gpu, 1.0, sigma, 2 * ZX, stream, block_size)
+            proj_tv_kernel(p_gpu, np.float32(alpha), np.int32(ZX),
+                           block=(block_size, 1, 1),
+                           grid=((ZX + block_size - 1) // block_size, 1, 1),
+                           stream=stream)
+
+            # 2) dual q update (data fidelity)
+            projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                              x_tilde_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                              block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+            _call_axpby(axpby_kernel, Ax_gpu, Ax_gpu, y_gpu, 1.0, -1.0, TN, stream, block_size)
+            _call_axpby(axpby_kernel, q_gpu, q_gpu, Ax_gpu, 1.0 / (1.0 + sigma), sigma / (1.0 + sigma), TN, stream, block_size)
+
+            # optional Laplacian dual update
+            if use_lap:
+                # compute Laplacian of x_tilde -> lap_gpu
+                laplacian_kernel(lap_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                                 block=(block_size, 1, 1),
+                                 grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                                 stream=stream)
+                # r = r + sigma * lap
+                _call_axpby(axpby_kernel, r_gpu, r_gpu, lap_gpu, 1.0, sigma, ZX, stream, block_size)
+                # r = r / (1 + sigma * beta_lap)
+                _call_axpby(axpby_kernel, r_gpu, r_gpu, zero_gpu, inv_1_plus_sigma_beta, 0.0, ZX, stream, block_size)
+
+            # 3) primal x update
+            drv.memcpy_dtod_async(x_old_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
+            divergence_kernel(div_gpu, p_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                              block=(block_size, 1, 1),
+                              grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                              stream=stream)
+            drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
+            backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                                  q_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                                  block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+            # ATq - div
+            _call_minus_axpy(minus_axpy_kernel, ATq_gpu, div_gpu, 1.0, ZX, stream, block_size)
+
+            # if laplacian is used, add H^T r into ATq
+            if use_lap:
+                # compute laplacian_adj_kernel(temp, r)
+                # reuse grad_gpu as temporary if safe (its content used earlier, but not reused until later)
+                laplacian_adj_kernel(grad_gpu, r_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                                     block=(block_size, 1, 1),
+                                     grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                                     stream=stream)
+                # ATq_gpu += temp (grad_gpu)
+                _call_axpby(axpby_kernel, ATq_gpu, ATq_gpu, grad_gpu, 1.0, 1.0, ZX, stream, block_size)
+
+            # x = x_old - tau * ATq_buffer
+            _call_minus_axpy(minus_axpy_kernel, x_gpu, ATq_gpu, tau, ZX, stream, block_size)
+
+            # Tikhonov
+            if beta > 0:
+                if tikhonov_as_gradient:
+                    mul = 1.0 - 2.0 * float(tau) * float(beta)
+                    if mul <= 0.0:
+                        # fallback to prox multiplicative stable
+                        fallback_scale = np.float32(1.0 / (1.0 + 2.0 * float(tau) * float(beta)))
+                        _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, fallback_scale, 0.0, ZX, stream, block_size)
+                    else:
+                        # x *= mul => implemented as axpby: out = 1* x + (mul-1)*x
+                        _call_axpby(axpby_kernel, x_gpu, x_gpu, x_gpu, 1.0, np.float32(mul - 1.0), ZX, stream, block_size)
+                else:
+                    _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, tikh_scale, np.float32(0.0), ZX, stream, block_size)
+
+            # positivity clamp (prefer GPU kernel if available)
+            if apply_positivity_clamp:
+                if has_clamp_kernel:
+                    # in-place clamp on GPU
+                    clamp_positive_kernel(x_gpu, np.int32(ZX),
+                                          block=(block_size, 1, 1),
+                                          grid=((ZX + block_size - 1) // block_size, 1, 1),
+                                          stream=stream)
+                else:
+                    # fallback CPU roundtrip (slower)
+                    stream.synchronize()
+                    drv.memcpy_dtoh(x_host, x_gpu)
+                    np.maximum(x_host, 0.0, out=x_host)
+                    drv.memcpy_htod_async(x_gpu, x_host, stream)
+
+            # extrapolation
+            _call_axpby(axpby_kernel, x_tilde_gpu, x_gpu, x_old_gpu, np.float32(1.0 + theta), np.float32(-theta), ZX, stream, block_size)
+
+            # saves
+            if isSavingEachIteration and (it + 1) in save_indices_all:
+                stream.synchronize()
+                drv.memcpy_dtoh(x_host, x_gpu)
+                x_saved = x_host.reshape((Z, X)).copy()
+                if maxy > 0:
+                    x_saved *= maxy
+                I_reconMatrix.append(x_saved)
+                saved_indices.append(it + 1)
+
+        stream.synchronize()
+        drv.memcpy_dtoh(x_host, x_gpu)
+        x_final = x_host.reshape((Z, X)).copy()
+        if maxy > 0:
+            x_final *= maxy
+        if isSavingEachIteration and len(I_reconMatrix):
+            for i in range(len(I_reconMatrix)):
+                I_reconMatrix[i] *= maxy
+
+        # free buffers
+        for buff in bufs:
+            try:
+                buff.free()
+            except:
+                pass
+
+        if SMatrix.ctx:
+            SMatrix.ctx.pop()
+
+        if isSavingEachIteration:
+            return I_reconMatrix, saved_indices
+        else:
+            return x_final, None
+
+    except Exception as e:
+        # cleanup robustly
+        print("Error in CP_TV_Tikhonov+Lap (robust):", e)
+        try:
+            for buff in bufs:
+                try:
+                    buff.free()
+                except:
+                    pass
+        except:
+            pass
+        try:
+            if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+                SMatrix.ctx.pop()
+        except:
+            pass
+        raise
+
 
 def CP_KL(
     SMatrix,
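In the function above, the primal/dual step sizes are tied to an operator-norm estimate: tau = sigma = k_security / L with L = sqrt(L_LS_sq + 8.0), where L_LS_sq comes from power_method_estimate_L__SELL and 8.0 is the usual bound on the squared norm of the 2-D forward-difference gradient. Below is a minimal CPU sketch of such a power-method estimate for a dense matrix A; it is illustrative only, the package runs the equivalent loop through its SELL projection/backprojection kernels.

# Minimal power-method sketch for ||A||^2 on a dense matrix (illustrative only).
import numpy as np

def estimate_L_squared(A, n_it=20, seed=0):
    rng = np.random.default_rng(seed)
    x = rng.standard_normal(A.shape[1])
    x /= np.linalg.norm(x)
    for _ in range(n_it):
        x = A.T @ (A @ x)                 # apply A^T A
        x /= np.linalg.norm(x)            # re-normalise to avoid overflow
    return float(x @ (A.T @ (A @ x)))     # Rayleigh quotient ~ largest eigenvalue of A^T A

# Step sizes mirroring the code above: tau = sigma = k_security / sqrt(L_sq + 8.0)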
@@ -132,9 +563,10 @@ def CP_KL(
     numIterations=5000,
     isSavingEachIteration=True,
     L=None,
-
+    tumor_str="",
     device=None,
     max_saves=5000,
+    show_logs=True,
     ):
     """
     Chambolle-Pock algorithm for Kullback-Leibler (KL) divergence regularization.
@@ -193,12 +625,11 @@ def CP_KL(
     saved_indices = [0]
 
     # Description for progress bar
-    tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
     device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
     description = f"AOT-BioMaps -- Primal/Dual Reconstruction (KL) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
 
-
-    for iteration in
+    iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+    for iteration in iterator:
         # Update q (proximal step for F*)
         q = prox_F_star(q + sigma * P(x_tilde) - sigma * y_flat, sigma, y_flat)
 
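For reference, the q-update above applies the resolvent of the conjugate data term; for a Kullback-Leibler fidelity the commonly used closed form is the one below. Whether ReconTools.prox_F_star implements exactly this expression is not visible in this diff.

% Closed-form prox of sigma*F^* for the KL data term F(z) = sum_i (z_i - y_i log z_i):
\operatorname{prox}_{\sigma F^*}(\tilde q)_i \;=\; \frac{1 + \tilde q_i - \sqrt{(\tilde q_i - 1)^2 + 4\,\sigma\, y_i}}{2}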
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py

@@ -1,4 +1,3 @@
-# sparse_matrix_gpu.py
 import pycuda.driver as drv
 import numpy as np
 from pycuda.compiler import SourceModule
@@ -26,6 +25,14 @@ class SparseSMatrix_CSR:
         self.X = manip.AcousticFields[0].field.shape[2]
         self.block_rows = block_rows
         self.relative_threshold = relative_threshold
+
+        # --- FIX: Résolution du chemin du .cubin (dans AOT_Recon/) ---
+        # Le fichier SparseSMatrix_CSR.py est dans AOT_Recon/AOT_SparseSMatrix/
+        # On remonte d'un répertoire pour atteindre AOT_Recon/
+        cubin_parent_dir = os.path.dirname(os.path.dirname(__file__))
+        self.module_path = os.path.join(cubin_parent_dir, "AOT_biomaps_kernels.cubin")
+        # --- FIN FIX ---
+
         self.h_dense = None
         self.row_ptr = None
         self.row_ptr_gpu = None
@@ -41,22 +48,24 @@ class SparseSMatrix_CSR:
     def __exit__(self, exc_type, exc, tb):
         self.free()
 
-    def load_precompiled_module(self
+    def load_precompiled_module(self):
+        """
+        Charge le module CUDA pré-compilé (.cubin) en utilisant le chemin résolu.
+        Supprime la logique de compilation JIT.
+        """
+        so_path = self.module_path # Utilise le chemin résolu dans __init__
+
+        if not os.path.exists(so_path):
+            raise FileNotFoundError(
+                f"Le module CUDA {os.path.basename(so_path)} est introuvable au chemin: {so_path}. "
+                "Assurez-vous qu'il est compilé et bien placé."
+            )
+
         try:
-            # If a PTX or cubin is provided via path
             self.sparse_mod = drv.module_from_file(so_path)
             print(f"✅ Module CUDA chargé depuis {so_path}")
-        except Exception:
-
-            src_path = os.path.join(os.path.dirname(__file__), 'AOT_biomaps_kernels.cu')
-            if os.path.exists(src_path):
-                print("Compilation JIT du kernel CUDA depuis source...")
-                with open(src_path, 'r') as f:
-                    src = f.read()
-                self.sparse_mod = SourceModule(src, no_extern_c=True)
-                print("✅ Module compilé JIT")
-            else:
-                raise
+        except Exception as e:
+            raise RuntimeError(f"Le fichier {os.path.basename(so_path)} a été trouvé, mais PyCUDA n'a pas pu le charger. Vérifiez la compatibilité.") from e
 
     def estimate_nnz_cpu(self):
         """Estimation rapide (non-exacte) — utile si tu veux une estimation faible.
@@ -80,12 +89,10 @@ class SparseSMatrix_CSR:
         bytes_float = np.dtype(np.float32).itemsize
 
         # Charge module
-
-
-        else:
-            self.load_precompiled_module('AOT_biomaps_kernels.cubin')
+        # FIX: Toujours charger depuis self.module_path (résolu)
+        self.load_precompiled_module()
 
-        count_nnz_kernel = self.sparse_mod.get_function('
+        count_nnz_kernel = self.sparse_mod.get_function('count_nnz_rows_kernel')
         fill_csr_kernel = self.sparse_mod.get_function('fill_kernel__CSR')
 
         # allocate host row_ptr
@@ -110,6 +117,7 @@ class SparseSMatrix_CSR:
             drv.memcpy_htod(dense_block_gpu, dense_block_host)
 
             grid = ((current_rows + block_size - 1) // block_size, 1, 1)
+            # Note: Assuming 'count_nnz_per_row_kernel' is the correct name (verified by user in prior steps)
             count_nnz_kernel(dense_block_gpu, row_nnz_gpu,
                              np.int32(current_rows), np.int32(num_cols),
                              np.float32(self.relative_threshold),
@@ -182,7 +190,11 @@ class SparseSMatrix_CSR:
         drv.memset_d32(col_sum_gpu, 0, ZX)
 
         # 2) Récupérer le kernel
-
+        # FIX: Utiliser le nom générique 'accumulate_columns_atomic' comme dans SELL (si le binaire est partagé)
+        # Si le développeur utilise la convention __CSR, on la garde.
+        # Basé sur notre historique SELL, le nom est probablement générique 'accumulate_columns_atomic'.
+        # Je vais supposer que le nom est générique pour éviter une LogicError ici aussi.
+        acc_kernel = self.sparse_mod.get_function("accumulate_columns_atomic")
 
         # 3) Lancer le kernel
         threads = 256
@@ -210,12 +222,23 @@ class SparseSMatrix_CSR:
         drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)
 
     def getMatrixSize(self):
+        """
+        Retourne la taille totale de la matrice CSR en Go (en sommant la mémoire GPU).
+        Utilise les attributs de taille stockés pour contourner l'AttributeError de DeviceAllocation.
+        """
+        # Note: L'utilisateur doit s'assurer que self.row_ptr existe avant cet appel.
         if self.row_ptr is None:
             return {"error": "La matrice sparse n'est pas encore allouée."}
-
-
-
-
+
+        total_bytes = 0
+
+        # Somme des tailles stockées (Taille calculée et attribuée dans allocate et compute_norm_factor_from_csr)
+        total_bytes += getattr(self, 'row_ptr_gpu_size', 0)
+        total_bytes += getattr(self, 'col_ind_gpu_size', 0)
+        total_bytes += getattr(self, 'values_gpu_size', 0)
+        total_bytes += getattr(self, 'norm_factor_inv_gpu_size', 0)
+
+        return total_bytes / (1024**3)
 
     def free(self):
         try:
@@ -248,5 +271,4 @@ class SparseSMatrix_CSR:
         num_cols = int(self.Z * self.X)
         total_nnz = int(self.row_ptr[-1])
         density = total_nnz / (num_rows * num_cols)
-        return density
-
+        return density
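The reworked load_precompiled_module above drops JIT compilation and requires a pre-built AOT_biomaps_kernels.cubin resolved one directory above the module (under AOT_Recon/). Below is a standalone sketch of the same loading pattern with PyCUDA; the path handling and the kernel name queried here are illustrative, not a prescribed API.

# Minimal sketch of loading a pre-compiled .cubin with PyCUDA (illustrative only).
import os
import pycuda.autoinit           # creates a CUDA context on the default device
import pycuda.driver as drv

cubin_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AOT_biomaps_kernels.cubin")
if not os.path.exists(cubin_path):
    raise FileNotFoundError(f"Missing pre-compiled CUDA module: {cubin_path}")

module = drv.module_from_file(cubin_path)                 # fails if the cubin targets another GPU arch
kernel = module.get_function("projection_kernel__SELL")   # kernel name as used in PDHG.py above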