AOT-biomaps 2.9.279__py3-none-any.whl → 2.9.300__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of AOT-biomaps might be problematic.
- AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +16 -19
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +193 -109
- AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +442 -11
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +8 -15
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +26 -23
- AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
- AOT_biomaps/AOT_Recon/AlgebraicRecon.py +2 -8
- AOT_biomaps/AOT_Recon/PrimalDualRecon.py +94 -41
- AOT_biomaps/AOT_Recon/ReconTools.py +78 -1
- AOT_biomaps/__init__.py +22 -1
- {aot_biomaps-2.9.279.dist-info → aot_biomaps-2.9.300.dist-info}/METADATA +1 -1
- {aot_biomaps-2.9.279.dist-info → aot_biomaps-2.9.300.dist-info}/RECORD +14 -14
- {aot_biomaps-2.9.279.dist-info → aot_biomaps-2.9.300.dist-info}/WHEEL +0 -0
- {aot_biomaps-2.9.279.dist-info → aot_biomaps-2.9.300.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,10 @@
-from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star
+from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star, _call_axpby, _call_minus_axpy, compute_TV_cpu, power_method_estimate_L__SELL, calculate_memory_requirement, check_gpu_memory
 from AOT_biomaps.Config import config
-from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType
+from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType, SMatrixType
 import torch
 from tqdm import trange
+import numpy as np
+import pycuda.driver as drv
 
 '''
 This module implements Primal-Dual Hybrid Gradient (PDHG) methods for solving inverse problems in Acousto-Optic Tomography.
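
Note: as a reading aid only, the following is a minimal NumPy sketch of the generic Chambolle-Pock (PDHG) saddle-point iteration that the CP_* routines in this module implement on GPU. The callables K, Kt, prox_F_star and prox_G are placeholders standing in for the package's own operators and ReconTools helpers, not the actual implementations.

    import numpy as np

    def chambolle_pock(K, Kt, prox_F_star, prox_G, x0, n_iter=100, L=None, theta=1.0):
        # Generic PDHG loop for min_x F(Kx) + G(x).
        # L estimates ||K||; choosing tau * sigma * L**2 <= 1 is the usual convergence condition.
        if L is None:
            L = 1.0
        tau = sigma = 1.0 / L
        x = x0.copy()
        x_bar = x0.copy()
        y = np.zeros_like(K(x0))
        for _ in range(n_iter):
            y = prox_F_star(y + sigma * K(x_bar), sigma)   # dual ascent + prox of F*
            x_old = x
            x = prox_G(x_old - tau * Kt(y), tau)           # primal descent + prox of G
            x_bar = x + theta * (x - x_old)                # over-relaxation / extrapolation
        return x
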
@@ -11,6 +13,103 @@ The methods can run on both CPU and GPU, with configurations set in the AOT_biomaps
 '''
 
 def CP_TV(
+    SMatrix,
+    y,
+    alpha=None,              # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4,               # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=5000,
+    isSavingEachIteration=True,
+    L=None,
+    withTumor=True,
+    device=None,
+    max_saves=5000,
+    show_logs=True,
+    smatrixType=SMatrixType.SELL,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05,   # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False,  # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True,          # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0     # multiply beta for laplacian term if you want separate scaling
+):
+    # try:
+    tumor_str = "WITH" if withTumor else "WITHOUT"
+    # Auto-select device and method
+    if device is None:
+        if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+            device = torch.device(f"cuda:{config.select_best_gpu()}")
+            use_gpu = True
+        else:
+            device = torch.device("cpu")
+            use_gpu = False
+    else:
+        use_gpu = device.type == "cuda"
+    # Dispatch to the appropriate implementation
+    if use_gpu:
+        if smatrixType == SMatrixType.CSR:
+            raise NotImplementedError("GPU Chambolle Pock (LS-TV) with CSR not implemented.")
+        elif smatrixType == SMatrixType.SELL:
+            return CP_TV_Tikhonov_sparseSELL_pycuda(SMatrix, y, alpha, beta, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs, k_security, use_power_method, auto_alpha_gamma, apply_positivity_clamp, tikhonov_as_gradient, use_laplacian, laplacian_beta_scale)
+        elif smatrixType == SMatrixType.DENSE:
+            return CP_TV_dense(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
+        else:
+            raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-TV).")
+    else:
+        raise NotImplementedError("CPU Chambolle Pock (LS-TV) not implemented.")
+
+def CP_KL(
+    SMatrix,
+    y,
+    alpha=None,              # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4,               # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=5000,
+    isSavingEachIteration=True,
+    L=None,
+    withTumor=True,
+    device=None,
+    max_saves=5000,
+    show_logs=True,
+    smatrixType=SMatrixType.SELL,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05,   # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False,  # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True,          # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0     # multiply beta for laplacian term if you want separate scaling
+):
+    # try:
+    tumor_str = "WITH" if withTumor else "WITHOUT"
+    # Auto-select device and method
+    if device is None:
+        if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+            device = torch.device(f"cuda:{config.select_best_gpu()}")
+            use_gpu = True
+        else:
+            device = torch.device("cpu")
+            use_gpu = False
+    else:
+        use_gpu = device.type == "cuda"
+    # Dispatch to the appropriate implementation
+    if use_gpu:
+        if smatrixType == SMatrixType.CSR:
+            raise NotImplementedError("GPU Chambolle Pock (LS-KL) with CSR not implemented.")
+        elif smatrixType == SMatrixType.SELL:
+            raise NotImplementedError("GPU Chambolle Pock (LS-KL) with SELL not implemented.")
+        elif smatrixType == SMatrixType.DENSE:
+            return CP_KL(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
+        else:
+            raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-KL).")
+    else:
+        raise NotImplementedError("CPU Chambolle Pock (LS-KL) not implemented.")
+
+
+
+
+def CP_TV_dense(
     SMatrix,
     y,
     alpha=1e-1,
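
Note: as an illustration only, a call to the new CP_TV dispatcher might look like the sketch below. The argument names come from the new 2.9.300 signature shown above; `smatrix` (an already-allocated SELL matrix object) and the measurement array `y` are assumed to be produced elsewhere in the package and are not constructed here.

    # Hypothetical usage sketch of the new CP_TV dispatcher.
    from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType
    from AOT_biomaps.AOT_Recon.AOT_Optimizers.PDHG import CP_TV

    recons, saved_its = CP_TV(
        smatrix, y,
        alpha=None,        # let the auto-alpha heuristic pick the TV weight
        beta=1e-4,         # Tikhonov weight
        numIterations=2000,
        smatrixType=SMatrixType.SELL,
        show_logs=True,
    )
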
@@ -21,6 +120,7 @@ def CP_TV(
     withTumor=True,
     device=None,
     max_saves=5000,
+    show_logs=True,
 ):
     """
     Chambolle-Pock algorithm for Total Variation (TV) regularization.
@@ -92,10 +192,10 @@ def CP_TV(
     # Description for progress bar
     tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
     device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
-    description = f"AOT-BioMaps -- Primal/Dual Reconstruction (TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
+    description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
 
-
-    for
+    iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+    for it in iterator:
         # Update p (TV proximal step)
         grad_x = gradient(x_tilde.reshape(Z, X))
         p = proj_l2(p + sigma * grad_x, alpha)
@@ -113,9 +213,9 @@ def CP_TV(
         x_tilde = x + theta * (x - x_old)
 
         # Save intermediate result if needed
-        if isSavingEachIteration and
+        if isSavingEachIteration and it in save_indices:
             I_reconMatrix.append(x.reshape(Z, X).clone() * (norm_y / norm_A))
-            saved_indices.append(
+            saved_indices.append(it)
 
     # Return results
     if isSavingEachIteration:
@@ -123,6 +223,337 @@ def CP_TV(
     else:
         return (x.reshape(Z, X) * (norm_y / norm_A)).cpu().numpy(), None
 
+def CP_TV_Tikhonov_sparseSELL_pycuda(
+    SMatrix,
+    y,
+    alpha=None,              # TV regularization parameter (if None, alpha is auto-scaled)
+    beta=1e-4,               # Tikhonov regularization parameter
+    theta=1.0,
+    numIterations=2000,
+    isSavingEachIteration=True,
+    L=None,
+    tumor_str="",
+    device=None,
+    max_saves=2000,
+    show_logs=True,
+    k_security=0.8,
+    use_power_method=True,
+    auto_alpha_gamma=0.05,   # gamma for auto alpha: alpha = gamma * data_term / tv_term
+    apply_positivity_clamp=True,
+    tikhonov_as_gradient=False,  # if True, apply -tau*2*beta*x instead of prox multiplicative
+    use_laplacian=True,          # enable Laplacian (Hessian scalar) penalty
+    laplacian_beta_scale=1.0     # multiply beta for laplacian term if you want separate scaling
+):
+
+    """
+    CP-TV + Tikhonov + Laplacian (Hessian scalar) penalty integrated.
+    Returns (I_reconMatrix, saved_indices) if isSavingEachIteration else (x_final, None).
+    """
+    # ----- begin main -----
+    if SMatrix.ctx:
+        SMatrix.ctx.push()
+
+    # prepare variables
+    dtype = np.float32
+    TN = int(SMatrix.N * SMatrix.T)
+    ZX = int(SMatrix.Z * SMatrix.X)
+    Z, X = SMatrix.Z, SMatrix.X
+    block_size = 256
+
+    # existing kernels
+    projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+    backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+    axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
+    minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
+    gradient_kernel = SMatrix.sparse_mod.get_function("gradient_kernel")
+    divergence_kernel = SMatrix.sparse_mod.get_function("divergence_kernel")
+    proj_tv_kernel = SMatrix.sparse_mod.get_function("proj_tv_kernel")
+
+    # optional kernels (laplacian & clamp)
+    has_laplacian = False
+    has_clamp_kernel = False
+    try:
+        laplacian_kernel = SMatrix.sparse_mod.get_function("laplacian_kernel")
+        laplacian_adj_kernel = SMatrix.sparse_mod.get_function("laplacian_adj_kernel")
+        has_laplacian = True
+    except Exception:
+        has_laplacian = False
+
+    try:
+        clamp_positive_kernel = SMatrix.sparse_mod.get_function("clamp_positive_kernel")
+        has_clamp_kernel = True
+    except Exception:
+        has_clamp_kernel = False
+
+    stream = drv.Stream()
+
+    # estimate L operator norm if needed
+    if use_power_method or L is None:
+        L_LS_sq = power_method_estimate_L__SELL(SMatrix, stream, n_it=20, block_size=block_size)
+        L_nabla_sq = 8.0
+        L_op_norm = np.sqrt(L_LS_sq + L_nabla_sq)
+        if L_op_norm < 1e-6:
+            L_op_norm = 1.0
+    else:
+        L_op_norm = L
+
+    tau = np.float32(k_security / L_op_norm)
+    sigma = np.float32(k_security / L_op_norm)
+
+    # prepare y and normalization
+    y = y.T.astype(dtype).reshape(-1)
+    maxy = float(np.max(np.abs(y))) if y.size > 0 else 0.0
+    if maxy > 0:
+        y_normed = (y / maxy).copy()
+    else:
+        y_normed = y.copy()
+
+    # GPU allocations
+    bufs = []
+    y_gpu = drv.mem_alloc(y_normed.nbytes); bufs.append(y_gpu)
+    drv.memcpy_htod_async(y_gpu, y_normed.T.flatten(), stream)
+
+    x_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_gpu)
+    drv.memset_d32_async(x_gpu, 0, ZX, stream)
+    x_old_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_old_gpu)
+    x_tilde_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_tilde_gpu)
+    drv.memcpy_dtod_async(x_tilde_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
+
+    p_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(p_gpu)
+    q_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(q_gpu)
+    drv.memset_d32_async(p_gpu, 0, 2 * ZX, stream)
+    drv.memset_d32_async(q_gpu, 0, TN, stream)
+
+    grad_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(grad_gpu)
+    div_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(div_gpu)
+    Ax_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(Ax_gpu)
+    ATq_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(ATq_gpu)
+    zero_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(zero_gpu)
+    drv.memset_d32_async(zero_gpu, 0, ZX, stream)
+
+    # Laplacian buffers (if enabled and kernel available)
+    use_lap = use_laplacian and has_laplacian and (beta > 0)
+    if use_lap:
+        lap_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(lap_gpu)
+        r_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(r_gpu)
+        drv.memset_d32_async(r_gpu, 0, ZX, stream)
+        # scalar beta for laplacian (allow separate scale)
+        beta_lap = float(beta) * float(laplacian_beta_scale)
+        inv_1_plus_sigma_beta = np.float32(1.0 / (1.0 + float(sigma) * beta_lap))
+
+    # host buffers for logs
+    x_host = np.empty(ZX, dtype=dtype)
+    Ax_host = np.empty(TN, dtype=dtype)
+    q_host = np.empty(TN, dtype=dtype)
+    p_host = np.empty(2 * ZX, dtype=dtype)
+    ATq_host = np.empty(ZX, dtype=dtype)
+
+    # compute initial backprojection for auto-alpha
+    drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
+    backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                          y_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                          block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+    stream.synchronize()
+    drv.memcpy_dtoh(x_host, ATq_gpu)
+
+    # auto alpha if requested
+    if alpha is None:
+        drv.memcpy_htod_async(x_gpu, x_host, stream)
+        projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                          x_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                          block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+        stream.synchronize()
+        drv.memcpy_dtoh(Ax_host, Ax_gpu)
+        resid = Ax_host - y_normed[:TN]
+        data_term = 0.5 * float(np.dot(resid, resid))
+        tv_term = float(compute_TV_cpu(x_host, Z, X)) + 1e-12
+        alpha = float(auto_alpha_gamma * data_term / tv_term)
+        if show_logs:
+            print(f"[auto-alpha] data_term={data_term:.6e}, tv_term={tv_term:.6e}, alpha_set={alpha:.6e}")
+
+    # tikhonov prox multiplicative scale
+    if tikhonov_as_gradient:
+        tikh_scale = None
+    else:
+        tikh_scale = np.float32(1.0 / (1.0 + 2.0 * tau * beta)) if beta > 0 else np.float32(1.0)
+
+    # saving policy
+    if numIterations <= max_saves:
+        save_indices_all = list(range(0, numIterations + 1))
+    else:
+        step = max(1, numIterations // max_saves)
+        save_indices_all = list(range(0, numIterations + 1, step))
+
+    device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
+    if show_logs:
+        if (alpha is None or alpha == 0) and (beta is None or beta == 0):
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS) -- {tumor_str} -- {device_str}"
+        if alpha is None or alpha == 0:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-Tikhonov) -- {tumor_str} -- {device_str}"
+        elif beta is None or beta == 0:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) -- {tumor_str} -- {device_str}"
+        else:
+            print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
+            description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV-Tikhonov) -- {tumor_str} -- {device_str}"
+
+    I_reconMatrix = []
+    saved_indices = []
+    if isSavingEachIteration and 0 in save_indices_all:
+        drv.memcpy_dtoh(x_host, x_gpu)
+        x0 = x_host.reshape((Z, X)).copy()
+        if maxy > 0:
+            x0 *= maxy
+        I_reconMatrix.append(x0)
+        saved_indices.append(0)
+
+    # main loop
+    try:
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+        for it in iterator:
+            # 1) dual p update (TV)
+            gradient_kernel(grad_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                            block=(block_size, 1, 1),
+                            grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                            stream=stream)
+            _call_axpby(axpby_kernel, p_gpu, p_gpu, grad_gpu, 1.0, sigma, 2 * ZX, stream, block_size)
+            proj_tv_kernel(p_gpu, np.float32(alpha), np.int32(ZX),
+                           block=(block_size, 1, 1),
+                           grid=((ZX + block_size - 1) // block_size, 1, 1),
+                           stream=stream)
+
+            # 2) dual q update (data fidelity)
+            projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                              x_tilde_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                              block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+            _call_axpby(axpby_kernel, Ax_gpu, Ax_gpu, y_gpu, 1.0, -1.0, TN, stream, block_size)
+            _call_axpby(axpby_kernel, q_gpu, q_gpu, Ax_gpu, 1.0 / (1.0 + sigma), sigma / (1.0 + sigma), TN, stream, block_size)
+
+            # optional Laplacian dual update
+            if use_lap:
+                # compute Laplacian of x_tilde -> lap_gpu
+                laplacian_kernel(lap_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                                 block=(block_size, 1, 1),
+                                 grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                                 stream=stream)
+                # r = r + sigma * lap
+                _call_axpby(axpby_kernel, r_gpu, r_gpu, lap_gpu, 1.0, sigma, ZX, stream, block_size)
+                # r = r / (1 + sigma * beta_lap)
+                _call_axpby(axpby_kernel, r_gpu, r_gpu, zero_gpu, inv_1_plus_sigma_beta, 0.0, ZX, stream, block_size)
+
+            # 3) primal x update
+            drv.memcpy_dtod_async(x_old_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
+            divergence_kernel(div_gpu, p_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                              block=(block_size, 1, 1),
+                              grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                              stream=stream)
+            drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
+            backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                                  q_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+                                  block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+            # ATq - div
+            _call_minus_axpy(minus_axpy_kernel, ATq_gpu, div_gpu, 1.0, ZX, stream, block_size)
+
+            # if laplacian is used, add H^T r into ATq
+            if use_lap:
+                # compute laplacian_adj_kernel(temp, r)
+                # reuse grad_gpu as temporary if safe (its content used earlier, but not reused until later)
+                laplacian_adj_kernel(grad_gpu, r_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+                                     block=(block_size, 1, 1),
+                                     grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+                                     stream=stream)
+                # ATq_gpu += temp (grad_gpu)
+                _call_axpby(axpby_kernel, ATq_gpu, ATq_gpu, grad_gpu, 1.0, 1.0, ZX, stream, block_size)
+
+            # x = x_old - tau * ATq_buffer
+            _call_minus_axpy(minus_axpy_kernel, x_gpu, ATq_gpu, tau, ZX, stream, block_size)
+
+            # Tikhonov
+            if beta > 0:
+                if tikhonov_as_gradient:
+                    mul = 1.0 - 2.0 * float(tau) * float(beta)
+                    if mul <= 0.0:
+                        # fallback to prox multiplicative stable
+                        fallback_scale = np.float32(1.0 / (1.0 + 2.0 * float(tau) * float(beta)))
+                        _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, fallback_scale, 0.0, ZX, stream, block_size)
+                    else:
+                        # x *= mul => implemented as axpby: out = 1*x + (mul-1)*x
+                        _call_axpby(axpby_kernel, x_gpu, x_gpu, x_gpu, 1.0, np.float32(mul - 1.0), ZX, stream, block_size)
+                else:
+                    _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, tikh_scale, np.float32(0.0), ZX, stream, block_size)
+
+            # positivity clamp (prefer GPU kernel if available)
+            if apply_positivity_clamp:
+                if has_clamp_kernel:
+                    # in-place clamp on GPU
+                    clamp_positive_kernel(x_gpu, np.int32(ZX),
+                                          block=(block_size, 1, 1),
+                                          grid=((ZX + block_size - 1) // block_size, 1, 1),
+                                          stream=stream)
+                else:
+                    # fallback CPU roundtrip (slower)
+                    stream.synchronize()
+                    drv.memcpy_dtoh(x_host, x_gpu)
+                    np.maximum(x_host, 0.0, out=x_host)
+                    drv.memcpy_htod_async(x_gpu, x_host, stream)
+
+            # extrapolation
+            _call_axpby(axpby_kernel, x_tilde_gpu, x_gpu, x_old_gpu, np.float32(1.0 + theta), np.float32(-theta), ZX, stream, block_size)
+
+            # saves
+            if isSavingEachIteration and (it + 1) in save_indices_all:
+                stream.synchronize()
+                drv.memcpy_dtoh(x_host, x_gpu)
+                x_saved = x_host.reshape((Z, X)).copy()
+                if maxy > 0:
+                    x_saved *= maxy
+                I_reconMatrix.append(x_saved)
+                saved_indices.append(it + 1)
+
+        stream.synchronize()
+        drv.memcpy_dtoh(x_host, x_gpu)
+        x_final = x_host.reshape((Z, X)).copy()
+        if maxy > 0:
+            x_final *= maxy
+        if isSavingEachIteration and len(I_reconMatrix):
+            for i in range(len(I_reconMatrix)):
+                I_reconMatrix[i] *= maxy
+
+        # free buffers
+        for buff in bufs:
+            try:
+                buff.free()
+            except:
+                pass
+
+        if SMatrix.ctx:
+            SMatrix.ctx.pop()
+
+        if isSavingEachIteration:
+            return I_reconMatrix, saved_indices
+        else:
+            return x_final, None
+
+    except Exception as e:
+        # cleanup robustly
+        print("Error in CP_TV_Tikhonov+Lap (robust):", e)
+        try:
+            for buff in bufs:
+                try:
+                    buff.free()
+                except:
+                    pass
+        except:
+            pass
+        try:
+            if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+                SMatrix.ctx.pop()
+        except:
+            pass
+        raise
+
 
 def CP_KL(
     SMatrix,
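
Note: the step-size and auto-alpha logic added above can be restated in plain NumPy for readers who do not follow the PyCUDA calls. This is a simplified sketch for a dense forward operator A, with stand-ins for the package helpers power_method_estimate_L__SELL and compute_TV_cpu; it is not a second implementation of the SELL routine.

    import numpy as np

    def estimate_operator_norm_sq(A, n_it=20, seed=None):
        # Power method on A^T A: estimates ||A||_2 ** 2 (role of power_method_estimate_L__SELL).
        rng = np.random.default_rng(seed)
        v = rng.standard_normal(A.shape[1]).astype(np.float32)
        for _ in range(n_it):
            v = A.T @ (A @ v)
            v /= np.linalg.norm(v) + 1e-12
        return float(v @ (A.T @ (A @ v)))

    def pdhg_steps(A, k_security=0.8):
        # tau = sigma = k_security / ||[A; grad]||, with ||grad||^2 bounded by 8 in 2-D,
        # mirroring the L_nabla_sq = 8.0 constant used above.
        L_op = np.sqrt(estimate_operator_norm_sq(A) + 8.0)
        return k_security / L_op, k_security / L_op

    def auto_alpha(A, y, gamma=0.05):
        # Auto-alpha heuristic: alpha = gamma * data_term / tv_term at x = A^T y.
        x = A.T @ y
        resid = A @ x - y
        data_term = 0.5 * float(resid @ resid)
        tv_term = float(np.abs(np.diff(x)).sum()) + 1e-12   # 1-D TV stand-in for compute_TV_cpu
        return gamma * data_term / tv_term
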
@@ -132,9 +563,10 @@ def CP_KL(
     numIterations=5000,
     isSavingEachIteration=True,
     L=None,
-
+    tumor_str="",
     device=None,
     max_saves=5000,
+    show_logs=True,
 ):
     """
     Chambolle-Pock algorithm for Kullback-Leibler (KL) divergence regularization.
@@ -193,12 +625,11 @@ def CP_KL(
     saved_indices = [0]
 
     # Description for progress bar
-    tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
     device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
     description = f"AOT-BioMaps -- Primal/Dual Reconstruction (KL) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
 
-
-    for iteration in
+    iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+    for iteration in iterator:
         # Update q (proximal step for F*)
         q = prox_F_star(q + sigma * P(x_tilde) - sigma * y_flat, sigma, y_flat)
 
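
Note: the dual step in the CP_KL loop above relies on prox_F_star from ReconTools, whose body is not shown in this diff. For orientation only, the snippet below gives the textbook closed form of the conjugate prox for the standard Poisson/KL data term (Chambolle & Pock, 2011). Because the call in the diff passes the shifted argument q + sigma*(P(x_tilde) - y_flat), the package's actual formula may differ from this unshifted version; treat it as background, not as the library's implementation.

    import numpy as np

    def prox_F_star_kl(q_tilde, sigma, y):
        # Textbook prox of the conjugate of F(z) = sum(z - y*log(z)):
        # prox_{sigma F*}(q) = (1 + q - sqrt((q - 1)^2 + 4*sigma*y)) / 2
        return 0.5 * (1.0 + q_tilde - np.sqrt((q_tilde - 1.0) ** 2 + 4.0 * sigma * y))
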
@@ -224,27 +224,20 @@ class SparseSMatrix_CSR:
     def getMatrixSize(self):
         """
         Returns the total size of the CSR matrix in GB (by summing the GPU memory).
+        Uses the stored size attributes to work around the AttributeError on DeviceAllocation.
         """
+        # Note: the caller must ensure that self.row_ptr exists before this call.
         if self.row_ptr is None:
             return {"error": "La matrice sparse n'est pas encore allouée."}
 
         total_bytes = 0
-
-        # GPU memory (row_ptr_gpu, col_ind_gpu, values_gpu, norm_factor_inv_gpu)
-        if hasattr(self, 'row_ptr_gpu') and self.row_ptr_gpu:
-            total_bytes += self.row_ptr_gpu.size
-        if hasattr(self, 'col_ind_gpu') and self.col_ind_gpu:
-            total_bytes += self.col_ind_gpu.size
-        if hasattr(self, 'values_gpu') and self.values_gpu:
-            total_bytes += self.values_gpu.size
-        if hasattr(self, 'norm_factor_inv_gpu') and self.norm_factor_inv_gpu:
-            total_bytes += self.norm_factor_inv_gpu.size
-
-        # NOTE: Previous versions used .size on the DeviceAllocation object,
-        # which was problematic. If the error reappears here, the size in bytes
-        # will have to be stored as was done for SELL.
-        # For now, we keep CSR's original getMatrixSize method.
 
+        # Sum of the stored sizes (computed and assigned in allocate and compute_norm_factor_from_csr)
+        total_bytes += getattr(self, 'row_ptr_gpu_size', 0)
+        total_bytes += getattr(self, 'col_ind_gpu_size', 0)
+        total_bytes += getattr(self, 'values_gpu_size', 0)
+        total_bytes += getattr(self, 'norm_factor_inv_gpu_size', 0)
+
         return total_bytes / (1024**3)
 
     def free(self):
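
Note: the new getMatrixSize relies on *_gpu_size attributes being recorded at allocation time, because (as the removed comments note) reading .size on a pycuda DeviceAllocation after the fact raised an AttributeError. A minimal sketch of that pattern is given below; the helper name alloc_tracked and the surrounding object are illustrative assumptions, only the attribute-naming convention comes from the diff.

    import pycuda.driver as drv

    def alloc_tracked(obj, name, nbytes):
        # Allocate GPU memory (an active pycuda context is assumed) and record its size
        # as <name>_size on the owner object, so getMatrixSize() can later sum the sizes
        # without touching DeviceAllocation internals.
        buf = drv.mem_alloc(nbytes)
        setattr(obj, name, buf)
        setattr(obj, name + "_size", nbytes)
        return buf

    # e.g. inside allocate():
    #   alloc_tracked(self, "values_gpu", values_host.nbytes)
    #   alloc_tracked(self, "col_ind_gpu", col_ind_host.nbytes)
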
@@ -92,13 +92,11 @@ class SparseSMatrix_SELL:
     def allocate(self):
         """
         Build SELL-C-σ directly from manip AcousticFields in streaming blocks.
-
+        Corrected: per-block row_nnz copy, zeroing of host block, proper sync.
         """
         if self.sparse_mod is None:
             raise RuntimeError("CUDA module not loaded. Check compilation.")
 
-        # NOTE: The kernel names (count_nnz_rows_kernel, fill_kernel__SELL) are used
-        # because they are present in the working class.
         count_kernel = self.sparse_mod.get_function("count_nnz_rows_kernel")
         fill_kernel = self.sparse_mod.get_function("fill_kernel__SELL")
 
@@ -106,16 +104,14 @@ class SparseSMatrix_SELL:
         num_cols = int(self.Z * self.X)
         C = int(self.slice_height)
 
-        # host temporary block
         br = int(self.block_rows)
-        bytes_per_elem = np.dtype(np.float32).itemsize
         dense_host = np.empty((br, num_cols), dtype=np.float32)
 
-        # Allocation
+        # Allocation dense buffer on device (size = br * num_cols)
         dense_gpu_size = dense_host.nbytes
         dense_gpu = drv.mem_alloc(dense_gpu_size)
 
-        # 1) count nnz per row (
+        # 1) count nnz per row (per block)
         row_nnz = np.zeros(num_rows, dtype=np.int32)
         row_nnz_gpu_block_size = br * np.dtype(np.int32).itemsize
         row_nnz_gpu_block = drv.mem_alloc(row_nnz_gpu_block_size)
@@ -123,17 +119,19 @@ class SparseSMatrix_SELL:
         block = 256
         for b in trange(0, num_rows, br, desc="Count NNZ per row"):
             R = min(br, num_rows - b)
-            #
+            # zero the host block to avoid garbage in tail when R < br
+            dense_host.fill(0.0)
             for i in range(R):
                 rg = b + i
                 n_idx = rg // self.T
                 t_idx = rg % self.T
                 dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
-            # copy
+            # copy whole buffer (safe because we zeroed tail)
             drv.memcpy_htod(dense_gpu, dense_host)
             grid = ((R + block - 1) // block, 1, 1)
             count_kernel(dense_gpu, row_nnz_gpu_block, np.int32(R), np.int32(num_cols), np.float32(self.relative_threshold),
-
+                         block=(block,1,1), grid=grid)
+            drv.Context.synchronize()
             tmp = np.empty(R, dtype=np.int32)
             drv.memcpy_dtoh(tmp, row_nnz_gpu_block)
             row_nnz[b:b+R] = tmp
@@ -148,7 +146,6 @@ class SparseSMatrix_SELL:
             r0 = s * C
             r1 = min(num_rows, r0 + C)
             slice_len[s] = int(np.max(row_nnz[r0:r1])) if (r1>r0) else 0
-        # slice_ptr (int64)
         slice_ptr = np.zeros(num_slices + 1, dtype=np.int64)
         for s in range(num_slices):
             slice_ptr[s+1] = slice_ptr[s] + (slice_len[s] * C)
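
Note: to make the slice_len/slice_ptr bookkeeping above easier to follow, here is a small host-side sketch of how a SELL-C (slice height C) matrix-vector product reads that layout. It assumes the common "column-major within a slice" packing with zero-valued padding; the actual packing produced by fill_kernel__SELL is not shown in this diff and may differ, so this is only an illustration of the data structure, not the package's CUDA kernel.

    import numpy as np

    def sell_spmv(values, colinds, slice_ptr, slice_len, C, num_rows, x):
        # Entry j of row r in slice s is assumed to live at
        # slice_ptr[s] + j * C + (r - s * C); padded entries hold value 0.
        y = np.zeros(num_rows, dtype=values.dtype)
        num_slices = (num_rows + C - 1) // C
        for s in range(num_slices):
            base = slice_ptr[s]
            for local_r in range(min(C, num_rows - s * C)):
                r = s * C + local_r
                acc = 0.0
                for j in range(slice_len[s]):
                    idx = base + j * C + local_r
                    acc += values[idx] * x[colinds[idx]]
                y[r] = acc
        return y
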
@@ -160,9 +157,14 @@
         self.sell_values_gpu_size = total_storage * np.dtype(np.float32).itemsize
         self.sell_colinds_gpu_size = total_storage * np.dtype(np.uint32).itemsize
 
+        # allocate and optionally zero them
         self.sell_values_gpu = drv.mem_alloc(self.sell_values_gpu_size)
+        # It's good practice to zero the values buffer to avoid leftover memory
+        drv.memset_d32(self.sell_values_gpu, 0, total_storage)
+
         self.sell_colinds_gpu = drv.mem_alloc(self.sell_colinds_gpu_size)
-
+        drv.memset_d32(self.sell_colinds_gpu, 0, total_storage)
+
         # allocate slice metadata on device
         self.slice_ptr = slice_ptr
         self.slice_len = slice_len
@@ -177,29 +179,28 @@ class SparseSMatrix_SELL:
         drv.memcpy_htod(self.slice_len_gpu, self.slice_len)
 
         # 3) fill SELL arrays by streaming blocks again (use GPU fill kernel)
-        # reuse dense_host and allocate new dense_gpu
         dense_host = np.empty((br, num_cols), dtype=np.float32)
+        dense_gpu = drv.mem_alloc(dense_host.nbytes)
 
-
-
-
-        # we also need row_nnz on device per-block; supply global row_nnz on host but the kernel recomputes threshold
-        row_nnz_host_gpu_size = br * np.dtype(np.int32).itemsize
-        row_nnz_host_gpu = drv.mem_alloc(row_nnz_host_gpu_size)
+        # For per-block row_nnz pointer we allocate a buffer of max block size once, then reuse
+        row_nnz_host_gpu = drv.mem_alloc(br * np.dtype(np.int32).itemsize)
 
         for b in trange(0, num_rows, br, desc="Fill SELL"):
             R = min(br, num_rows - b)
+            dense_host.fill(0.0)
             for i in range(R):
                 rg = b + i
                 n_idx = rg // self.T
                 t_idx = rg % self.T
                 dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+            # copy host block
             drv.memcpy_htod(dense_gpu, dense_host)
-            #
-
+            # copy corresponding row_nnz slice (only R entries)
+            drv.memcpy_htod(row_nnz_host_gpu, row_nnz[b:b+R])
+
             grid = ((R + block - 1) // block, 1, 1)
             fill_kernel(dense_gpu,
-
+                        row_nnz_host_gpu,
                         self.slice_ptr_gpu,
                         self.slice_len_gpu,
                         self.sell_colinds_gpu,
@@ -210,12 +211,14 @@
                        np.int32(C),
                        np.float32(self.relative_threshold),
                        block=(block,1,1), grid=grid)
+            drv.Context.synchronize()
+
        dense_gpu.free()
        row_nnz_host_gpu.free()
 
        # 4) compute norm_factor_inv via GPU accumulate (col sums)
        self.compute_norm_factor()
-
+
 
    def apply_apodization_gpu(self, window_vector_gpu):
        """
        Applies the apodization window directly on self.sell_values_gpu
|