AOT-biomaps 2.9.138__py3-none-any.whl → 2.9.279__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of AOT-biomaps might be problematic. Click here for more details.
- AOT_biomaps/AOT_Acoustic/AcousticTools.py +35 -115
- AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
- AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +22 -18
- AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
- AOT_biomaps/AOT_Experiment/_mainExperiment.py +102 -68
- AOT_biomaps/AOT_Optic/_mainOptic.py +124 -58
- AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +72 -108
- AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +474 -289
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +173 -68
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +360 -154
- AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +150 -111
- AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +281 -0
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +328 -0
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
- AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
- AOT_biomaps/AOT_Recon/AlgebraicRecon.py +359 -238
- AOT_biomaps/AOT_Recon/AnalyticRecon.py +29 -41
- AOT_biomaps/AOT_Recon/BayesianRecon.py +165 -91
- AOT_biomaps/AOT_Recon/DeepLearningRecon.py +4 -1
- AOT_biomaps/AOT_Recon/PrimalDualRecon.py +175 -31
- AOT_biomaps/AOT_Recon/ReconEnums.py +38 -3
- AOT_biomaps/AOT_Recon/ReconTools.py +184 -77
- AOT_biomaps/AOT_Recon/__init__.py +1 -0
- AOT_biomaps/AOT_Recon/_mainRecon.py +144 -74
- AOT_biomaps/__init__.py +4 -36
- {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/METADATA +2 -1
- aot_biomaps-2.9.279.dist-info/RECORD +47 -0
- aot_biomaps-2.9.138.dist-info/RECORD +0 -43
- {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/WHEEL +0 -0
- {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/top_level.txt +0 -0
|
@@ -1,314 +1,499 @@
|
|
|
1
1
|
from AOT_biomaps.Config import config
|
|
2
|
-
import
|
|
2
|
+
from AOT_biomaps.AOT_Recon.ReconTools import calculate_memory_requirement, check_gpu_memory
|
|
3
|
+
from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType
|
|
4
|
+
|
|
3
5
|
import torch
|
|
4
6
|
import numpy as np
|
|
5
|
-
import os
|
|
6
7
|
from tqdm import trange
|
|
8
|
+
import pycuda.driver as drv
|
|
9
|
+
import torch.cuda
|
|
10
|
+
import gc
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def LS(
    SMatrix,
    y,
    numIterations=100,
    isSavingEachIteration=True,
    withTumor=True,
    alpha=1e-1,
    device=None,
    use_numba=False,
    denominator_threshold=1e-6,
    max_saves=5000,
    show_logs=True,
    smatrixType=SMatrixType.SELL,
    Z=350,
):
    """
    Least-squares reconstruction entry point (GPU implementations only).

    Selects a device, then dispatches to the solver matching ``smatrixType``:

    * ``SMatrixType.CSR``   -> ``_LS_CG_sparseCSR_pycuda``
    * ``SMatrixType.SELL``  -> ``_LS_CG_sparseSELL_pycuda``
    * ``SMatrixType.DENSE`` -> ``_LS_GPU_stable``

    Args:
        SMatrix: system matrix, in the representation expected by the solver
            selected through ``smatrixType``.
        y: measured data, forwarded unchanged to the solver.
        numIterations: maximum number of solver iterations.
        isSavingEachIteration: forwarded to the solver; when True the solver
            returns a list of snapshots together with their iteration indices.
        withTumor: only used to build the "WITH"/"WITHOUT" label shown in the
            progress bar.
        alpha: fixed step size, used by the DENSE solver only.
        device: ``torch.device`` (or anything ``torch.device`` accepts).
            When None, the GPU returned by ``config.select_best_gpu()`` is used
            if CUDA is available and ``check_gpu_memory`` accepts it,
            otherwise the CPU is selected.
        use_numba: accepted for backward compatibility; not used here.
        denominator_threshold: forwarded to the sparse solvers.
        max_saves: forwarded to the solver.
        show_logs: forwarded to the solver and to ``check_gpu_memory``.
        smatrixType: member of ``SMatrixType`` selecting the solver.
        Z: accepted for backward compatibility; not used here.

    Returns:
        Whatever the selected solver returns (a 2-tuple).

    Raises:
        ValueError: ``smatrixType`` is not CSR, SELL or DENSE.
        NotImplementedError: the selected device is not a CUDA device.
    """
    tumor_str = "WITH" if withTumor else "WITHOUT"

    # Auto-select device. The GPU index is queried once so that the memory
    # check and the device actually used always refer to the same GPU.
    if device is None:
        use_gpu = False
        if torch.cuda.is_available():
            gpu_index = config.select_best_gpu()
            required_memory = calculate_memory_requirement(SMatrix, y)
            if check_gpu_memory(gpu_index, required_memory, show_logs=show_logs):
                device = torch.device(f"cuda:{gpu_index}")
                use_gpu = True
        if not use_gpu:
            device = torch.device("cpu")
    else:
        # Accept strings such as "cuda:0" as well as torch.device objects.
        device = torch.device(device)
        use_gpu = device.type == "cuda"

    if not use_gpu:
        raise NotImplementedError("Only GPU implementations are currently available for LS.")

    # Dispatch to the appropriate implementation. Argument lists follow each
    # solver's signature exactly (positional order matters).
    if smatrixType == SMatrixType.CSR:
        return _LS_CG_sparseCSR_pycuda(
            SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
            device, max_saves, denominator_threshold, show_logs,
        )
    if smatrixType == SMatrixType.SELL:
        return _LS_CG_sparseSELL_pycuda(
            SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
            device, max_saves, denominator_threshold, show_logs,
        )
    if smatrixType == SMatrixType.DENSE:
        return _LS_GPU_stable(
            SMatrix, y, numIterations, alpha, isSavingEachIteration, tumor_str,
            max_saves, show_logs,
        )
    raise ValueError("Unsupported SMatrixType for GPU LS.")
|
|
7
56
|
|
|
8
|
-
def
|
|
57
|
+
def _LS_GPU_stable(SMatrix, y, numIterations, alpha, isSavingEachIteration, tumor_str, max_saves=5000, show_logs=True, device=None):
    """
    Dense least-squares reconstruction by projected gradient descent with a
    diagonal preconditioner and a non-negativity projection.

    Args:
        SMatrix: numpy array of shape (T, Z, X, N); it is flattened to a
            (T*N, Z*X) matrix.
        y: numpy array with T*N elements (reshaped to a vector of length T*N).
        numIterations: number of gradient iterations.
        alpha: fixed step size applied to the preconditioned gradient.
        isSavingEachIteration: when True, snapshots of the estimate are
            collected and returned instead of the final estimate.
        tumor_str: label inserted in the progress-bar description.
        max_saves: when ``numIterations`` exceeds this value, snapshots are
            taken every ``numIterations // max_saves`` iterations (plus the
            last iteration).
        show_logs: show a tqdm progress bar when True.
        device: optional ``torch.device`` (or string). When None, the GPU
            returned by ``config.select_best_gpu()`` is used.

    Returns:
        ``(list_of_(Z, X)_arrays, list_of_iteration_indices)`` when
        ``isSavingEachIteration`` is True, otherwise ``((Z, X) array, None)``.
        A snapshot recorded at iteration ``it`` is the estimate at the start
        of that iteration (before its update).
    """
    if device is None:
        device = torch.device(f"cuda:{config.select_best_gpu()}")
    else:
        device = torch.device(device)

    T, Z, X, N = SMatrix.shape
    ZX = Z * X
    TN = T * N

    # 1. Conversion and normalization.
    # copy=True guarantees the tensors never share memory with the caller's
    # numpy arrays, so the in-place divisions below cannot modify the inputs.
    A_flat = (
        torch.from_numpy(SMatrix)
        .to(device=device, dtype=torch.float32, copy=True)
        .permute(0, 3, 1, 2)
        .reshape(TN, ZX)
    )
    y_flat = torch.from_numpy(y).to(device=device, dtype=torch.float32, copy=True).reshape(TN)
    scale_A = A_flat.max() + 1e-8
    scale_y = y_flat.max() + 1e-8
    A_flat.div_(scale_A)
    y_flat.div_(scale_y)
    # Exact inverse of the normalization above (same epsilon-protected
    # factors), so an all-zero matrix can no longer produce inf/NaN here.
    denorm = scale_y / scale_A

    # 2. Initialization
    lambda_k = torch.zeros(ZX, device=device)
    lambda_history = [] if isSavingEachIteration else None
    saved_indices = []  # iteration indices of the stored snapshots

    # Iterations at which a snapshot is stored (a set gives O(1) membership).
    if numIterations <= max_saves:
        save_indices = set(range(numIterations))
    else:
        step = max(1, numIterations // max(1, max_saves))
        save_indices = set(range(0, numIterations, step))
        save_indices.add(numIterations - 1)

    # Diagonal preconditioner: inverse of the squared column norms of A.
    diag_ATA = torch.sum(A_flat ** 2, dim=0)
    M_inv = 1.0 / torch.clamp(diag_ATA, min=1e-6)
    del diag_ATA

    # Pre-allocated work tensors, reused at every iteration.
    r_k = torch.empty_like(y_flat)
    AT_r = torch.empty(ZX, device=device)
    A_T = A_flat.T  # transposed view, no copy

    if show_logs:
        description = f"AOT-BioMaps -- Stable LS Reconstruction ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
        iterator = trange(numIterations, desc=description)
    else:
        iterator = range(numIterations)

    for it in iterator:
        # Residual r = y - A @ lambda, computed in the pre-allocated buffer.
        torch.matmul(A_flat, lambda_k, out=r_k)
        torch.sub(y_flat, r_k, out=r_k)

        if isSavingEachIteration and it in save_indices:
            # Move the snapshot to host memory immediately so the history
            # does not accumulate on the device.
            lambda_history.append((lambda_k.reshape(Z, X) * denorm).cpu().numpy())
            saved_indices.append(it)

        # Preconditioned gradient direction: M_inv * (A^T r).
        torch.matmul(A_T, r_k, out=AT_r)
        AT_r.mul_(M_inv)

        # Fixed-step update followed by projection onto the non-negative orthant.
        lambda_k.add_(AT_r, alpha=alpha)
        lambda_k.clamp_(min=0)

    # 3. De-normalization
    lambda_final = (lambda_k.reshape(Z, X) * denorm).cpu().numpy()

    # Free memory
    del A_flat, A_T, y_flat, r_k, AT_r, M_inv
    if device.type == "cuda":
        torch.cuda.empty_cache()

    if isSavingEachIteration:
        return lambda_history, saved_indices
    return lambda_final, None
|
|
119
|
+
|
|
120
|
+
def _LS_GPU_opti(*args, **kwargs):
    """Placeholder for an optimized GPU LS solver; always raises NotImplementedError."""
    raise NotImplementedError(
        "_LS_GPU_opti is not implemented; available LS solvers are "
        "_LS_GPU_stable, _LS_CG_sparseCSR_pycuda and _LS_CG_sparseSELL_pycuda."
    )
|
|
122
|
+
|
|
123
|
+
def _LS_GPU_multi(*args, **kwargs):
    """Placeholder for a multi-GPU LS solver; always raises NotImplementedError."""
    raise NotImplementedError(
        "_LS_GPU_multi is not implemented; available LS solvers are "
        "_LS_GPU_stable, _LS_CG_sparseCSR_pycuda and _LS_CG_sparseSELL_pycuda."
    )
|
|
125
|
+
|
|
126
|
+
def _LS_CPU_opti(*args, **kwargs):
    """Placeholder for an optimized CPU LS solver; always raises NotImplementedError."""
    raise NotImplementedError(
        "_LS_CPU_opti is not implemented; available LS solvers are "
        "_LS_GPU_stable, _LS_CG_sparseCSR_pycuda and _LS_CG_sparseSELL_pycuda."
    )
|
|
128
|
+
|
|
129
|
+
def _LS_CPU_basic(*args, **kwargs):
    """Placeholder for a basic CPU LS solver; always raises NotImplementedError."""
    raise NotImplementedError(
        "_LS_CPU_basic is not implemented; available LS solvers are "
        "_LS_GPU_stable, _LS_CG_sparseCSR_pycuda and _LS_CG_sparseSELL_pycuda."
    )
|
|
131
|
+
|
|
132
|
+
def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
    """
    Least-squares reconstruction by conjugate gradient (CG) applied to the
    normal equations ``A^T A theta = A^T y``, with A stored in CSR format on
    the GPU and all vector operations done by PyCUDA kernels.

    Args:
        SMatrix: sparse system-matrix object exposing ``values_gpu``,
            ``row_ptr_gpu``, ``col_ind_gpu``, ``ctx``, ``device`` and the
            dimensions ``N``, ``T``, ``Z``, ``X`` (presumably a
            SparseSMatrix_CSR instance, already allocated -- confirm against
            the caller).
        y: numpy array of measurements; ``y.T.flatten()`` must have
            ``N * T`` elements.
        numIterations: maximum number of CG iterations.
        isSavingEachIteration: when True, return snapshots of the estimate.
        tumor_str: label inserted in the progress-bar description.
        device: unused by this solver, kept for signature compatibility.
        max_saves: when ``numIterations`` exceeds this value, snapshots are
            taken every ``numIterations // max_saves`` iterations (plus the
            last iteration).
        denominator_threshold: unused by this solver, kept for signature
            compatibility.
        show_logs: print device information and show a tqdm progress bar.

    Returns:
        ``(list_of_(Z, X)_arrays, list_of_iteration_indices)`` when
        ``isSavingEachIteration`` is True, otherwise ``((Z, X) array, None)``.
        When the loop stops early (convergence or breakdown) the last
        estimate is appended to the snapshots so it is never lost.
        On any error the exception is printed and ``(None, None)`` is returned.
    """
    import os  # local import: only needed to locate the kernel module

    dtype = np.float32
    itemsize = np.dtype(dtype).itemsize
    block_size = 256
    block = (block_size, 1, 1)
    tolerance = 1e-12

    gpu_buffers = []    # every device allocation made here, released in `finally`
    ctx = None
    ctx_pushed = False  # pop the context only if this call pushed it

    def _alloc(nbytes):
        """Allocate device memory and register it for release in `finally`."""
        buf = drv.mem_alloc(nbytes)
        gpu_buffers.append(buf)
        return buf

    try:
        # Duck-typed check on the attributes the kernels actually need.
        if not all(hasattr(SMatrix, name) for name in ("values_gpu", "row_ptr_gpu", "col_ind_gpu")):
            raise TypeError("SMatrix must be a SparseSMatrix_CSR object")
        if SMatrix.values_gpu is None:
            raise RuntimeError("CSR matrix is not allocated on the GPU (values_gpu is None).")

        ctx = getattr(SMatrix, "ctx", None)
        if ctx:
            ctx.push()
            ctx_pushed = True

        TN = int(SMatrix.N * SMatrix.T)
        ZX = int(SMatrix.Z * SMatrix.X)
        Z = SMatrix.Z
        X = SMatrix.X
        grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
        grid_vec = ((ZX + block_size - 1) // block_size, 1, 1)

        if show_logs:
            print(f"Executing on GPU device index: {SMatrix.device.primary_context.device.name()}")
            print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")

        stream = drv.Stream()

        # Prefer the kernel file located in the parent directory of this
        # module's directory, so loading does not depend on the current
        # working directory; fall back to the original relative name.
        # NOTE(review): assumes the .cubin sits one directory above this file -- confirm.
        cubin_name = 'AOT_biomaps_kernels.cubin'
        packaged_cubin = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), cubin_name)
        mod = drv.module_from_file(packaged_cubin if os.path.isfile(packaged_cubin) else cubin_name)

        # Kernel handles, looked up once.
        projection_kernel = mod.get_function('projection_kernel__CSR')
        backprojection_kernel = mod.get_function('backprojection_kernel__CSR')
        axpby_kernel = mod.get_function("vector_axpby_kernel")
        minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
        dot_kernel = mod.get_function("dot_product_reduction_kernel")

        # Dot product: one partial sum per block on the device, final sum on
        # the host. The partial-sum buffer is allocated once and reused.
        partial_host = np.empty(grid_vec[0], dtype=dtype)
        partial_gpu = _alloc(partial_host.nbytes)

        def _dot(a_ptr, b_ptr):
            """Return <a, b> for two device vectors of length ZX."""
            dot_kernel(partial_gpu, a_ptr, b_ptr, np.int32(ZX),
                       block=block, grid=grid_vec, stream=stream)
            drv.memcpy_dtoh(partial_host, partial_gpu)
            # Accumulate in double precision to limit round-off.
            return float(np.sum(partial_host, dtype=np.float64))

        # --- Buffer allocation (raw device pointers) ---
        y_host = y.T.flatten().astype(dtype)
        if y_host.size != TN:
            raise ValueError(f"Expected y with {TN} elements, got {y_host.size}")
        y_gpu = _alloc(y_host.nbytes)
        drv.memcpy_htod_async(y_gpu, y_host, stream)

        theta0_host = np.full(ZX, 0.1, dtype=dtype)  # kept referenced during the async copy
        theta_flat_gpu = _alloc(ZX * itemsize)       # current estimate
        drv.memcpy_htod_async(theta_flat_gpu, theta0_host, stream)

        q_flat_gpu = _alloc(TN * itemsize)    # q = A p
        r_flat_gpu = _alloc(ZX * itemsize)    # r (residual of the normal equations)
        p_flat_gpu = _alloc(ZX * itemsize)    # p (search direction)
        z_flat_gpu = _alloc(ZX * itemsize)    # z = A^T A p
        ATy_flat_gpu = _alloc(ZX * itemsize)  # A^T y (constant)

        # --- CG initialization ---

        # 1. ATy = A^T y
        drv.memset_d32_async(ATy_flat_gpu, 0, ZX, stream)
        backprojection_kernel(ATy_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
                              y_gpu, np.int32(TN),
                              block=block, grid=grid_rows, stream=stream)

        # 2. q = A theta_0
        projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
                          theta_flat_gpu, np.int32(TN),
                          block=block, grid=grid_rows, stream=stream)

        # 3. r_temp = A^T q = A^T A theta_0
        drv.memset_d32_async(r_flat_gpu, 0, ZX, stream)
        backprojection_kernel(r_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
                              q_flat_gpu, np.int32(TN),
                              block=block, grid=grid_rows, stream=stream)

        # 4. r_0 = ATy - r_temp
        axpby_kernel(r_flat_gpu, ATy_flat_gpu, r_flat_gpu,
                     np.float32(1.0), np.float32(-1.0), np.int32(ZX),
                     block=block, grid=grid_vec, stream=stream)

        # 5. p_0 = r_0 (wait for the stream before the synchronous copy)
        stream.synchronize()
        drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * itemsize)

        # 6. rho_prev = ||r_0||^2
        rho_prev = _dot(r_flat_gpu, r_flat_gpu)

        # --- Iterative loop ---
        saved_theta, saved_indices = [], []
        if numIterations <= max_saves:
            save_indices = set(range(numIterations))
        else:
            save_indices = set(range(0, numIterations, max(1, numIterations // max(1, max_saves))))
            save_indices.add(numIterations - 1)

        description = f"AOT-BioMaps -- LS-CG (CSR-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)

        last_update_it = -1  # last iteration at which theta was modified
        for it in iterator:
            # a. q = A p
            projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
                              p_flat_gpu, np.int32(TN),
                              block=block, grid=grid_rows, stream=stream)

            # b. z = A^T q = A^T A p
            drv.memset_d32_async(z_flat_gpu, 0, ZX, stream)
            backprojection_kernel(z_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
                                  q_flat_gpu, np.int32(TN),
                                  block=block, grid=grid_rows, stream=stream)

            # c. alpha = rho_prev / <p, z>
            pAp = _dot(p_flat_gpu, z_flat_gpu)
            if abs(pAp) < 1e-15:
                break  # breakdown: the direction carries no energy
            alpha = np.float32(rho_prev / pAp)

            # d. theta = theta + alpha * p
            axpby_kernel(theta_flat_gpu, theta_flat_gpu, p_flat_gpu,
                         np.float32(1.0), alpha, np.int32(ZX),
                         block=block, grid=grid_vec, stream=stream)
            last_update_it = it

            # e. r = r - alpha * z
            minus_axpy_kernel(r_flat_gpu, z_flat_gpu, alpha, np.int32(ZX),
                              block=block, grid=grid_vec, stream=stream)

            # f. rho_curr = ||r||^2
            rho_curr = _dot(r_flat_gpu, r_flat_gpu)
            if rho_curr < tolerance:
                break  # converged

            # g. beta = rho_curr / rho_prev
            beta = np.float32(rho_curr / rho_prev)

            # h. p = r + beta * p
            axpby_kernel(p_flat_gpu, r_flat_gpu, p_flat_gpu,
                         np.float32(1.0), beta, np.int32(ZX),
                         block=block, grid=grid_vec, stream=stream)

            rho_prev = rho_curr

            if isSavingEachIteration and it in save_indices:
                stream.synchronize()
                theta_host = np.empty(ZX, dtype=dtype)
                drv.memcpy_dtoh(theta_host, theta_flat_gpu)
                saved_theta.append(theta_host.reshape(Z, X))
                saved_indices.append(it)

        drv.Context.synchronize()

        final_result = np.empty(ZX, dtype=dtype)
        drv.memcpy_dtoh(final_result, theta_flat_gpu)
        final_result = final_result.reshape(Z, X)

        if isSavingEachIteration:
            # After an early exit the last estimate was not stored by the
            # loop; append it so the history always ends with the result.
            if not saved_indices or saved_indices[-1] != last_update_it:
                saved_theta.append(final_result.copy())
                saved_indices.append(max(last_update_it, 0))
            return saved_theta, saved_indices
        return final_result, None

    except Exception as e:
        print(f"Error in LS_CG_sparseCSR_pycuda: {type(e).__name__}: {e}")
        gc.collect()
        return None, None

    finally:
        # Release device memory on success and on failure alike. Cleanup is
        # best-effort: a failing free must not mask the original outcome.
        for buf in gpu_buffers:
            try:
                buf.free()
            except Exception as free_error:
                if show_logs:
                    print(f"Warning: could not free a GPU buffer: {type(free_error).__name__}: {free_error}")
        if ctx_pushed:
            ctx.pop()
|
|
318
|
+
|
|
319
|
+
def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
    """
    Least-squares reconstruction by conjugate gradient (CG) applied to the
    normal equations ``A^T A theta = A^T y``, with A stored in SELL-C-sigma
    format on the GPU and all vector operations done by PyCUDA kernels.

    Args:
        SMatrix: sparse system-matrix object exposing ``sell_values_gpu``,
            ``sell_colinds_gpu``, ``slice_ptr_gpu``, ``slice_len_gpu``,
            ``slice_height``, ``sparse_mod``, ``ctx`` and the dimensions
            ``N``, ``T``, ``Z``, ``X`` (presumably a SparseSMatrix_SELL
            instance, already built -- confirm against the caller).
        y: numpy array of measurements; ``y.T.flatten()`` must have
            ``N * T`` elements.
        numIterations: maximum number of CG iterations.
        isSavingEachIteration: when True, return snapshots of the estimate.
        tumor_str: label inserted in the progress-bar description.
        device: unused by this solver, kept for signature compatibility.
        max_saves: when ``numIterations`` exceeds this value, snapshots are
            taken every ``numIterations // max_saves`` iterations (plus the
            last iteration).
        denominator_threshold: unused by this solver, kept for signature
            compatibility.
        show_logs: show a tqdm progress bar when True.

    Returns:
        ``(list_of_(Z, X)_arrays, list_of_iteration_indices)`` when
        ``isSavingEachIteration`` is True, otherwise ``((Z, X) array, None)``.
        When the loop stops early (convergence or breakdown) the last
        estimate is appended to the snapshots so it is never lost.
        On any error the exception is printed and ``(None, None)`` is returned.
    """
    dtype = np.float32
    itemsize = np.dtype(dtype).itemsize
    block_size = 256
    block = (block_size, 1, 1)
    tolerance = 1e-12

    gpu_buffers = []    # every device allocation made here, released in `finally`
    ctx = None
    ctx_pushed = False  # pop the context only if this call pushed it

    def _alloc(nbytes):
        """Allocate device memory and register it for release in `finally`."""
        buf = drv.mem_alloc(nbytes)
        gpu_buffers.append(buf)
        return buf

    try:
        # Duck-typed check on the attributes the kernels actually need.
        required = ("sell_values_gpu", "sell_colinds_gpu", "slice_ptr_gpu",
                    "slice_len_gpu", "slice_height", "sparse_mod")
        if not all(hasattr(SMatrix, name) for name in required):
            raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
        if SMatrix.sell_values_gpu is None:
            raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")

        ctx = getattr(SMatrix, "ctx", None)
        if ctx:
            ctx.push()
            ctx_pushed = True

        TN = int(SMatrix.N * SMatrix.T)
        ZX = int(SMatrix.Z * SMatrix.X)
        Z = SMatrix.Z
        X = SMatrix.X
        grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
        grid_vec = ((ZX + block_size - 1) // block_size, 1, 1)

        # Kernel handles from the module held by the matrix object.
        mod = SMatrix.sparse_mod
        projection_kernel = mod.get_function("projection_kernel__SELL")
        backprojection_kernel = mod.get_function("backprojection_kernel__SELL")
        axpby_kernel = mod.get_function("vector_axpby_kernel")
        minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
        dot_kernel = mod.get_function("dot_product_reduction_kernel")
        slice_height = np.int32(SMatrix.slice_height)

        stream = drv.Stream()

        # Dot product: one partial sum per block on the device, final sum on
        # the host. The partial-sum buffer is allocated once and reused.
        partial_host = np.empty(grid_vec[0], dtype=dtype)
        partial_gpu = _alloc(partial_host.nbytes)

        def _dot(a_ptr, b_ptr):
            """Return <a, b> for two device vectors of length ZX."""
            dot_kernel(partial_gpu, a_ptr, b_ptr, np.int32(ZX),
                       block=block, grid=grid_vec, stream=stream)
            drv.memcpy_dtoh(partial_host, partial_gpu)
            # Accumulate in double precision to limit round-off.
            return float(np.sum(partial_host, dtype=np.float64))

        # --- Buffer allocation ---
        y_host = y.T.flatten().astype(dtype)
        if y_host.size != TN:
            raise ValueError(f"Expected y with {TN} elements, got {y_host.size}")
        y_gpu = _alloc(y_host.nbytes)
        drv.memcpy_htod_async(y_gpu, y_host, stream)

        theta0_host = np.full(ZX, 0.1, dtype=dtype)  # kept referenced during the async copy
        theta_flat_gpu = _alloc(ZX * itemsize)       # current estimate
        drv.memcpy_htod_async(theta_flat_gpu, theta0_host, stream)

        q_flat_gpu = _alloc(TN * itemsize)    # q = A p
        r_flat_gpu = _alloc(ZX * itemsize)    # r (residual of the normal equations)
        p_flat_gpu = _alloc(ZX * itemsize)    # p (search direction)
        z_flat_gpu = _alloc(ZX * itemsize)    # z = A^T A p
        ATy_flat_gpu = _alloc(ZX * itemsize)  # A^T y (constant)

        # --- CG initialization ---

        # 1. ATy = A^T y
        drv.memset_d32_async(ATy_flat_gpu, 0, ZX, stream)
        backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
                              y_gpu, ATy_flat_gpu, np.int32(TN), slice_height,
                              block=block, grid=grid_rows, stream=stream)

        # 2. q = A theta_0
        projection_kernel(q_flat_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
                          theta_flat_gpu, np.int32(TN), slice_height,
                          block=block, grid=grid_rows, stream=stream)

        # 3. r_temp = A^T q = A^T A theta_0
        drv.memset_d32_async(r_flat_gpu, 0, ZX, stream)
        backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
                              q_flat_gpu, r_flat_gpu, np.int32(TN), slice_height,
                              block=block, grid=grid_rows, stream=stream)

        # 4. r_0 = ATy - r_temp
        axpby_kernel(r_flat_gpu, ATy_flat_gpu, r_flat_gpu,
                     np.float32(1.0), np.float32(-1.0), np.int32(ZX),
                     block=block, grid=grid_vec, stream=stream)

        # 5. p_0 = r_0 (wait for the stream before the synchronous copy)
        stream.synchronize()
        drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * itemsize)

        # 6. rho_prev = ||r_0||^2
        rho_prev = _dot(r_flat_gpu, r_flat_gpu)

        # --- Iterative loop ---
        saved_theta, saved_indices = [], []
        if numIterations <= max_saves:
            save_indices = set(range(numIterations))
        else:
            save_indices = set(range(0, numIterations, max(1, numIterations // max(1, max_saves))))
            save_indices.add(numIterations - 1)

        description = f"AOT-BioMaps -- LS-CG (SELL-c-σ-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)

        last_update_it = -1  # last iteration at which theta was modified
        for it in iterator:
            # a. q = A p
            projection_kernel(q_flat_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
                              p_flat_gpu, np.int32(TN), slice_height,
                              block=block, grid=grid_rows, stream=stream)

            # b. z = A^T q = A^T A p
            drv.memset_d32_async(z_flat_gpu, 0, ZX, stream)
            backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
                                  q_flat_gpu, z_flat_gpu, np.int32(TN), slice_height,
                                  block=block, grid=grid_rows, stream=stream)

            # c. alpha = rho_prev / <p, z>
            pAp = _dot(p_flat_gpu, z_flat_gpu)
            if abs(pAp) < 1e-15:
                break  # breakdown: the direction carries no energy
            alpha = np.float32(rho_prev / pAp)

            # d. theta = theta + alpha * p
            axpby_kernel(theta_flat_gpu, theta_flat_gpu, p_flat_gpu,
                         np.float32(1.0), alpha, np.int32(ZX),
                         block=block, grid=grid_vec, stream=stream)
            last_update_it = it

            # e. r = r - alpha * z
            minus_axpy_kernel(r_flat_gpu, z_flat_gpu, alpha, np.int32(ZX),
                              block=block, grid=grid_vec, stream=stream)

            # f. rho_curr = ||r||^2
            rho_curr = _dot(r_flat_gpu, r_flat_gpu)
            if rho_curr < tolerance:
                break  # converged

            # g. beta = rho_curr / rho_prev
            beta = np.float32(rho_curr / rho_prev)

            # h. p = r + beta * p
            axpby_kernel(p_flat_gpu, r_flat_gpu, p_flat_gpu,
                         np.float32(1.0), beta, np.int32(ZX),
                         block=block, grid=grid_vec, stream=stream)

            rho_prev = rho_curr

            stream.synchronize()
            if isSavingEachIteration and it in save_indices:
                snapshot = np.empty(ZX, dtype=dtype)
                drv.memcpy_dtoh(snapshot, theta_flat_gpu)
                saved_theta.append(snapshot.reshape((Z, X)))
                saved_indices.append(it)

        # Final copy of the estimate back to the host.
        stream.synchronize()
        res = np.empty(ZX, dtype=np.float32)
        drv.memcpy_dtoh(res, theta_flat_gpu)
        final_result = res.reshape((Z, X))

        if isSavingEachIteration:
            # After an early exit the last estimate was not stored by the
            # loop; append it so the history always ends with the result.
            if not saved_indices or saved_indices[-1] != last_update_it:
                saved_theta.append(final_result.copy())
                saved_indices.append(max(last_update_it, 0))
            return saved_theta, saved_indices
        return final_result, None

    except Exception as e:
        print(f"Error in LS_CG_sparseSELL_pycuda: {type(e).__name__}: {e}")
        gc.collect()
        return None, None

    finally:
        # Release device memory on success and on failure alike. Cleanup is
        # best-effort: a failing free must not mask the original outcome.
        for buf in gpu_buffers:
            try:
                buf.free()
            except Exception as free_error:
                if show_logs:
                    print(f"Warning: could not free a GPU buffer: {type(free_error).__name__}: {free_error}")
        if ctx_pushed:
            ctx.pop()
|