AOT-biomaps 2.9.138-py3-none-any.whl → 2.9.279-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of AOT-biomaps might be problematic.

Files changed (31)
  1. AOT_biomaps/AOT_Acoustic/AcousticTools.py +35 -115
  2. AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
  3. AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +22 -18
  4. AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
  5. AOT_biomaps/AOT_Experiment/_mainExperiment.py +102 -68
  6. AOT_biomaps/AOT_Optic/_mainOptic.py +124 -58
  7. AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +72 -108
  8. AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +474 -289
  9. AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +173 -68
  10. AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +360 -154
  11. AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +150 -111
  12. AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
  13. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +281 -0
  14. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +328 -0
  15. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
  16. AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
  17. AOT_biomaps/AOT_Recon/AlgebraicRecon.py +359 -238
  18. AOT_biomaps/AOT_Recon/AnalyticRecon.py +29 -41
  19. AOT_biomaps/AOT_Recon/BayesianRecon.py +165 -91
  20. AOT_biomaps/AOT_Recon/DeepLearningRecon.py +4 -1
  21. AOT_biomaps/AOT_Recon/PrimalDualRecon.py +175 -31
  22. AOT_biomaps/AOT_Recon/ReconEnums.py +38 -3
  23. AOT_biomaps/AOT_Recon/ReconTools.py +184 -77
  24. AOT_biomaps/AOT_Recon/__init__.py +1 -0
  25. AOT_biomaps/AOT_Recon/_mainRecon.py +144 -74
  26. AOT_biomaps/__init__.py +4 -36
  27. {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/METADATA +2 -1
  28. aot_biomaps-2.9.279.dist-info/RECORD +47 -0
  29. aot_biomaps-2.9.138.dist-info/RECORD +0 -43
  30. {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/WHEEL +0 -0
  31. {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/top_level.txt +0 -0
@@ -1,314 +1,499 @@
  from AOT_biomaps.Config import config
- import numba
+ from AOT_biomaps.AOT_Recon.ReconTools import calculate_memory_requirement, check_gpu_memory
+ from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType
+
  import torch
  import numpy as np
- import os
  from tqdm import trange
+ import pycuda.driver as drv
+ import torch.cuda
+ import gc
+
+
+
+ def LS(
+     SMatrix,
+     y,
+     numIterations=100,
+     isSavingEachIteration=True,
+     withTumor=True,
+     alpha=1e-1,
+     device=None,
+     use_numba=False,
+     denominator_threshold=1e-6,
+     max_saves=5000,
+     show_logs=True,
+     smatrixType=SMatrixType.SELL,
+     Z=350,
+ ):
+     """
+     Least Squares reconstruction using Projected Gradient Descent (PGD) with non-negativity constraint.
+     Currently only implements the stable GPU version.
+     """
+     tumor_str = "WITH" if withTumor else "WITHOUT"
+     # Auto-select device and method
+     if device is None:
+         if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+             device = torch.device(f"cuda:{config.select_best_gpu()}")
+             use_gpu = True
+         else:
+             device = torch.device("cpu")
+             use_gpu = False
+     else:
+         use_gpu = device.type == "cuda"
+     # Dispatch to the appropriate implementation
+     if use_gpu:
+         if smatrixType == SMatrixType.CSR:
+             return _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, Z, show_logs)
+         elif smatrixType == SMatrixType.SELL:
+             return _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+         elif smatrixType == SMatrixType.DENSE:
+             return _LS_GPU_stable(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+         else:
+             raise ValueError("Unsupported SMatrixType for GPU LS.")
+     else:
+         raise NotImplementedError("Only GPU implementations are currently available for LS.")

- def _LS_GPU_basic(SMatrix, y, numIterations, isSavingEachIteration=True, withTumor=True):
+ def _LS_GPU_stable(SMatrix, y, numIterations, alpha, isSavingEachIteration, tumor_str, max_saves=5000, show_logs=True):
+     """
+     Stable GPU implementation of LS using projected gradient descent with diagonal preconditioner.
+     """
      device = torch.device(f"cuda:{config.select_best_gpu()}")
      T, Z, X, N = SMatrix.shape
      ZX = Z * X
      TN = T * N
-     if y.shape != (T, N):
-         raise ValueError(f"Expected y shape: ({T}, {N}), got {y.shape}")
-
+     # 1. Conversion and normalization
      A_flat = torch.from_numpy(SMatrix).to(device=device, dtype=torch.float32).permute(0, 3, 1, 2).reshape(TN, ZX)
      y_flat = torch.from_numpy(y).to(device=device, dtype=torch.float32).reshape(TN)
-
-     # Uniform initialization (important!)
-     theta_flat = torch.ones(ZX, dtype=torch.float32, device=device) / (Z * X)
-
-     saved_theta = []
-     saved_indices = []
-     if isSavingEachIteration:
-         saved_theta.append(theta_flat.reshape(Z, X).clone())
-         saved_indices.append(0)
-     step = max(1, (numIterations - 1) // 999)
-     save_count = 1
-
-     # Normalization of A (per column) and of y (by its max)
-     col_norms = torch.norm(A_flat, dim=0, keepdim=True)
-     A_normalized = A_flat / (col_norms + 1e-8)
-     y_normalized = y_flat / (torch.max(y_flat) + 1e-8)  # Normalizes y between 0 and ~1
-
-     description = f"AOT-BioMaps -- LS Reconstruction ---- {'WITH' if withTumor else 'WITHOUT'} TUMOR ---- GPU {torch.cuda.current_device()}"
-
-     with torch.no_grad():
-         for k in trange(numIterations, desc=description):
-             r = y_normalized - A_normalized @ theta_flat
-             p = A_normalized.T @ r
-             rsold = torch.dot(r, r)
-
-             for _ in range(2):
-                 Ap = A_normalized @ p
-                 alpha = rsold / (torch.dot(p, A_normalized.T @ Ap) + 1e-8)
-                 theta_flat += alpha * p
-                 theta_flat = torch.clamp(theta_flat, min=0)  # Projection onto R+
-                 r -= alpha * Ap
-                 rsnew = torch.dot(r, r)
-                 if rsnew < 1e-8:
-                     break
-                 p = A_normalized.T @ r + (rsnew / rsold) * p
-                 rsold = rsnew
-
-             if isSavingEachIteration and (k % step == 0 or k == numIterations - 1):
-                 # Normalize between 0 and 1 before saving
-                 theta_normalized = theta_flat.clone()
-                 if torch.max(theta_normalized) > 0:
-                     theta_normalized = theta_normalized / torch.max(theta_normalized)
-                 saved_theta.append(theta_normalized.reshape(Z, X).clone())
-                 saved_indices.append(k + 1)
-                 save_count += 1
-                 if save_count >= 1000:
-                     break
-
-     # Final normalization between 0 and 1
-     if torch.max(theta_flat) > 0:
-         theta_flat = theta_flat / torch.max(theta_flat)
-
-     del A_flat, y_flat, A_normalized, y_normalized
+     norm_A = A_flat.max()
+     norm_y = y_flat.max()
+     A_flat.div_(norm_A + 1e-8)
+     y_flat.div_(norm_y + 1e-8)
+     # 2. Initialization
+     lambda_k = torch.zeros(ZX, device=device)
+     lambda_history = [] if isSavingEachIteration else None
+     saved_indices = []  # Stores the indices of the saved iterations
+
+     # Calculate save indices
+     if numIterations <= max_saves:
+         save_indices = list(range(numIterations))
+     else:
+         step = numIterations // max_saves
+         save_indices = list(range(0, numIterations, step))
+         if save_indices[-1] != numIterations - 1:
+             save_indices.append(numIterations - 1)
+
+     # Diagonal preconditioner
+     diag_AAT = torch.sum(A_flat ** 2, dim=0)
+     M_inv = 1.0 / torch.clamp(diag_AAT, min=1e-6)
+     # Pre-allocate tensors
+     r_k = torch.empty_like(y_flat)
+     AT_r = torch.empty(ZX, device=device)
+     description = f"AOT-BioMaps -- Stable LS Reconstruction ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+
+     iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+     for it in iterator:
+         # Residual computation (in place)
+         torch.matmul(A_flat, lambda_k, out=r_k)
+         r_k = y_flat - r_k
+         if isSavingEachIteration and it in save_indices:
+             lambda_history.append(lambda_k.clone().reshape(Z, X) * (norm_y / norm_A))
+             saved_indices.append(it)
+
+         # Preconditioned gradient (in place)
+         torch.matmul(A_flat.T, r_k, out=AT_r)
+         AT_r *= M_inv
+         # Update with fixed step size and projection (in place)
+         lambda_k.add_(AT_r, alpha=alpha)
+         lambda_k.clamp_(min=0)
+
+     # 3. Denormalization
+     lambda_final = lambda_k.reshape(Z, X) * (norm_y / norm_A)
+     # Free memory
+     del A_flat, y_flat, r_k, AT_r
      torch.cuda.empty_cache()
-
      if isSavingEachIteration:
-         return [theta.cpu().numpy() for theta in saved_theta], saved_indices
+         return [t.cpu().numpy() for t in lambda_history], saved_indices
      else:
-         return theta_flat.reshape(Z, X).cpu().numpy(), None
+         return lambda_final.cpu().numpy(), None
+
+ def _LS_GPU_opti(*args, **kwargs):
+     raise NotImplementedError("Only _LS_GPU_stable is implemented for now.")
+
+ def _LS_GPU_multi(*args, **kwargs):
+     raise NotImplementedError("Only _LS_GPU_stable is implemented for now.")
+
+ def _LS_CPU_opti(*args, **kwargs):
+     raise NotImplementedError("Only _LS_GPU_stable is implemented for now.")
+
+ def _LS_CPU_basic(*args, **kwargs):
+     raise NotImplementedError("Only _LS_GPU_stable is implemented for now.")
+
+ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+     """
+     Least Squares (LS) reconstruction via Conjugate Gradient (CG) on the CSR format.
+     Uses the same arguments as the MLEM function, without Python sub-functions.

- def _LS_CPU_basic(SMatrix, y, numIterations, isSavingEachIteration, withTumor):
-     try:
-         T, Z, X, N = SMatrix.shape
-         theta_p = np.ones((Z, X))
-         saved_theta = []
-         saved_indices = []
-         if isSavingEachIteration:
-             saved_theta.append(theta_p.copy())
-             saved_indices.append(0)
-         step = max(1, (numIterations - 1) // 999)
-         save_count = 1
-         description = f"AOT-BioMaps -- LS Reconstruction ---- {'WITH' if withTumor else 'WITHOUT'} TUMOR ---- CPU (basic) ----"
-         for k in trange(numIterations, desc=description):
-             ATA = np.zeros((Z, X, Z, X))
-             ATy = np.zeros((Z, X))
-             for _t in range(T):
-                 for _n in range(N):
-                     ATA += np.einsum('ij,kl->ijkl', SMatrix[_t, :, :, _n], SMatrix[_t, :, :, _n])
-                     ATy += SMatrix[_t, :, :, _n] * y[_t, _n]
-             theta_p = np.linalg.solve(ATA.reshape(Z*X, Z*X), ATy.reshape(Z*X)).reshape(Z, X)
-             if isSavingEachIteration and (k % step == 0 or k == numIterations - 1):
-                 saved_theta.append(theta_p.copy())
-                 saved_indices.append(k + 1)
-                 save_count += 1
-                 if save_count >= 1000:
-                     break
-         if isSavingEachIteration:
-             return saved_theta, saved_indices
-         else:
-             return theta_p, None
-     except Exception as e:
-         print("Error in basic CPU LS:", type(e).__name__, ":", e)
-         return None, None
+     SMatrix: SparseSMatrix_CSR instance (already allocated)
+     y: measured data (1D np.float32 of size TN)
+     """
+     final_result = None
+
+     # Parameters not used by CG but kept for the shared signature: denominator_threshold, device
+
+     # --- Dot product logic (inlined) ---
+     def _dot_product_gpu(mod, a_ptr, b_ptr, N_int, stream):
+         block_size = 256
+         grid_size = (N_int + block_size - 1) // block_size
+
+         reduction_host = np.empty(grid_size, dtype=np.float32)
+         reduction_buffer = drv.mem_alloc(reduction_host.nbytes)
+
+         dot_kernel = mod.get_function("dot_product_reduction_kernel")
+
+         dot_kernel(reduction_buffer, a_ptr, b_ptr, np.int32(N_int),
+                    block=(block_size, 1, 1), grid=(grid_size, 1, 1), stream=stream)
+
+         drv.memcpy_dtoh(reduction_host, reduction_buffer)
+         total_dot = np.sum(reduction_host)
+
+         reduction_buffer.free()
+         return total_dot
+     # -----------------------------------------------

- def _LS_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, withTumor):
      try:
-         T, Z, X, N = SMatrix.shape
-         A_flat = SMatrix.astype(np.float32).transpose(0, 3, 1, 2).reshape(T*N, Z*X)
-         y_flat = y.astype(np.float32).reshape(-1)
-         theta_flat = np.zeros(Z*X, dtype=np.float32)
-         saved_theta = []
-         saved_indices = []
-         if isSavingEachIteration:
-             saved_theta.append(theta_flat.reshape(Z, X).copy())
-             saved_indices.append(0)
-         step = max(1, (numIterations - 1) // 999)
-         save_count = 1
-         A_normalized = A_flat / (np.linalg.norm(A_flat, axis=0, keepdims=True) + 1e-8)
-         y_normalized = y_flat / (np.linalg.norm(y_flat) + 1e-8)
-         description = f"AOT-BioMaps -- LS Reconstruction ---- {'WITH' if withTumor else 'WITHOUT'} TUMOR ---- CPU (optimized) ----"
-         for k in trange(numIterations, desc=description):
-             ATA = A_normalized.T @ A_normalized
-             ATy = A_normalized.T @ y_normalized
-             theta_flat = np.linalg.lstsq(ATA, ATy, rcond=None)[0]
-             if isSavingEachIteration and (k % step == 0 or k == numIterations - 1):
-                 saved_theta.append(theta_flat.reshape(Z, X).copy())
-                 saved_indices.append(k + 1)
-                 save_count += 1
-                 if save_count >= 1000:
-                     break
-         if isSavingEachIteration:
-             return saved_theta, saved_indices
+         if not isinstance(SMatrix, SMatrix.__class__):
+             raise TypeError("SMatrix must be a SparseSMatrix_CSR object")
+
+         if SMatrix.ctx:
+             SMatrix.ctx.push()
+
+         dtype = np.float32
+         TN = SMatrix.N * SMatrix.T
+         ZX = SMatrix.Z * SMatrix.X
+         Z = SMatrix.Z
+         X = SMatrix.X
+         block_size = 256
+         tolerance = 1e-12
+
+         if show_logs:
+             print(f"Executing on GPU device index: {SMatrix.device.primary_context.device.name()}")
+             print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")
+
+         stream = drv.Stream()
+         mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
+
+         # Fetch the kernels
+         projection_kernel = mod.get_function('projection_kernel__CSR')
+         backprojection_kernel = mod.get_function('backprojection_kernel__CSR')
+         axpby_kernel = mod.get_function("vector_axpby_kernel")
+         minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
+
+         # --- Buffer allocation (raw pointers) ---
+         y = y.T.flatten().astype(dtype)
+         y_gpu = drv.mem_alloc(y.nbytes)
+         drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+         theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # lambda
+         drv.memcpy_htod_async(theta_flat_gpu, np.full(ZX, 0.1, dtype=dtype), stream)
+
+         q_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)  # q = A*p
+         r_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # r (residue)
+         p_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # p (direction)
+         z_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # z = A^T A p
+         ATy_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # A^T y (constant)
+
+         # --- CG initialization ---
+
+         # 1. ATy = A^T * y
+         drv.memset_d32_async(ATy_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(ATy_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               y_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 2. q = A * theta_0
+         projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                           theta_flat_gpu, np.int32(TN),
+                           block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 3. r_temp = A^T * q = A^T A theta_0
+         drv.memset_d32_async(r_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(r_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               q_flat_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 4. r_0 = ATy - r_temp (r = ATy + (-1)*r_temp)
+         axpby_kernel(r_flat_gpu, ATy_flat_gpu, r_flat_gpu,
+                      np.float32(1.0), np.float32(-1.0), np.int32(ZX),
+                      block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 5. p_0 = r_0
+         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
+
+         # 6. rho_prev = ||r_0||^2
+         rho_prev = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+         # --- Iterative loop ---
+         saved_theta, saved_indices = [], []
+         if numIterations <= max_saves:
+             save_indices = list(range(numIterations))
          else:
-             return theta_flat.reshape(Z, X), None
+             save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+             if save_indices[-1] != numIterations - 1:
+                 save_indices.append(numIterations - 1)
+
+         description = f"AOT-BioMaps -- LS-CG (CSR-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+         for it in iterator:
+             # a. q = A * p
+             projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               p_flat_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # b. z = A^T * q = A^T A p
+             drv.memset_d32_async(z_flat_gpu, 0, ZX, stream)
+             backprojection_kernel(z_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                                   q_flat_gpu, np.int32(TN),
+                                   block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # c. alpha = rho_prev / <p, z>
+             pAp = _dot_product_gpu(mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+
+             if abs(pAp) < 1e-15: break
+             alpha = rho_prev / pAp
+
+             # d. theta = theta + alpha * p
+             axpby_kernel(theta_flat_gpu, theta_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), alpha, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # e. r = r - alpha * z
+             minus_axpy_kernel(r_flat_gpu, z_flat_gpu, alpha, np.int32(ZX),
+                               block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # f. rho_curr = ||r||^2
+             rho_curr = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+             if rho_curr < tolerance: break
+
+             # g. beta = rho_curr / rho_prev
+             beta = rho_curr / rho_prev
+
+             # h. p = r + beta * p
+             axpby_kernel(p_flat_gpu, r_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), beta, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             rho_prev = rho_curr
+
+             if show_logs and (it % 10 == 0 or it == numIterations - 1):
+                 drv.Context.synchronize()
+
+             if isSavingEachIteration and it in save_indices:
+                 theta_host = np.empty(ZX, dtype=dtype)
+                 drv.memcpy_dtoh(theta_host, theta_flat_gpu)
+                 saved_theta.append(theta_host.reshape(Z, X))
+                 saved_indices.append(it)
+
+         drv.Context.synchronize()
+
+         final_result = np.empty(ZX, dtype=dtype)
+         drv.memcpy_dtoh(final_result, theta_flat_gpu)
+         final_result = final_result.reshape(Z, X)
+
+         # Free the buffers
+         y_gpu.free(); q_flat_gpu.free(); r_flat_gpu.free(); p_flat_gpu.free(); z_flat_gpu.free(); theta_flat_gpu.free(); ATy_flat_gpu.free()
+
+         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
      except Exception as e:
-         print("Error in optimized CPU LS:", type(e).__name__, ":", e)
+         print(f"Error in LS_CG_sparseCSR_pycuda: {type(e).__name__}: {e}")
+         gc.collect()
          return None, None
+
+     finally:
+         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+             SMatrix.ctx.pop()
+
+ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+     """
+     Least Squares (LS) reconstruction via Conjugate Gradient (CG) on the SELL-C-sigma format.
+     Uses the same arguments as the MLEM function, without Python sub-functions.
+
+     SMatrix: SparseSMatrix_SELL instance (already allocated)
+     y: measured data (1D np.float32 of size TN)
+     """
+     final_result = None
+
+     # --- Dot product logic (inlined) ---
+     def _dot_product_gpu(mod, a_ptr, b_ptr, N_int, stream):
+         block_size = 256
+         grid_size = (N_int + block_size - 1) // block_size
+
+         reduction_host = np.empty(grid_size, dtype=np.float32)
+         reduction_buffer = drv.mem_alloc(reduction_host.nbytes)
+
+         dot_kernel = mod.get_function("dot_product_reduction_kernel")
+
+         dot_kernel(reduction_buffer, a_ptr, b_ptr, np.int32(N_int),
+                    block=(block_size, 1, 1), grid=(grid_size, 1, 1), stream=stream)
+
+         drv.memcpy_dtoh(reduction_host, reduction_buffer)
+         total_dot = np.sum(reduction_host)
+
+         reduction_buffer.free()
+         return total_dot
+     # -----------------------------------------------

- def _LS_GPU_multi(SMatrix, y, numIterations, isSavingEachIteration, withTumor):
      try:
-         num_gpus = torch.cuda.device_count()
-         device = torch.device('cuda:0')
-         T, Z, X, N = SMatrix.shape
-         A_matrix_torch = torch.tensor(SMatrix, dtype=torch.float32).to(device).permute(0, 3, 1, 2).reshape(T*N, Z*X)
-         y_torch = torch.tensor(y, dtype=torch.float32).to(device).reshape(-1)
-         saved_theta = []
-         saved_indices = []
-         if isSavingEachIteration:
-             saved_theta.append(torch.zeros(Z, X, device=device).cpu().numpy())
-             saved_indices.append(0)
-         step = max(1, (numIterations - 1) // 999)
-         save_count = 1
-         A_split = torch.chunk(A_matrix_torch, num_gpus, dim=0)
-         y_split = torch.chunk(y_torch, num_gpus)
-         theta_0 = torch.zeros(Z*X, dtype=torch.float32, device=device)
-         theta_list = [theta_0.clone().to(device) for _ in range(num_gpus)]
-         description = f"AOT-BioMaps -- LS Reconstruction ---- {'WITH' if withTumor else 'WITHOUT'} TUMOR ---- multi-GPU ----"
-         for k in trange(numIterations, desc=description):
-             for i in range(num_gpus):
-                 with torch.cuda.device(f'cuda:{i}'):
-                     A_i = A_split[i].to(f'cuda:{i}')
-                     y_i = y_split[i].to(f'cuda:{i}')
-                     theta_p = theta_list[i].to(f'cuda:{i}')
-                     r = y_i - A_i @ theta_p
-                     p = r.clone()
-                     rsold = torch.dot(r, r)
-                     for _ in range(2):
-                         Ap = A_i @ p
-                         alpha = rsold / (torch.dot(p, Ap) + 1e-8)
-                         theta_p += alpha * p
-                         r -= alpha * Ap
-                         rsnew = torch.dot(r, r)
-                         if rsnew < 1e-8:
-                             break
-                         p = r + (rsnew / rsold) * p
-                         rsold = rsnew
-                     theta_list[i] = theta_p.to('cuda:0')
-             if isSavingEachIteration and (k % step == 0 or k == numIterations - 1):
-                 saved_theta.append(torch.stack(theta_list).mean(dim=0).reshape(Z, X).cpu().numpy())
-                 saved_indices.append(k + 1)
-                 save_count += 1
-                 if save_count >= 1000:
-                     break
-         del A_matrix_torch, y_torch, A_split, y_split, theta_0
-         torch.cuda.empty_cache()
-         for i in range(num_gpus):
-             torch.cuda.empty_cache()
-         if isSavingEachIteration:
-             return saved_theta, saved_indices
+         if not isinstance(SMatrix, SMatrix.__class__):
+             raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
+         if SMatrix.sell_values_gpu is None:
+             raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")
+
+         if SMatrix.ctx:
+             SMatrix.ctx.push()
+
+         dtype = np.float32
+         TN = int(SMatrix.N * SMatrix.T)
+         ZX = int(SMatrix.Z * SMatrix.X)
+         Z = SMatrix.Z
+         X = SMatrix.X
+         block_size = 256
+         tolerance = 1e-12
+
+         # Access the SELL parameters
+         mod = SMatrix.sparse_mod
+         projection_kernel = mod.get_function("projection_kernel__SELL")
+         backprojection_kernel = mod.get_function("backprojection_kernel__SELL")
+         axpby_kernel = mod.get_function("vector_axpby_kernel")
+         minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
+         slice_height = np.int32(SMatrix.slice_height)
+         grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
+
+         stream = drv.Stream()
+
+         # Buffer allocation
+         y = y.T.flatten().astype(dtype)
+         y_gpu = drv.mem_alloc(y.nbytes)
+         drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+         theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         drv.memcpy_htod_async(theta_flat_gpu, np.full(ZX, 0.1, dtype=dtype), stream)
+
+         q_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+         r_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         p_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         z_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         ATy_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+
+         # --- CG initialization ---
+
+         # 1. ATy = A^T * y
+         drv.memset_d32_async(ATy_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               y_gpu, ATy_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 2. q = A * theta_0
+         projection_kernel(q_flat_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                           theta_flat_gpu, np.int32(TN), slice_height,
+                           block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 3. r_temp = A^T * q = A^T A theta_0
+         drv.memset_d32_async(r_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               q_flat_gpu, r_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 4. r_0 = ATy - r_temp
+         axpby_kernel(r_flat_gpu, ATy_flat_gpu, r_flat_gpu,
+                      np.float32(1.0), np.float32(-1.0), np.int32(ZX),
+                      block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 5. p_0 = r_0
+         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
+
+         # 6. rho_prev = ||r_0||^2
+         rho_prev = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+         # --- Iterative loop ---
+         saved_theta, saved_indices = [], []
+         if numIterations <= max_saves:
+             save_indices = list(range(numIterations))
          else:
-             return torch.stack(theta_list).mean(dim=0).reshape(Z, X).cpu().numpy(), None
+             save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+             if save_indices[-1] != numIterations - 1:
+                 save_indices.append(numIterations - 1)
+
+         description = f"AOT-BioMaps -- LS-CG (SELL-c-σ-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+         for it in iterator:
+             # a. q = A * p
+             projection_kernel(q_flat_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               p_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+             # b. z = A^T * q = A^T A p
+             drv.memset_d32_async(z_flat_gpu, 0, ZX, stream)
+             backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                                   q_flat_gpu, z_flat_gpu, np.int32(TN), slice_height,
+                                   block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+             # c. alpha = rho_prev / <p, z>
+             pAp = _dot_product_gpu(mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+
+             if abs(pAp) < 1e-15: break
+             alpha = rho_prev / pAp
+
+             # d. theta = theta + alpha * p
+             axpby_kernel(theta_flat_gpu, theta_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), alpha, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # e. r = r - alpha * z
+             minus_axpy_kernel(r_flat_gpu, z_flat_gpu, alpha, np.int32(ZX),
+                               block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # f. rho_curr = ||r||^2
+             rho_curr = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+             if rho_curr < tolerance: break
+
+             # g. beta = rho_curr / rho_prev
+             beta = rho_curr / rho_prev
+
+             # h. p = r + beta * p
+             axpby_kernel(p_flat_gpu, r_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), beta, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             rho_prev = rho_curr
+
+             stream.synchronize()
+             if isSavingEachIteration and it in save_indices:
+                 out = np.empty(ZX, dtype=dtype)
+                 drv.memcpy_dtoh(out, theta_flat_gpu)
+                 saved_theta.append(out.reshape((Z, X)))
+                 saved_indices.append(it)
+
+         # final copy
+         res = np.empty(ZX, dtype=np.float32)
+         drv.memcpy_dtoh(res, theta_flat_gpu)
+         final_result = res.reshape((Z, X))
+
+         # free temporaries
+         y_gpu.free(); q_flat_gpu.free(); r_flat_gpu.free(); p_flat_gpu.free(); z_flat_gpu.free(); theta_flat_gpu.free(); ATy_flat_gpu.free()
+
+         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
      except Exception as e:
-         print("Error in multi-GPU LS:", type(e).__name__, ":", e)
-         del A_matrix_torch, y_torch, A_split, y_split, theta_0
-         torch.cuda.empty_cache()
-         for i in range(num_gpus):
-             torch.cuda.empty_cache()
+         print(f"Error in LS_CG_sparseSELL_pycuda: {type(e).__name__}: {e}")
+         gc.collect()
          return None, None
-
- def _LS_TV_GPU(SMatrix, y, numIterations, isSavingEachIteration=True, withTumor=True, lambda_tv=1e-3, L_Factor=1.0, renormalize_output=True):
-     device = torch.device(f"cuda:{torch.cuda.current_device()}")
-     T, Z, X, N = SMatrix.shape
-     ZX = Z * X
-     TN = T * N
-
-     # Data conversion
-     A_flat = torch.from_numpy(SMatrix).to(device=device, dtype=torch.float32).permute(0, 3, 1, 2).reshape(TN, ZX)
-     y_flat = torch.from_numpy(y).to(device=device, dtype=torch.float32).reshape(TN)
-
-     # NaN/Inf checks
-     if torch.isnan(A_flat).any() or torch.isinf(A_flat).any():
-         raise ValueError("SMatrix contient des NaN ou Inf.")
-     if torch.isnan(y_flat).any() or torch.isinf(y_flat).any():
-         raise ValueError("y contient des NaN ou Inf.")
-
-     # Normalization
-     A_norm = torch.max(torch.abs(A_flat))
-     y_norm = torch.max(torch.abs(y_flat))
-     if A_norm > 0:
-         A_flat = A_flat / A_norm
-     if y_norm > 0:
-         y_flat = y_flat / y_norm
-
-     # Uniform initialization
-     theta_flat = torch.ones(ZX, device=device) / (Z * X)
-     theta_prev = theta_flat.clone()
-     t = torch.tensor(1.0, device=device)
-
-     # Lipschitz constant
-     L = L_Factor * (torch.norm(A_flat, 2).item() ** 2)
-
-     # Iteration storage
-     theta_history = []
-     saved_indices = []
-     if isSavingEachIteration:
-         theta_history.append(theta_flat.reshape(Z, X).clone())
-         saved_indices.append(0)
-     step = max(1, (numIterations - 1) // 999)
-     save_count = 1
-
-     description = f"AOT-BioMaps -- LS + TV (λ: {lambda_tv}) ---- {'WITH' if withTumor else 'WITHOUT'} TUMOR ---- GPU {torch.cuda.current_device()}"
-
-     # Pre-allocation
-     grad_tv = torch.zeros_like(theta_flat)
-
-     for k in trange(numIterations, desc=description):
-         # Least-squares gradient
-         grad_ls = A_flat.T @ (A_flat @ theta_flat - y_flat)
-
-         # TV gradient computation (corrected version with padding at the borders)
-         theta_2d = theta_flat.reshape(Z, X)
-
-         # Forward differences with zero padding at the borders
-         diff_z = torch.zeros_like(theta_2d)
-         diff_z[1:, :] = theta_2d[1:, :] - theta_2d[:-1, :]  # Vertical derivative
-
-         diff_x = torch.zeros_like(theta_2d)
-         diff_x[:, 1:] = theta_2d[:, 1:] - theta_2d[:, :-1]  # Horizontal derivative
-
-         # Divergence of the gradient (≈ Laplacian)
-         div_grad = torch.zeros_like(theta_2d)
-         # Contribution of diff_z (d/dz)
-         div_grad[:-1, :] += diff_z[1:, :]  # d/dz (θ_{z+1} - θ_z) → +1 at θ_z
-         div_grad[1:, :] -= diff_z[1:, :]   # → -1 at θ_{z+1}
-         # Contribution of diff_x (d/dx)
-         div_grad[:, :-1] += diff_x[:, 1:]  # d/dx (θ_{x+1} - θ_x) → +1 at θ_x
-         div_grad[:, 1:] -= diff_x[:, 1:]   # → -1 at θ_{x+1}
-
-         grad_tv = div_grad.reshape(-1)
-
-         # Update with TV regularization
-         grad_total = grad_ls + lambda_tv * grad_tv
-         theta_new = theta_flat - (1/L) * grad_total
-         theta_new = torch.clamp(theta_new, min=0.0)
-
-         # FISTA acceleration
-         t_new = (1 + torch.sqrt(1 + 4 * t**2)) / 2
-         theta_flat = theta_new + ((t - 1) / t_new) * (theta_new - theta_prev)
-         theta_prev = theta_new.clone()
-         t = t_new
-
-         # Conditional save
-         if isSavingEachIteration and (k % step == 0 or k == numIterations - 1):
-             theta_normalized = theta_flat.clone()
-             if torch.max(theta_normalized) > 0:
-                 theta_normalized /= torch.max(theta_normalized)
-             theta_history.append(theta_normalized.reshape(Z, X).clone())
-             saved_indices.append(k + 1)
-             save_count += 1
-             if save_count >= 1000:
-                 break
-
-     # Final renormalization
-     if renormalize_output:
-         if A_norm > 0 and y_norm > 0:
-             theta_flat *= (y_norm / (A_norm + 1e-8))
-         if torch.max(theta_flat) > 0:
-             theta_flat /= torch.max(theta_flat)
-
-     # Cleanup
-     del A_flat, y_flat, theta_prev, grad_ls, grad_tv, theta_new, div_grad, diff_z, diff_x
-     torch.cuda.empty_cache()
-
-     if isSavingEachIteration:
-         return [t.cpu().numpy() for t in theta_history], saved_indices
-     else:
-         return theta_flat.reshape(Z, X).cpu().numpy(), None
+
+     finally:
+         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+             SMatrix.ctx.pop()
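
The dense `_LS_GPU_stable` path added above is a fixed-step projected gradient descent on min ||A @ theta - y||^2 subject to theta >= 0, scaled by a diagonal (Jacobi) preconditioner built from diag(A^T A). A minimal NumPy sketch of the same update rule, assuming a dense system matrix A of shape (TN, ZX) and a measurement vector y of length TN; this is an illustration of the technique, not the package's GPU code:

import numpy as np

def ls_pgd_sketch(A, y, num_iterations=100, alpha=1e-1):
    # Projected gradient descent for min ||A @ theta - y||^2 with theta >= 0,
    # using 1 / diag(A^T A) as a diagonal preconditioner (as in _LS_GPU_stable).
    A = np.asarray(A, dtype=np.float32)
    y = np.asarray(y, dtype=np.float32)
    theta = np.zeros(A.shape[1], dtype=np.float32)
    M_inv = 1.0 / np.clip((A ** 2).sum(axis=0), 1e-6, None)   # 1 / diag(A^T A)
    for _ in range(num_iterations):
        r = y - A @ theta                         # residual
        theta += alpha * (M_inv * (A.T @ r))      # preconditioned gradient step
        np.clip(theta, 0.0, None, out=theta)      # projection onto theta >= 0
    return theta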
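The sparse CSR and SELL paths solve the same least-squares problem via conjugate gradient on the normal equations A^T A theta = A^T y, with the products A @ p ("projection") and A^T @ q ("backprojection") delegated to the CUDA kernels. The loop below is a plain-NumPy sketch of that iteration, for reference only; the released functions run it with raw pycuda buffers and the compiled kernels:

import numpy as np

def ls_cg_normal_equations_sketch(A, y, num_iterations=100, tolerance=1e-12):
    # Conjugate gradient on the normal equations A^T A theta = A^T y
    # (the _LS_CG_sparse*_pycuda functions run the same loop with GPU kernels).
    A = np.asarray(A, dtype=np.float32)
    y = np.asarray(y, dtype=np.float32)
    theta = np.full(A.shape[1], 0.1, dtype=np.float32)   # same 0.1 initialization as the GPU code
    ATy = A.T @ y
    r = ATy - A.T @ (A @ theta)                          # initial residual of the normal equations
    p = r.copy()
    rho_prev = float(r @ r)
    for _ in range(num_iterations):
        q = A @ p                                        # projection:      q = A p
        z = A.T @ q                                      # backprojection:  z = A^T A p
        pAp = float(p @ z)
        if abs(pAp) < 1e-15:
            break
        alpha = rho_prev / pAp
        theta += alpha * p
        r -= alpha * z
        rho_curr = float(r @ r)
        if rho_curr < tolerance:
            break
        p = r + (rho_curr / rho_prev) * p
        rho_prev = rho_curr
    return theta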