AOT-biomaps 2.9.176-py3-none-any.whl → 2.9.279-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of AOT-biomaps might be problematic.

Files changed (29)
  1. AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
  2. AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +11 -6
  3. AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
  4. AOT_biomaps/AOT_Experiment/_mainExperiment.py +95 -55
  5. AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +48 -13
  6. AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +409 -13
  7. AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +118 -38
  8. AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +306 -102
  9. AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +1 -1
  10. AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
  11. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +281 -0
  12. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +328 -0
  13. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
  14. AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
  15. AOT_biomaps/AOT_Recon/AlgebraicRecon.py +265 -153
  16. AOT_biomaps/AOT_Recon/AnalyticRecon.py +27 -42
  17. AOT_biomaps/AOT_Recon/BayesianRecon.py +84 -151
  18. AOT_biomaps/AOT_Recon/DeepLearningRecon.py +1 -1
  19. AOT_biomaps/AOT_Recon/PrimalDualRecon.py +69 -62
  20. AOT_biomaps/AOT_Recon/ReconEnums.py +27 -2
  21. AOT_biomaps/AOT_Recon/ReconTools.py +152 -12
  22. AOT_biomaps/AOT_Recon/__init__.py +1 -0
  23. AOT_biomaps/AOT_Recon/_mainRecon.py +72 -58
  24. AOT_biomaps/__init__.py +4 -74
  25. {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.279.dist-info}/METADATA +2 -1
  26. aot_biomaps-2.9.279.dist-info/RECORD +47 -0
  27. aot_biomaps-2.9.176.dist-info/RECORD +0 -43
  28. {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.279.dist-info}/WHEEL +0 -0
  29. {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.279.dist-info}/top_level.txt +0 -0
@@ -1,34 +1,60 @@
  from AOT_biomaps.Config import config
+ from AOT_biomaps.AOT_Recon.ReconTools import calculate_memory_requirement, check_gpu_memory
+ from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType
+
  import torch
  import numpy as np
  from tqdm import trange
+ import pycuda.driver as drv
+ import torch.cuda
+ import gc
+
+

  def LS(
      SMatrix,
      y,
-     numIterations=5000,
-     alpha=1e-3,
+     numIterations=100,
      isSavingEachIteration=True,
      withTumor=True,
+     alpha=1e-1,
      device=None,
+     use_numba=False,
+     denominator_threshold=1e-6,
      max_saves=5000,
+     show_logs=True,
+     smatrixType=SMatrixType.SELL,
+     Z=350,
  ):
      """
      Least Squares reconstruction using Projected Gradient Descent (PGD) with non-negativity constraint.
      Currently only implements the stable GPU version.
      """
      tumor_str = "WITH" if withTumor else "WITHOUT"
-     # Force GPU usage for now
+     # Auto-select device and method
      if device is None:
-         if not torch.cuda.is_available():
-             raise RuntimeError("CUDA is required for this implementation.")
-         device = torch.device(f"cuda:{config.select_best_gpu()}")
+         if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+             device = torch.device(f"cuda:{config.select_best_gpu()}")
+             use_gpu = True
+         else:
+             device = torch.device("cpu")
+             use_gpu = False
+     else:
+         use_gpu = device.type == "cuda"
+     # Dispatch to the appropriate implementation
+     if use_gpu:
+         if smatrixType == SMatrixType.CSR:
+             return _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, Z, show_logs)
+         elif smatrixType == SMatrixType.SELL:
+             return _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+         elif smatrixType == SMatrixType.DENSE:
+             return _LS_GPU_stable(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold,show_logs)
+         else:
+             raise ValueError("Unsupported SMatrixType for GPU LS.")
      else:
-         if device.type != "cuda":
-             raise RuntimeError("Only GPU implementation is available for now.")
-     return _LS_GPU_stable(SMatrix, y, numIterations, alpha, isSavingEachIteration, tumor_str, max_saves)
+         raise NotImplementedError("Only GPU implementations are currently available for LS.")

- def _LS_GPU_stable(SMatrix, y, numIterations, alpha, isSavingEachIteration, tumor_str, max_saves=5000):
+ def _LS_GPU_stable(SMatrix, y, numIterations, alpha, isSavingEachIteration, tumor_str, max_saves=5000, show_logs=True):
      """
      Stable GPU implementation of LS using projected gradient descent with diagonal preconditioner.
      """
@@ -65,13 +91,14 @@ def _LS_GPU_stable(SMatrix, y, numIterations, alpha, isSavingEachIteration, tumo
      AT_r = torch.empty(ZX, device=device)
      description = f"AOT-BioMaps -- Stable LS Reconstruction ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"

-     for k in trange(numIterations, desc=description):
+     iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+     for it in iterator:
          # Compute the residual (in-place)
          torch.matmul(A_flat, lambda_k, out=r_k)
          r_k = y_flat - r_k
-         if isSavingEachIteration and k in save_indices:
+         if isSavingEachIteration and it in save_indices:
              lambda_history.append(lambda_k.clone().reshape(Z, X) * (norm_y / norm_A))
-             saved_indices.append(k)
+             saved_indices.append(it)

          # Preconditioned gradient (in-place)
          torch.matmul(A_flat.T, r_k, out=AT_r)
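Both the dense path above and the sparse CG paths below cap the number of stored intermediate images via max_saves. Here is a standalone sketch of that subsampling rule; the helper name select_save_indices is hypothetical and simply mirrors the save_indices logic that appears in the new CG functions below.

def select_save_indices(num_iterations, max_saves):
    # Keep every iteration when it fits within max_saves, otherwise take a
    # strided subset and make sure the last iteration is always included.
    if num_iterations <= max_saves:
        return list(range(num_iterations))
    indices = list(range(0, num_iterations, max(1, num_iterations // max_saves)))
    if indices[-1] != num_iterations - 1:
        indices.append(num_iterations - 1)
    return indices

# Example: 5000 iterations capped at 5 saves -> [0, 1000, 2000, 3000, 4000, 4999]
print(select_save_indices(5000, 5))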
@@ -101,3 +128,372 @@ def _LS_CPU_opti(*args, **kwargs):

  def _LS_CPU_basic(*args, **kwargs):
      raise NotImplementedError("Only _LS_GPU_stable is implemented for now.")
+
+ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+     """
+     Least Squares (LS) reconstruction via Conjugate Gradient (CG) on the CSR format.
+     Uses the same arguments as the MLEM function, with no Python sub-functions.
+
+     SMatrix: SparseSMatrix_CSR instance (already allocated)
+     y: measured data (1D np.float32 of size TN)
+     """
+     final_result = None
+
+     # Parameters unused by CG but kept for the signature: denominator_threshold, device
+
+     # --- Dot-product logic (inlined) ---
+     def _dot_product_gpu(mod, a_ptr, b_ptr, N_int, stream):
+         block_size = 256
+         grid_size = (N_int + block_size - 1) // block_size
+
+         reduction_host = np.empty(grid_size, dtype=np.float32)
+         reduction_buffer = drv.mem_alloc(reduction_host.nbytes)
+
+         dot_kernel = mod.get_function("dot_product_reduction_kernel")
+
+         dot_kernel(reduction_buffer, a_ptr, b_ptr, np.int32(N_int),
+                    block=(block_size, 1, 1), grid=(grid_size, 1, 1), stream=stream)
+
+         drv.memcpy_dtoh(reduction_host, reduction_buffer)
+         total_dot = np.sum(reduction_host)
+
+         reduction_buffer.free()
+         return total_dot
+     # -----------------------------------------------
+
+     try:
+         if not isinstance(SMatrix, SMatrix.__class__):
+             raise TypeError("SMatrix must be a SparseSMatrix_CSR object")
+
+         if SMatrix.ctx:
+             SMatrix.ctx.push()
+
+         dtype = np.float32
+         TN = SMatrix.N * SMatrix.T
+         ZX = SMatrix.Z * SMatrix.X
+         Z = SMatrix.Z
+         X = SMatrix.X
+         block_size = 256
+         tolerance = 1e-12
+
+         if show_logs:
+             print(f"Executing on GPU device index: {SMatrix.device.primary_context.device.name()}")
+             print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")
+
+         stream = drv.Stream()
+         mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
+
+         # Retrieve the kernels
+         projection_kernel = mod.get_function('projection_kernel__CSR')
+         backprojection_kernel = mod.get_function('backprojection_kernel__CSR')
+         axpby_kernel = mod.get_function("vector_axpby_kernel")
+         minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
+
+         # --- Buffer allocation (raw pointers) ---
+         y = y.T.flatten().astype(dtype)
+         y_gpu = drv.mem_alloc(y.nbytes)
+         drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+         theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # lambda
+         drv.memcpy_htod_async(theta_flat_gpu, np.full(ZX, 0.1, dtype=dtype), stream)
+
+         q_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)    # q = A*p
+         r_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)    # r (residual)
+         p_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)    # p (direction)
+         z_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)    # z = A^T A p
+         ATy_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)  # A^T y (constant)
+
+         # --- CG initialization ---
+
+         # 1. ATy = A^T * y
+         drv.memset_d32_async(ATy_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(ATy_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               y_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 2. q = A * theta_0
+         projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                           theta_flat_gpu, np.int32(TN),
+                           block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 3. r_temp = A^T * q = A^T A theta_0
+         drv.memset_d32_async(r_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(r_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               q_flat_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 4. r_0 = ATy - r_temp (r = ATy + (-1)*r_temp)
+         axpby_kernel(r_flat_gpu, ATy_flat_gpu, r_flat_gpu,
+                      np.float32(1.0), np.float32(-1.0), np.int32(ZX),
+                      block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 5. p_0 = r_0
+         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
+
+         # 6. rho_prev = ||r_0||^2
+         rho_prev = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+         # --- Iterative loop ---
+         saved_theta, saved_indices = [], []
+         if numIterations <= max_saves:
+             save_indices = list(range(numIterations))
+         else:
+             save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+             if save_indices[-1] != numIterations - 1:
+                 save_indices.append(numIterations - 1)
+
+         description = f"AOT-BioMaps -- LS-CG (CSR-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+         for it in iterator:
+             # a. q = A * p
+             projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                               p_flat_gpu, np.int32(TN),
+                               block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # b. z = A^T * q = A^T A p
+             drv.memset_d32_async(z_flat_gpu, 0, ZX, stream)
+             backprojection_kernel(z_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                                   q_flat_gpu, np.int32(TN),
+                                   block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # c. alpha = rho_prev / <p, z>
+             pAp = _dot_product_gpu(mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+
+             if abs(pAp) < 1e-15: break
+             alpha = rho_prev / pAp
+
+             # d. theta = theta + alpha * p
+             axpby_kernel(theta_flat_gpu, theta_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), alpha, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # e. r = r - alpha * z
+             minus_axpy_kernel(r_flat_gpu, z_flat_gpu, alpha, np.int32(ZX),
+                               block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # f. rho_curr = ||r||^2
+             rho_curr = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+             if rho_curr < tolerance: break
+
+             # g. beta = rho_curr / rho_prev
+             beta = rho_curr / rho_prev
+
+             # h. p = r + beta * p
+             axpby_kernel(p_flat_gpu, r_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), beta, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             rho_prev = rho_curr
+
+             if show_logs and (it % 10 == 0 or it == numIterations - 1):
+                 drv.Context.synchronize()
+
+             if isSavingEachIteration and it in save_indices:
+                 theta_host = np.empty(ZX, dtype=dtype)
+                 drv.memcpy_dtoh(theta_host, theta_flat_gpu)
+                 saved_theta.append(theta_host.reshape(Z, X))
+                 saved_indices.append(it)
+
+         drv.Context.synchronize()
+
+         final_result = np.empty(ZX, dtype=dtype)
+         drv.memcpy_dtoh(final_result, theta_flat_gpu)
+         final_result = final_result.reshape(Z, X)
+
+         # Free GPU buffers
+         y_gpu.free(); q_flat_gpu.free(); r_flat_gpu.free(); p_flat_gpu.free(); z_flat_gpu.free(); theta_flat_gpu.free(); ATy_flat_gpu.free()
+
+         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
+     except Exception as e:
+         print(f"Error in LS_CG_sparseCSR_pycuda: {type(e).__name__}: {e}")
+         gc.collect()
+         return None, None
+
+     finally:
+         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+             SMatrix.ctx.pop()
+
+ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+     """
+     Least Squares (LS) reconstruction via Conjugate Gradient (CG) on the SELL-C-sigma format.
+     Uses the same arguments as the MLEM function, with no Python sub-functions.
+
+     SMatrix: SparseSMatrix_SELL instance (already allocated)
+     y: measured data (1D np.float32 of size TN)
+     """
+     final_result = None
+
+     # --- Dot-product logic (inlined) ---
+     def _dot_product_gpu(mod, a_ptr, b_ptr, N_int, stream):
+         block_size = 256
+         grid_size = (N_int + block_size - 1) // block_size
+
+         reduction_host = np.empty(grid_size, dtype=np.float32)
+         reduction_buffer = drv.mem_alloc(reduction_host.nbytes)
+
+         dot_kernel = mod.get_function("dot_product_reduction_kernel")
+
+         dot_kernel(reduction_buffer, a_ptr, b_ptr, np.int32(N_int),
+                    block=(block_size, 1, 1), grid=(grid_size, 1, 1), stream=stream)
+
+         drv.memcpy_dtoh(reduction_host, reduction_buffer)
+         total_dot = np.sum(reduction_host)
+
+         reduction_buffer.free()
+         return total_dot
+     # -----------------------------------------------
+
+     try:
+         if not isinstance(SMatrix, SMatrix.__class__):
+             raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
+         if SMatrix.sell_values_gpu is None:
+             raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")
+
+         if SMatrix.ctx:
+             SMatrix.ctx.push()
+
+         dtype = np.float32
+         TN = int(SMatrix.N * SMatrix.T)
+         ZX = int(SMatrix.Z * SMatrix.X)
+         Z = SMatrix.Z
+         X = SMatrix.X
+         block_size = 256
+         tolerance = 1e-12
+
+         # Access the SELL parameters
+         mod = SMatrix.sparse_mod
+         projection_kernel = mod.get_function("projection_kernel__SELL")
+         backprojection_kernel = mod.get_function("backprojection_kernel__SELL")
+         axpby_kernel = mod.get_function("vector_axpby_kernel")
+         minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
+         slice_height = np.int32(SMatrix.slice_height)
+         grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
+
+         stream = drv.Stream()
+
+         # Buffer allocation
+         y = y.T.flatten().astype(dtype)
+         y_gpu = drv.mem_alloc(y.nbytes)
+         drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+         theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         drv.memcpy_htod_async(theta_flat_gpu, np.full(ZX, 0.1, dtype=dtype), stream)
+
+         q_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+         r_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         p_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         z_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+         ATy_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+
+         # --- CG initialization ---
+
+         # 1. ATy = A^T * y
+         drv.memset_d32_async(ATy_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               y_gpu, ATy_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 2. q = A * theta_0
+         projection_kernel(q_flat_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                           theta_flat_gpu, np.int32(TN), slice_height,
+                           block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 3. r_temp = A^T * q = A^T A theta_0
+         drv.memset_d32_async(r_flat_gpu, 0, ZX, stream)
+         backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               q_flat_gpu, r_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+         # 4. r_0 = ATy - r_temp
+         axpby_kernel(r_flat_gpu, ATy_flat_gpu, r_flat_gpu,
+                      np.float32(1.0), np.float32(-1.0), np.int32(ZX),
+                      block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+         # 5. p_0 = r_0
+         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
+
+         # 6. rho_prev = ||r_0||^2
+         rho_prev = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+         # --- Iterative loop ---
+         saved_theta, saved_indices = [], []
+         if numIterations <= max_saves:
+             save_indices = list(range(numIterations))
+         else:
+             save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+             if save_indices[-1] != numIterations - 1:
+                 save_indices.append(numIterations - 1)
+
+         description = f"AOT-BioMaps -- LS-CG (SELL-c-σ-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+         for it in iterator:
+             # a. q = A * p
+             projection_kernel(q_flat_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                               p_flat_gpu, np.int32(TN), slice_height,
+                               block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+             # b. z = A^T * q = A^T A p
+             drv.memset_d32_async(z_flat_gpu, 0, ZX, stream)
+             backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+                                   q_flat_gpu, z_flat_gpu, np.int32(TN), slice_height,
+                                   block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+             # c. alpha = rho_prev / <p, z>
+             pAp = _dot_product_gpu(mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+
+             if abs(pAp) < 1e-15: break
+             alpha = rho_prev / pAp
+
+             # d. theta = theta + alpha * p
+             axpby_kernel(theta_flat_gpu, theta_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), alpha, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # e. r = r - alpha * z
+             minus_axpy_kernel(r_flat_gpu, z_flat_gpu, alpha, np.int32(ZX),
+                               block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             # f. rho_curr = ||r||^2
+             rho_curr = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+
+             if rho_curr < tolerance: break
+
+             # g. beta = rho_curr / rho_prev
+             beta = rho_curr / rho_prev
+
+             # h. p = r + beta * p
+             axpby_kernel(p_flat_gpu, r_flat_gpu, p_flat_gpu,
+                          np.float32(1.0), beta, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+             rho_prev = rho_curr
+
+             stream.synchronize()
+             if isSavingEachIteration and it in save_indices:
+                 out = np.empty(ZX, dtype=dtype)
+                 drv.memcpy_dtoh(out, theta_flat_gpu)
+                 saved_theta.append(out.reshape((Z, X)))
+                 saved_indices.append(it)
+
+         # final copy
+         res = np.empty(ZX, dtype=np.float32)
+         drv.memcpy_dtoh(res, theta_flat_gpu)
+         final_result = res.reshape((Z, X))
+
+         # free temporaries
+         y_gpu.free(); q_flat_gpu.free(); r_flat_gpu.free(); p_flat_gpu.free(); z_flat_gpu.free(); theta_flat_gpu.free(); ATy_flat_gpu.free()
+
+         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
+     except Exception as e:
+         print(f"Error in LS_CG_sparseSELL_pycuda: {type(e).__name__}: {e}")
+         gc.collect()
+         return None, None
+
+     finally:
+         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+             SMatrix.ctx.pop()
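For reference, the two new pycuda routines above apply conjugate gradient to the normal equations A^T A theta = A^T y (CGNR). The following is a minimal NumPy sketch of the same update sequence, using a dense A for illustration and omitting the GPU-specific details (streams, slice_height, the block-wise dot-product reduction); it is not part of the package.

import numpy as np

def ls_cg_normal_equations(A, y, num_iterations=100, tolerance=1e-12):
    # Solve min ||A theta - y||^2 by running CG on A^T A theta = A^T y,
    # mirroring initialization steps 1-6 and loop steps a-h above.
    theta = np.full(A.shape[1], 0.1, dtype=np.float32)  # same flat 0.1 start
    ATy = A.T @ y
    r = ATy - A.T @ (A @ theta)        # r_0 = A^T y - A^T A theta_0
    p = r.copy()                       # p_0 = r_0
    rho_prev = float(r @ r)            # ||r_0||^2
    for _ in range(num_iterations):
        z = A.T @ (A @ p)              # z = A^T A p
        pAp = float(p @ z)
        if abs(pAp) < 1e-15:
            break
        alpha = rho_prev / pAp
        theta += alpha * p             # theta <- theta + alpha * p
        r -= alpha * z                 # r <- r - alpha * z
        rho_curr = float(r @ r)
        if rho_curr < tolerance:
            break
        p = r + (rho_curr / rho_prev) * p   # beta = rho_curr / rho_prev
        rho_prev = rho_curr
    return theta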