AOT-biomaps 2.9.176__py3-none-any.whl → 2.9.279__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of AOT-biomaps might be problematic.

Files changed (29)
  1. AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
  2. AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +11 -6
  3. AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
  4. AOT_biomaps/AOT_Experiment/_mainExperiment.py +95 -55
  5. AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +48 -13
  6. AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +409 -13
  7. AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +118 -38
  8. AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +306 -102
  9. AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +1 -1
  10. AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
  11. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +281 -0
  12. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +328 -0
  13. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
  14. AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
  15. AOT_biomaps/AOT_Recon/AlgebraicRecon.py +265 -153
  16. AOT_biomaps/AOT_Recon/AnalyticRecon.py +27 -42
  17. AOT_biomaps/AOT_Recon/BayesianRecon.py +84 -151
  18. AOT_biomaps/AOT_Recon/DeepLearningRecon.py +1 -1
  19. AOT_biomaps/AOT_Recon/PrimalDualRecon.py +69 -62
  20. AOT_biomaps/AOT_Recon/ReconEnums.py +27 -2
  21. AOT_biomaps/AOT_Recon/ReconTools.py +152 -12
  22. AOT_biomaps/AOT_Recon/__init__.py +1 -0
  23. AOT_biomaps/AOT_Recon/_mainRecon.py +72 -58
  24. AOT_biomaps/__init__.py +4 -74
  25. {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.279.dist-info}/METADATA +2 -1
  26. aot_biomaps-2.9.279.dist-info/RECORD +47 -0
  27. aot_biomaps-2.9.176.dist-info/RECORD +0 -43
  28. {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.279.dist-info}/WHEEL +0 -0
  29. {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.279.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,18 @@
  from AOT_biomaps.AOT_Recon.ReconTools import _forward_projection, _backward_projection, check_gpu_memory, calculate_memory_requirement
  from AOT_biomaps.Config import config
+ from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix.SparseSMatrix_SELL import SparseSMatrix_SELL
+ from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix.SparseSMatrix_CSR import SparseSMatrix_CSR
+ from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType
  import numba
  import torch
  import numpy as np
  import os
  from tqdm import trange
+ import cupy as cp
+ import cupyx.scipy.sparse as cpsparse
+ import gc
+ import pycuda.driver as drv
+

  def MLEM(
  SMatrix,
@@ -13,9 +21,12 @@ def MLEM(
  isSavingEachIteration=True,
  withTumor=True,
  device=None,
- use_multi_gpu=False,
  use_numba=False,
+ denominator_threshold=1e-6,
  max_saves=5000,
+ show_logs=True,
+ smatrixType=SMatrixType.SELL,
+ Z=350,
  ):
  """
  Unified MLEM algorithm for Acousto-Optic Tomography.
@@ -33,34 +44,38 @@ def MLEM(
  Returns:
  Reconstructed image(s) and iteration indices (if isSavingEachIteration)
  """
- try:
- tumor_str = "WITH" if withTumor else "WITHOUT"
- # Auto-select device and method
- if device is None:
- if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y)):
- device = torch.device(f"cuda:{config.select_best_gpu()}")
- use_gpu = True
- else:
- device = torch.device("cpu")
- use_gpu = False
+ # try:
+ tumor_str = "WITH" if withTumor else "WITHOUT"
+ # Auto-select device and method
+ if device is None:
+ if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+ device = torch.device(f"cuda:{config.select_best_gpu()}")
+ use_gpu = True
  else:
- use_gpu = device.type == "cuda"
- # Dispatch to the appropriate implementation
- if use_gpu:
- if use_multi_gpu and torch.cuda.device_count() > 1:
- return _MLEM_multi_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves)
+ device = torch.device("cpu")
+ use_gpu = False
+ else:
+ use_gpu = device.type == "cuda"
+ # Dispatch to the appropriate implementation
+ if use_gpu:
+ if smatrixType == SMatrixType.CSR:
+ return MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, Z, show_logs)
+ elif smatrixType == SMatrixType.SELL:
+ return MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+ elif smatrixType == SMatrixType.DENSE:
+ return _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold,show_logs)
  else:
- return _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves)
+ raise ValueError("Unsupported SMatrixType for GPU MLEM.")
+ else:
+ if use_numba:
+ return _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
  else:
- if use_numba:
- return _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves)
- else:
- return _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves)
- except Exception as e:
- print(f"Error in MLEM: {type(e).__name__}: {e}")
- return None, None
+ return _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
+ # except Exception as e:
+ # print(f"Error in MLEM: {type(e).__name__}: {e}")
+ # return None, None

- def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves=5000):
+ def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
  try:
  eps = torch.finfo(torch.float32).eps
  T, Z, X, N = SMatrix.shape
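For orientation, a minimal usage sketch of the new dispatch. This is a sketch only: the import path is inferred from the package's file layout, and the parameter values and the way SMatrix is built are assumptions, not taken from this diff.

    # Hypothetical call; parameter names follow the new MLEM signature above.
    from AOT_biomaps.AOT_Recon.AOT_Optimizers.MLEM import MLEM
    from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType

    theta_list, saved_iters = MLEM(
        SMatrix,                       # SparseSMatrix_SELL instance, or a dense (T, Z, X, N) array with SMatrixType.DENSE
        y,                             # measured data
        numIterations=500,
        isSavingEachIteration=True,
        smatrixType=SMatrixType.SELL,  # selects the SELL-C-sigma PyCUDA path on GPU
        denominator_threshold=1e-6,    # forward projections below this give a ratio of 1
        show_logs=False,
    )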
@@ -82,7 +97,6 @@ def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str
  .reshape(-1)
  )
  description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
-
  # Calculate save indices
  if numIterations <= max_saves:
  save_indices = list(range(numIterations))
@@ -91,20 +105,21 @@ def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str
  save_indices = list(range(0, numIterations, step))
  if save_indices[-1] != numIterations - 1:
  save_indices.append(numIterations - 1)
-
  saved_theta = []
  saved_indices = []
-
  with torch.no_grad():
- for it in trange(numIterations, desc=description):
+ # Use range if show_logs=False, otherwise trange
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+ for it in iterator:
  q_flat = A_flat @ theta_flat
- e_flat = y_flat / (q_flat + eps)
+ # Apply the threshold: if q_flat < denominator_threshold, set e_flat to 1 (as in the C++ code)
+ mask = q_flat >= denominator_threshold
+ e_flat = torch.where(mask, y_flat / (q_flat + eps), torch.ones_like(q_flat))
  c_flat = A_flat.T @ e_flat
  theta_flat = (theta_flat / (norm_factor_flat + eps)) * c_flat
  if isSavingEachIteration and it in save_indices:
  saved_theta.append(theta_flat.reshape(Z, X).clone())
  saved_indices.append(it)
-
  # Free memory
  del A_flat, y_flat, norm_factor_flat
  torch.cuda.empty_cache()
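The thresholded ratio above replaces the plain y / (q + eps) division. A standalone NumPy sketch (illustration only, not package code) of what the new step does when the forward projection q is nearly zero:

    import numpy as np

    def em_ratio(y, q, denominator_threshold=1e-6, eps=np.finfo(np.float32).eps):
        # Where q falls below the threshold, the multiplicative correction is
        # forced to 1, so those measurements leave theta unchanged this iteration.
        mask = q >= denominator_threshold
        return np.where(mask, y / (q + eps), 1.0)

    y = np.array([2.0, 3.0, 0.5], dtype=np.float32)
    q = np.array([1.0, 1e-9, 0.5], dtype=np.float32)
    print(em_ratio(y, q))   # [2. 1. 1.] -- the near-zero q no longer blows up the ratio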
@@ -117,74 +132,15 @@ def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str
  torch.cuda.empty_cache()
  return None, None

- def _MLEM_multi_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves=5000):
- try:
- num_gpus = torch.cuda.device_count()
- device = torch.device('cuda:0')
- T, Z, X, N = SMatrix.shape
- A_matrix_torch = torch.tensor(SMatrix, dtype=torch.float32).to(device).permute(0, 3, 1, 2).reshape(T * N, Z * X)
- y_torch = torch.tensor(y, dtype=torch.float32).to(device).reshape(-1)
- A_split = torch.chunk(A_matrix_torch, num_gpus, dim=0)
- y_split = torch.chunk(y_torch, num_gpus)
- theta_0 = torch.ones((Z, X), dtype=torch.float32, device=device)
- theta_list = [theta_0.clone().to(device) for _ in range(num_gpus)]
- normalization_factor = A_matrix_torch.sum(dim=0).reshape(Z, X).to(device)
-
- # Calculate save indices
- if numIterations <= max_saves:
- save_indices = list(range(numIterations))
- else:
- step = numIterations // max_saves
- save_indices = list(range(0, numIterations, step))
- if save_indices[-1] != numIterations - 1:
- save_indices.append(numIterations - 1)
-
- saved_theta = [theta_0.cpu().numpy()]
- saved_indices = [0]
- description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- processing on multi-GPU ({num_gpus} GPUs) ----"
-
- for it in trange(numIterations, desc=description):
- theta_p_list = []
- for i in range(num_gpus):
- with torch.cuda.device(f'cuda:{i}'):
- theta_p = theta_list[i].to(f'cuda:{i}')
- A_i = A_split[i].to(f'cuda:{i}')
- y_i = y_split[i].to(f'cuda:{i}')
- q_flat = A_i @ theta_p.reshape(-1)
- e_flat = y_i / (q_flat + torch.finfo(torch.float32).tiny)
- c_flat = A_i.T @ e_flat
- theta_p_plus_1_flat = (theta_p.reshape(-1) / (normalization_factor.to(f'cuda:{i}').reshape(-1) + torch.finfo(torch.float32).tiny)) * c_flat
- theta_p_plus_1 = theta_p_plus_1_flat.reshape(Z, X)
- theta_p_list.append(theta_p_plus_1)
- for i in range(num_gpus):
- theta_list[i] = theta_p_list[i].to('cuda:0')
- if isSavingEachIteration and it in save_indices:
- saved_theta.append(torch.stack(theta_p_list).mean(dim=0).cpu().numpy())
- saved_indices.append(it + 1)
-
- del A_matrix_torch, y_torch, A_split, y_split, theta_0, normalization_factor
- for i in range(num_gpus):
- torch.cuda.empty_cache()
- if not isSavingEachIteration:
- return torch.stack(theta_p_list).mean(dim=0).cpu().numpy(), None
- else:
- return saved_theta, saved_indices
- except Exception as e:
- print(f"Error in multi-GPU MLEM: {type(e).__name__}: {e}")
- del A_matrix_torch, y_torch, A_split, y_split, theta_0, normalization_factor
- for i in range(num_gpus):
- torch.cuda.empty_cache()
- return None, None
-
- def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves=5000):
+ def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs=True):
  try:
  numba.set_num_threads(os.cpu_count())
- q_p = np.zeros((SMatrix.shape[0], SMatrix.shape[3]))
- c_p = np.zeros((SMatrix.shape[1], SMatrix.shape[2]))
- theta_p_0 = np.ones((SMatrix.shape[1], SMatrix.shape[2]))
+ q_p = np.zeros((SMatrix.shape[0], SMatrix.shape[3]), dtype=np.float32)
+ c_p = np.zeros((SMatrix.shape[1], SMatrix.shape[2]), dtype=np.float32)
+ theta_p_0 = np.ones((SMatrix.shape[1], SMatrix.shape[2]), dtype=np.float32)
  matrix_theta = [theta_p_0]
  saved_indices = [0]
- normalization_factor = np.sum(SMatrix, axis=(0, 3))
+ normalization_factor = np.sum(SMatrix, axis=(0, 3)).astype(np.float32)

  # Calculate save indices
  if numIterations <= max_saves:
@@ -196,14 +152,20 @@ def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
  save_indices.append(numIterations - 1)

  description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- processing on multithread CPU ({numba.config.NUMBA_DEFAULT_NUM_THREADS} threads) ----"
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)

- for it in trange(numIterations, desc=description):
+ for it in iterator:
  theta_p = matrix_theta[-1]
  _forward_projection(SMatrix, theta_p, q_p)
- e_p = y / (q_p + 1e-8)
+
+ # Apply the threshold: if q_p < denominator_threshold, set e_p to 1
+ mask = q_p >= denominator_threshold
+ e_p = np.where(mask, y / (q_p + 1e-8), 1.0)
+
  _backward_projection(SMatrix, e_p, c_p)
  theta_p_plus_1 = theta_p / (normalization_factor + 1e-8) * c_p
- if isSavingEachIteration and it in save_indices:
+
+ if isSavingEachIteration and (it + 1) in save_indices:
  matrix_theta.append(theta_p_plus_1)
  saved_indices.append(it + 1)
  else:
@@ -217,7 +179,7 @@ def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
  print(f"Error in Numba CPU MLEM: {type(e).__name__}: {e}")
  return None, None

- def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves=5000):
+ def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs=True):
  try:
  T, Z, X, N = SMatrix.shape
  A_flat = SMatrix.astype(np.float32).transpose(0, 3, 1, 2).reshape(T * N, Z * X)
@@ -238,16 +200,22 @@ def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
  save_indices.append(numIterations - 1)

  description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- processing on single CPU (optimized) ----"
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)

- for it in trange(numIterations, desc=description):
+ for it in iterator:
  theta_p = matrix_theta[-1]
  theta_p_flat = theta_p.reshape(-1)
  q_flat = A_flat @ theta_p_flat
- e_flat = y_flat / (q_flat + np.finfo(np.float32).tiny)
+
+ # Apply the threshold: if q_flat < denominator_threshold, set e_flat to 1
+ mask = q_flat >= denominator_threshold
+ e_flat = np.where(mask, y_flat / (q_flat + np.finfo(np.float32).tiny), 1.0)
+
  c_flat = A_flat.T @ e_flat
  theta_p_plus_1_flat = theta_p_flat / (normalization_factor_flat + np.finfo(np.float32).tiny) * c_flat
  theta_p_plus_1 = theta_p_plus_1_flat.reshape(Z, X)
- if isSavingEachIteration and it in save_indices:
+
+ if isSavingEachIteration and (it + 1) in save_indices:
  matrix_theta.append(theta_p_plus_1)
  saved_indices.append(it + 1)
  else:
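Both CPU paths above share the same save-index thinning and now test (it + 1) against it, so the stored index matches the appended iterate. A standalone sketch of the thinning for toy numbers (illustration only, not package code):

    numIterations, max_saves = 20, 5
    if numIterations <= max_saves:
        save_indices = list(range(numIterations))
    else:
        step = numIterations // max_saves
        save_indices = list(range(0, numIterations, step))
        if save_indices[-1] != numIterations - 1:
            save_indices.append(numIterations - 1)
    print(save_indices)   # [0, 4, 8, 12, 16, 19] -- at most ~max_saves iterates plus the final one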
@@ -260,3 +228,239 @@ def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
  except Exception as e:
  print(f"Error in optimized CPU MLEM: {type(e).__name__}: {e}")
  return None, None
+
+ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+ """
+ SMatrix: instance of SparseMatrixGPU (already allocated)
+ y: measured data (1D np.float32 of length TN)
+
+ Assumptions:
+ - SMatrix.values_gpu and SMatrix.col_ind_gpu and SMatrix.row_ptr_gpu are device pointers
+ - SMatrix.norm_factor_inv_gpu exists
+ - SMatrix.ctx is the PyCUDA context for the target GPU.
+ """
+
+ # We use a final_result placeholder to ensure it's defined outside the try block
+ final_result = None
+
+ try:
+ if not isinstance(SMatrix, SparseSMatrix_CSR):
+ raise TypeError("SMatrix must be a SparseSMatrix_CSR object")
+
+ # --- CONTEXT FIX: Push the context associated with SMatrix ---
+ # This ensures all subsequent PyCUDA operations use the correct GPU/context.
+ if SMatrix.ctx:
+ SMatrix.ctx.push()
+ # -----------------------------------------------------------
+
+ dtype = np.float32
+ TN = SMatrix.N * SMatrix.T
+ ZX = SMatrix.Z * SMatrix.X
+ # Ensure Z and X are correctly defined for reshaping
+ Z = SMatrix.Z
+ X = SMatrix.X
+
+ if show_logs:
+ # We assume SMatrix was initialized using the correct device index.
+ print(f"Executing on GPU device index: {SMatrix.device.primary_context.device.name()}")
+ print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")
+
+ # streams
+ stream = drv.Stream()
+
+ # allocate device buffers
+ y = y.T.flatten().astype(np.float32)
+ y_gpu = drv.mem_alloc(y.nbytes)
+ drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+ theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+ initial_theta = np.full(ZX, 0.1, dtype=dtype)
+ drv.memcpy_htod_async(theta_flat_gpu, initial_theta, stream)
+
+ norm_factor_inv_gpu = SMatrix.norm_factor_inv_gpu
+
+ q_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+ e_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+ c_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+
+ # Assuming the cubin file is found globally or managed by the caller
+ mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
+ projection_kernel = mod.get_function('projection_kernel__CSR')
+ backprojection_kernel = mod.get_function('backprojection_kernel__CSR')
+ ratio_kernel = mod.get_function('ratio_kernel')
+ update_kernel = mod.get_function('update_theta_kernel')
+ block_size = 256
+
+ saved_theta, saved_indices = [], []
+ if numIterations <= max_saves:
+ save_indices = list(range(numIterations))
+ else:
+ save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+ if save_indices[-1] != numIterations - 1:
+ save_indices.append(numIterations - 1)
+
+ description = f"AOT-BioMaps -- ML-EM (CSR-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+ for it in iterator:
+ # projection: q = A * theta
+ projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+ theta_flat_gpu, np.int32(TN),
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1),
+ stream=stream)
+
+ # ratio: e = y / max(q, threshold)
+ ratio_kernel(e_flat_gpu, y_gpu, q_flat_gpu, np.float32(denominator_threshold), np.int32(TN),
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+ # backprojection: c = A^T * e
+ drv.memset_d32_async(c_flat_gpu, 0, ZX, stream)
+ backprojection_kernel(c_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+ e_flat_gpu, np.int32(TN),
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+ # update: theta *= norm_factor_inv * c
+ update_kernel(theta_flat_gpu, c_flat_gpu, norm_factor_inv_gpu, np.int32(ZX),
+ block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+ if show_logs and (it % 10 == 0 or it == numIterations - 1):
+ drv.Context.synchronize()
+
+ if isSavingEachIteration and it in save_indices:
+ theta_host = np.empty(ZX, dtype=dtype)
+ drv.memcpy_dtoh(theta_host, theta_flat_gpu)
+ saved_theta.append(theta_host.reshape(Z, X))
+ saved_indices.append(it)
+
+ drv.Context.synchronize()
+
+ final_result = np.empty(ZX, dtype=dtype)
+ drv.memcpy_dtoh(final_result, theta_flat_gpu)
+ final_result = final_result.reshape(Z, X)
+
+ # free local allocations
+ y_gpu.free(); q_flat_gpu.free(); e_flat_gpu.free(); c_flat_gpu.free(); theta_flat_gpu.free()
+
+ return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
+ except Exception as e:
+ print(f"Error in MLEM_sparseCSR_pycuda: {type(e).__name__}: {e}")
+ gc.collect()
+ return None, None
+
+ finally:
+ # --- CONTEXT FIX: Pop the context ---
+ if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+ SMatrix.ctx.pop()
+ # ------------------------------------
+
+ def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+ """
+ MLEM using SELL-C-σ kernels already present on device.
+ y must be float32 length TN.
+ """
+ final_result = None
+
+ try:
+ # check if SMatrix is SparseSMatrix_SELL object
+ if not isinstance(SMatrix, SparseSMatrix_SELL):
+ raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
+ if SMatrix.sell_values_gpu is None:
+ raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")
+
+ # --- CONTEXT FIX: Push the context associated with SMatrix ---
+ # This ensures all subsequent PyCUDA operations use the correct GPU/context.
+ if SMatrix.ctx:
+ SMatrix.ctx.push()
+ # -----------------------------------------------------------
+
+ TN = int(SMatrix.N * SMatrix.T)
+ ZX = int(SMatrix.Z * SMatrix.X)
+ dtype = np.float32
+ block_size = 256
+
+ mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
+ proj = mod.get_function("projection_kernel__SELL")
+ backproj = mod.get_function("backprojection_kernel__SELL")
+ ratio = mod.get_function("ratio_kernel")
+ update = mod.get_function("update_theta_kernel")
+
+ stream = drv.Stream()
+
+ # device buffers
+ y = y.T.flatten().astype(np.float32)
+ y_gpu = drv.mem_alloc(y.nbytes)
+ drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+ theta_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+ drv.memcpy_htod_async(theta_gpu, np.full(ZX, 0.1, dtype=dtype), stream)
+
+ q_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+ e_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+ c_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+
+ slice_ptr_gpu = SMatrix.slice_ptr_gpu
+ slice_len_gpu = SMatrix.slice_len_gpu
+ slice_height = np.int32(SMatrix.slice_height)
+
+ grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
+ grid_cols = ((ZX + block_size - 1) // block_size, 1, 1)
+
+ saved_theta, saved_indices = [], []
+ if numIterations <= max_saves:
+ save_indices = list(range(numIterations))
+ else:
+ save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+ if save_indices[-1] != numIterations - 1:
+ save_indices.append(numIterations - 1)
+
+ description = f"AOT-BioMaps -- ML-EM (SELL-c-σ-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+ for it in iterator:
+ # projection
+ proj(q_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, slice_ptr_gpu, slice_len_gpu,
+ theta_gpu, np.int32(TN), slice_height,
+ block=(block_size,1,1), grid=grid_rows, stream=stream)
+
+ # ratio
+ ratio(e_gpu, y_gpu, q_gpu, np.float32(denominator_threshold), np.int32(TN),
+ block=(block_size,1,1), grid=grid_rows, stream=stream)
+
+ # zero c
+ drv.memset_d32_async(c_gpu, 0, ZX, stream)
+
+ # backprojection accumulate
+ backproj(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, slice_ptr_gpu, slice_len_gpu,
+ e_gpu, c_gpu, np.int32(TN), slice_height,
+ block=(block_size,1,1), grid=grid_rows, stream=stream)
+
+ # update
+ update(theta_gpu, c_gpu, SMatrix.norm_factor_inv_gpu, np.int32(ZX),
+ block=(block_size,1,1), grid=grid_cols, stream=stream)
+
+ stream.synchronize()
+ if isSavingEachIteration and it in save_indices:
+ out = np.empty(ZX, dtype=np.float32)
+ drv.memcpy_dtoh(out, theta_gpu)
+ saved_theta.append(out.reshape((SMatrix.Z, SMatrix.X)))
+ saved_indices.append(it)
+
+ # final copy
+ res = np.empty(ZX, dtype=np.float32)
+ drv.memcpy_dtoh(res, theta_gpu)
+
+ # free temporaries
+ y_gpu.free(); q_gpu.free(); e_gpu.free(); c_gpu.free(); theta_gpu.free()
+
+ final_result = res.reshape((SMatrix.Z, SMatrix.X))
+ return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
+ except Exception as e:
+ print(f"Error in MLEM_sparseSELL_pycuda: {type(e).__name__}: {e}")
+ gc.collect()
+ return None, None
+
+ finally:
+ # --- CONTEXT FIX: Pop the context ---
+ if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+ SMatrix.ctx.pop()
+ # ------------------------------------
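The CUDA kernels referenced above ship pre-compiled in AOT_biomaps_kernels.cubin and are not part of this diff. The following NumPy/SciPy sketch only illustrates what one MLEM pass over a CSR system matrix (TN rows by ZX columns) is assumed to compute, mirroring the projection, ratio, backprojection, and update calls in MLEM_sparseCSR_pycuda; the meaning of norm_factor_inv_gpu is an assumption.

    import numpy as np
    from scipy.sparse import csr_matrix

    TN, ZX = 6, 4
    rng = np.random.default_rng(0)
    dense = rng.random((TN, ZX)).astype(np.float32)
    dense[dense < 0.5] = 0.0                          # make the toy matrix sparse
    A = csr_matrix(dense)
    y = rng.random(TN).astype(np.float32)
    theta = np.full(ZX, 0.1, dtype=np.float32)        # same 0.1 initialisation as above
    eps = np.finfo(np.float32).eps
    norm_factor_inv = 1.0 / (A.sum(axis=0).A1 + eps)  # assumed meaning of norm_factor_inv_gpu

    q = A @ theta                                     # projection: q = A * theta
    e = np.where(q >= 1e-6, y / (q + eps), 1.0)       # ratio with denominator threshold
    c = A.T @ e                                       # backprojection: c = A^T * e
    theta = theta * norm_factor_inv * c               # multiplicative MLEM update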
@@ -27,7 +27,7 @@ def CP_TV(
  Works on both CPU and GPU.
  Args:
  SMatrix: System matrix (shape: T, Z, X, N)
- y: Measurement data (shape: T, X, N)
+ y: Measurement data (shape: T, N)
  alpha: Regularization parameter for TV
  theta: Relaxation parameter (1.0 for standard Chambolle-Pock)
  numIterations: Number of iterations
@@ -9,26 +9,24 @@ def _Omega_RELATIVE_DIFFERENCE_CPU(theta_flat, index, values, gamma):
  theta_k = theta_flat[k_idx]
  diff = theta_k - theta_j
  abs_diff = np.abs(diff)
-
  denom = theta_k + theta_j + gamma * abs_diff + 1e-8
  num = diff ** 2
-
+ psi_pair = num / denom
+ psi_pair = values * psi_pair
  # First derivative ∂U/∂θ_j
  dpsi = (2 * diff * denom - num * (1 + gamma * np.sign(diff))) / (denom ** 2)
  grad_pair = values * (-dpsi) # Note the negative sign: U contains ψ(θ_k, θ_j), seeking ∂/∂θ_j
-
  # Second derivative ∂²U/∂θ_j² (numerically stable, approximate treatment)
  d2psi = (2 * denom ** 2 - 4 * diff * denom * (1 + gamma * np.sign(diff))
  + 2 * num * (1 + gamma * np.sign(diff)) ** 2) / (denom ** 3 + 1e-8)
  hess_pair = values * d2psi
-
  grad_U = np.zeros_like(theta_flat)
  hess_U = np.zeros_like(theta_flat)
-
  np.add.at(grad_U, j_idx, grad_pair)
  np.add.at(hess_U, j_idx, hess_pair)
-
- return grad_U, hess_U
+ # Compute U_value
+ U_value = 0.5 * np.sum(psi_pair)
+ return grad_U, hess_U, U_value

  def _Omega_RELATIVE_DIFFERENCE_GPU(theta_flat, index, values, device, gamma):
  j_idx, k_idx = index
@@ -38,26 +36,24 @@ def _Omega_RELATIVE_DIFFERENCE_GPU(theta_flat, index, values, device, gamma):
  abs_diff = torch.abs(diff)
  denom = theta_k + theta_j + gamma * abs_diff + 1e-8
  num = diff ** 2
-
+ psi_pair = num / denom
+ psi_pair = values * psi_pair
  # Compute gradient contributions
  dpsi = (2 * diff * denom - num * (1 + gamma * torch.sign(diff))) / (denom ** 2)
  grad_pair = values * (-dpsi)
-
  # Compute Hessian contributions
  d2psi = (2 * denom ** 2 - 4 * diff * denom * (1 + gamma * torch.sign(diff))
  + 2 * num * (1 + gamma * torch.sign(diff)) ** 2) / (denom ** 3 + 1e-8)
  hess_pair = values * d2psi
-
  # Initialize gradient and Hessian on the correct device
  grad_U = torch.zeros_like(theta_flat, device=device)
  hess_U = torch.zeros_like(theta_flat, device=device)
-
  # Accumulate gradient contributions
  grad_U.index_add_(0, j_idx, grad_pair)
  grad_U.index_add_(0, k_idx, -grad_pair)
-
  # Accumulate Hessian contributions
  hess_U.index_add_(0, j_idx, hess_pair)
  hess_U.index_add_(0, k_idx, hess_pair)
-
- return grad_U, hess_U
+ # Compute U_value
+ U_value = 0.5 * psi_pair.sum()
+ return grad_U, hess_U, U_value
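For context on the new third return value: alongside the gradient and Hessian terms, both helpers now also return the relative-difference penalty value itself, U = 0.5 * Σ w_jk (θ_k − θ_j)² / (θ_k + θ_j + γ|θ_k − θ_j| + 1e-8). A toy NumPy check of the summed quantity (the pair index arrays and weights normally come from the Bayesian reconstruction caller, which this diff does not show):

    import numpy as np

    gamma = 2.0
    theta_j = np.array([1.0, 0.5], dtype=np.float32)   # neighbour pairs (toy values)
    theta_k = np.array([1.2, 0.4], dtype=np.float32)
    values  = np.array([1.0, 1.0], dtype=np.float32)   # pair weights w_jk

    diff = theta_k - theta_j
    psi_pair = values * diff**2 / (theta_k + theta_j + gamma * np.abs(diff) + 1e-8)
    U_value = 0.5 * np.sum(psi_pair)                    # matches the U_value returned above
    print(U_value)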