AOT-biomaps 2.9.176-py3-none-any.whl → 2.9.300-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of AOT-biomaps might be problematic.
- AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
- AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +11 -6
- AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
- AOT_biomaps/AOT_Experiment/_mainExperiment.py +95 -55
- AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +48 -13
- AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +406 -13
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +118 -38
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +390 -102
- AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +443 -12
- AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +274 -0
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +331 -0
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
- AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
- AOT_biomaps/AOT_Recon/AlgebraicRecon.py +259 -153
- AOT_biomaps/AOT_Recon/AnalyticRecon.py +27 -42
- AOT_biomaps/AOT_Recon/BayesianRecon.py +84 -151
- AOT_biomaps/AOT_Recon/DeepLearningRecon.py +1 -1
- AOT_biomaps/AOT_Recon/PrimalDualRecon.py +162 -102
- AOT_biomaps/AOT_Recon/ReconEnums.py +27 -2
- AOT_biomaps/AOT_Recon/ReconTools.py +229 -12
- AOT_biomaps/AOT_Recon/__init__.py +1 -0
- AOT_biomaps/AOT_Recon/_mainRecon.py +72 -58
- AOT_biomaps/__init__.py +4 -53
- {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.300.dist-info}/METADATA +2 -1
- aot_biomaps-2.9.300.dist-info/RECORD +47 -0
- aot_biomaps-2.9.176.dist-info/RECORD +0 -43
- {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.300.dist-info}/WHEEL +0 -0
- {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.300.dist-info}/top_level.txt +0 -0
AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py

@@ -1,10 +1,18 @@
 from AOT_biomaps.AOT_Recon.ReconTools import _forward_projection, _backward_projection, check_gpu_memory, calculate_memory_requirement
 from AOT_biomaps.Config import config
+from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix.SparseSMatrix_SELL import SparseSMatrix_SELL
+from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix.SparseSMatrix_CSR import SparseSMatrix_CSR
+from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType
 import numba
 import torch
 import numpy as np
 import os
 from tqdm import trange
+import cupy as cp
+import cupyx.scipy.sparse as cpsparse
+import gc
+import pycuda.driver as drv
+

 def MLEM(
     SMatrix,
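The import hunk above makes cupy, cupyx.scipy.sparse, gc and pycuda.driver unconditional top-level imports, so importing this module now requires the full GPU stack to be installed. A minimal sketch of how a caller might probe for that stack before touching the sparse paths; the helper name gpu_stack_available is ours, not the package's:

    # Sketch: probe the new GPU dependencies before importing the module.
    # Hypothetical helper; the package itself imports these unconditionally.
    def gpu_stack_available() -> bool:
        try:
            import cupy            # noqa: F401
            import pycuda.driver   # noqa: F401
            import torch
            return torch.cuda.is_available()
        except ImportError:
            return False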
@@ -13,9 +21,11 @@ def MLEM(
     isSavingEachIteration=True,
     withTumor=True,
     device=None,
-    use_multi_gpu=False,
     use_numba=False,
+    denominator_threshold=1e-6,
     max_saves=5000,
+    show_logs=True,
+    smatrixType=SMatrixType.SELL,
 ):
     """
     Unified MLEM algorithm for Acousto-Optic Tomography.
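The hunk above drops use_multi_gpu and adds denominator_threshold, show_logs and smatrixType to the unified entry point. A minimal usage sketch of the new signature; the array shapes and values are assumptions for illustration, not package defaults:

    import numpy as np
    from AOT_biomaps.AOT_Recon.AOT_Optimizers.MLEM import MLEM
    from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType

    # Hypothetical toy dimensions: SMatrix is (T, Z, X, N); y holds T*N measurements.
    T, Z, X, N = 100, 32, 32, 16
    S = np.random.rand(T, Z, X, N).astype(np.float32)
    y = np.random.rand(T, N).astype(np.float32)

    thetas, indices = MLEM(
        S, y,
        numIterations=50,
        isSavingEachIteration=True,
        withTumor=True,
        device=None,                    # auto-selects GPU/CPU
        use_numba=False,
        denominator_threshold=1e-6,     # new in this release
        max_saves=5000,
        show_logs=True,
        smatrixType=SMatrixType.DENSE,  # CSR/SELL expect SparseSMatrix_* objects
    )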
@@ -33,34 +43,38 @@ def MLEM(
     Returns:
         Reconstructed image(s) and iteration indices (if isSavingEachIteration)
     """
-    try:
-
-
-
-
-
-
-        else:
-            device = torch.device("cpu")
-            use_gpu = False
+    # try:
+    tumor_str = "WITH" if withTumor else "WITHOUT"
+    # Auto-select device and method
+    if device is None:
+        if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+            device = torch.device(f"cuda:{config.select_best_gpu()}")
+            use_gpu = True
         else:
-
-
-
-
-
+            device = torch.device("cpu")
+            use_gpu = False
+    else:
+        use_gpu = device.type == "cuda"
+    # Dispatch to the appropriate implementation
+    if use_gpu:
+        if smatrixType == SMatrixType.CSR:
+            return MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
+        elif smatrixType == SMatrixType.SELL:
+            return MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
+        elif smatrixType == SMatrixType.DENSE:
+            return _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
         else:
-
+            raise ValueError("Unsupported SMatrixType for GPU MLEM.")
+    else:
+        if use_numba:
+            return _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
         else:
-
-
-
-
-    except Exception as e:
-        print(f"Error in MLEM: {type(e).__name__}: {e}")
-        return None, None
+            return _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
+    # except Exception as e:
+    #     print(f"Error in MLEM: {type(e).__name__}: {e}")
+    #     return None, None

-def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves=
+def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
     try:
         eps = torch.finfo(torch.float32).eps
         T, Z, X, N = SMatrix.shape
@@ -82,7 +96,6 @@ def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str
             .reshape(-1)
         )
         description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
-
         # Calculate save indices
         if numIterations <= max_saves:
             save_indices = list(range(numIterations))
@@ -91,20 +104,21 @@ def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str
             save_indices = list(range(0, numIterations, step))
             if save_indices[-1] != numIterations - 1:
                 save_indices.append(numIterations - 1)
-
         saved_theta = []
         saved_indices = []
-
         with torch.no_grad():
-
+            # Use range when show_logs=False, otherwise trange
+            iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+            for it in iterator:
                 q_flat = A_flat @ theta_flat
-
+                # Apply the threshold: if q_flat < denominator_threshold, set e_flat to 1 (as in the C++ code)
+                mask = q_flat >= denominator_threshold
+                e_flat = torch.where(mask, y_flat / (q_flat + eps), torch.ones_like(q_flat))
                 c_flat = A_flat.T @ e_flat
                 theta_flat = (theta_flat / (norm_factor_flat + eps)) * c_flat
                 if isSavingEachIteration and it in save_indices:
                     saved_theta.append(theta_flat.reshape(Z, X).clone())
                     saved_indices.append(it)
-
         # Free memory
         del A_flat, y_flat, norm_factor_flat
         torch.cuda.empty_cache()
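The thresholded ratio in the loop above replaces a plain y / (q + eps) update: wherever the forward projection falls below denominator_threshold, the multiplicative correction is pinned to 1 instead of blowing up. A self-contained toy illustration (values made up):

    import torch

    eps = torch.finfo(torch.float32).eps
    denominator_threshold = 1e-6

    q_flat = torch.tensor([0.5, 2.0, 1e-9])  # forward projection A @ theta
    y_flat = torch.tensor([1.0, 1.0, 1.0])   # measurements

    mask = q_flat >= denominator_threshold
    e_flat = torch.where(mask, y_flat / (q_flat + eps), torch.ones_like(q_flat))
    print(e_flat)  # tensor([2.0000, 0.5000, 1.0000]) -- ratio pinned to 1 where q is tiny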
@@ -117,74 +131,15 @@ def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str
         torch.cuda.empty_cache()
         return None, None

-def
-    try:
-        num_gpus = torch.cuda.device_count()
-        device = torch.device('cuda:0')
-        T, Z, X, N = SMatrix.shape
-        A_matrix_torch = torch.tensor(SMatrix, dtype=torch.float32).to(device).permute(0, 3, 1, 2).reshape(T * N, Z * X)
-        y_torch = torch.tensor(y, dtype=torch.float32).to(device).reshape(-1)
-        A_split = torch.chunk(A_matrix_torch, num_gpus, dim=0)
-        y_split = torch.chunk(y_torch, num_gpus)
-        theta_0 = torch.ones((Z, X), dtype=torch.float32, device=device)
-        theta_list = [theta_0.clone().to(device) for _ in range(num_gpus)]
-        normalization_factor = A_matrix_torch.sum(dim=0).reshape(Z, X).to(device)
-
-        # Calculate save indices
-        if numIterations <= max_saves:
-            save_indices = list(range(numIterations))
-        else:
-            step = numIterations // max_saves
-            save_indices = list(range(0, numIterations, step))
-            if save_indices[-1] != numIterations - 1:
-                save_indices.append(numIterations - 1)
-
-        saved_theta = [theta_0.cpu().numpy()]
-        saved_indices = [0]
-        description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- processing on multi-GPU ({num_gpus} GPUs) ----"
-
-        for it in trange(numIterations, desc=description):
-            theta_p_list = []
-            for i in range(num_gpus):
-                with torch.cuda.device(f'cuda:{i}'):
-                    theta_p = theta_list[i].to(f'cuda:{i}')
-                    A_i = A_split[i].to(f'cuda:{i}')
-                    y_i = y_split[i].to(f'cuda:{i}')
-                    q_flat = A_i @ theta_p.reshape(-1)
-                    e_flat = y_i / (q_flat + torch.finfo(torch.float32).tiny)
-                    c_flat = A_i.T @ e_flat
-                    theta_p_plus_1_flat = (theta_p.reshape(-1) / (normalization_factor.to(f'cuda:{i}').reshape(-1) + torch.finfo(torch.float32).tiny)) * c_flat
-                    theta_p_plus_1 = theta_p_plus_1_flat.reshape(Z, X)
-                    theta_p_list.append(theta_p_plus_1)
-            for i in range(num_gpus):
-                theta_list[i] = theta_p_list[i].to('cuda:0')
-            if isSavingEachIteration and it in save_indices:
-                saved_theta.append(torch.stack(theta_p_list).mean(dim=0).cpu().numpy())
-                saved_indices.append(it + 1)
-
-        del A_matrix_torch, y_torch, A_split, y_split, theta_0, normalization_factor
-        for i in range(num_gpus):
-            torch.cuda.empty_cache()
-        if not isSavingEachIteration:
-            return torch.stack(theta_p_list).mean(dim=0).cpu().numpy(), None
-        else:
-            return saved_theta, saved_indices
-    except Exception as e:
-        print(f"Error in multi-GPU MLEM: {type(e).__name__}: {e}")
-        del A_matrix_torch, y_torch, A_split, y_split, theta_0, normalization_factor
-        for i in range(num_gpus):
-            torch.cuda.empty_cache()
-        return None, None
-
-def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves=5000):
+def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs=True):
     try:
         numba.set_num_threads(os.cpu_count())
-        q_p = np.zeros((SMatrix.shape[0], SMatrix.shape[3]))
-        c_p = np.zeros((SMatrix.shape[1], SMatrix.shape[2]))
-        theta_p_0 = np.ones((SMatrix.shape[1], SMatrix.shape[2]))
+        q_p = np.zeros((SMatrix.shape[0], SMatrix.shape[3]), dtype=np.float32)
+        c_p = np.zeros((SMatrix.shape[1], SMatrix.shape[2]), dtype=np.float32)
+        theta_p_0 = np.ones((SMatrix.shape[1], SMatrix.shape[2]), dtype=np.float32)
         matrix_theta = [theta_p_0]
         saved_indices = [0]
-        normalization_factor = np.sum(SMatrix, axis=(0, 3))
+        normalization_factor = np.sum(SMatrix, axis=(0, 3)).astype(np.float32)

         # Calculate save indices
         if numIterations <= max_saves:
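The Numba path now allocates its work buffers in float32 rather than NumPy's default float64, which halves the memory footprint of every buffer derived from SMatrix. A quick back-of-the-envelope check, with made-up dimensions (the real ones come from SMatrix.shape):

    import numpy as np

    # Hypothetical sizes for (T, Z, X, N); illustration only.
    T, Z, X, N = 1000, 64, 64, 100
    elements = T * Z * X * N
    print(elements * np.dtype(np.float64).itemsize / 1e9, "GB as float64")
    print(elements * np.dtype(np.float32).itemsize / 1e9, "GB as float32")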
@@ -196,14 +151,20 @@ def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
                 save_indices.append(numIterations - 1)

         description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- processing on multithread CPU ({numba.config.NUMBA_DEFAULT_NUM_THREADS} threads) ----"
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)

-        for it in
+        for it in iterator:
             theta_p = matrix_theta[-1]
             _forward_projection(SMatrix, theta_p, q_p)
-
+
+            # Apply the threshold: if q_p < denominator_threshold, set e_p to 1
+            mask = q_p >= denominator_threshold
+            e_p = np.where(mask, y / (q_p + 1e-8), 1.0)
+
             _backward_projection(SMatrix, e_p, c_p)
             theta_p_plus_1 = theta_p / (normalization_factor + 1e-8) * c_p
-
+
+            if isSavingEachIteration and (it + 1) in save_indices:
                 matrix_theta.append(theta_p_plus_1)
                 saved_indices.append(it + 1)
             else:
@@ -217,7 +178,7 @@ def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
         print(f"Error in Numba CPU MLEM: {type(e).__name__}: {e}")
         return None, None

-def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves=
+def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs=True):
     try:
         T, Z, X, N = SMatrix.shape
         A_flat = SMatrix.astype(np.float32).transpose(0, 3, 1, 2).reshape(T * N, Z * X)
@@ -238,16 +199,22 @@ def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
                 save_indices.append(numIterations - 1)

         description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- processing on single CPU (optimized) ----"
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)

-        for it in
+        for it in iterator:
             theta_p = matrix_theta[-1]
             theta_p_flat = theta_p.reshape(-1)
             q_flat = A_flat @ theta_p_flat
-
+
+            # Apply the threshold: if q_flat < denominator_threshold, set e_flat to 1
+            mask = q_flat >= denominator_threshold
+            e_flat = np.where(mask, y_flat / (q_flat + np.finfo(np.float32).tiny), 1.0)
+
             c_flat = A_flat.T @ e_flat
             theta_p_plus_1_flat = theta_p_flat / (normalization_factor_flat + np.finfo(np.float32).tiny) * c_flat
             theta_p_plus_1 = theta_p_plus_1_flat.reshape(Z, X)
-
+
+            if isSavingEachIteration and (it + 1) in save_indices:
                 matrix_theta.append(theta_p_plus_1)
                 saved_indices.append(it + 1)
             else:
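The optimized CPU path is the clearest statement of the update: flatten the system matrix to (T*N, Z*X), then iterate projection, thresholded ratio, backprojection, and normalization. A self-contained toy run of the same loop, on random data for illustration only:

    import numpy as np

    rng = np.random.default_rng(0)
    A_flat = rng.random((40, 9)).astype(np.float32)        # plays the role of (T*N, Z*X)
    theta_true = rng.random(9).astype(np.float32)
    y_flat = A_flat @ theta_true                           # noiseless measurements

    tiny = np.finfo(np.float32).tiny
    norm_flat = A_flat.sum(axis=0)                         # same as SMatrix summed over (T, N)
    theta = np.ones(9, dtype=np.float32)
    for _ in range(500):
        q = A_flat @ theta                                 # projection
        e = np.where(q >= 1e-6, y_flat / (q + tiny), 1.0)  # thresholded ratio
        theta = theta / (norm_flat + tiny) * (A_flat.T @ e)  # backproject + normalize
    print(np.abs(theta - theta_true).max())                # should shrink toward 0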
@@ -260,3 +227,324 @@ def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
     except Exception as e:
         print(f"Error in optimized CPU MLEM: {type(e).__name__}: {e}")
         return None, None
+
+def MLEM_sparseCSR_pycuda(
+    SMatrix,
+    y,
+    numIterations,
+    isSavingEachIteration,
+    tumor_str,
+    max_saves,
+    denominator_threshold,
+    show_logs=True,
+):
+    """
+    Robust MLEM implementation for CSR SMatrix using PyCUDA kernels.
+    Expects SMatrix to be SparseSMatrix_CSR with attributes:
+      - values_gpu, col_ind_gpu, row_ptr_gpu (device pointers)
+      - norm_factor_inv_gpu (device pointer)
+      - sparse_mod (loaded module with kernels)
+      - ctx (PyCUDA context)
+    Returns (saved_theta_list, saved_indices) if isSavingEachIteration else (final_theta, None)
+    """
+    final_result = None
+
+    # Local holders to free in finally
+    y_gpu = q_flat_gpu = e_flat_gpu = c_flat_gpu = theta_flat_gpu = None
+
+    try:
+        if not isinstance(SMatrix, SparseSMatrix_CSR):
+            raise TypeError("SMatrix must be a SparseSMatrix_CSR object")
+
+        # push context (if provided)
+        popped_ctx = False
+        if getattr(SMatrix, "ctx", None):
+            SMatrix.ctx.push()
+            popped_ctx = True
+
+        dtype = np.float32
+        TN = int(SMatrix.N * SMatrix.T)
+        ZX = int(SMatrix.Z * SMatrix.X)
+        Z = int(SMatrix.Z)
+        X = int(SMatrix.X)
+
+        # Make sure required GPU pointers exist
+        if getattr(SMatrix, "values_gpu", None) is None or getattr(SMatrix, "col_ind_gpu", None) is None or getattr(SMatrix, "row_ptr_gpu", None) is None:
+            raise RuntimeError("SMatrix is missing GPU buffers (values_gpu / col_ind_gpu / row_ptr_gpu)")
+
+        if getattr(SMatrix, "norm_factor_inv_gpu", None) is None:
+            raise RuntimeError("SMatrix.norm_factor_inv_gpu not available on GPU")
+
+        # stream for async operations
+        stream = drv.Stream()
+
+        # prepare device buffers
+        y_arr = np.ascontiguousarray(y.T.flatten().astype(np.float32))
+        y_gpu = drv.mem_alloc(y_arr.nbytes)
+        drv.memcpy_htod_async(y_gpu, y_arr, stream)
+
+        theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+        initial_theta = np.full(ZX, 0.1, dtype=dtype)
+        drv.memcpy_htod_async(theta_flat_gpu, initial_theta, stream)
+
+        norm_factor_inv_gpu = SMatrix.norm_factor_inv_gpu
+
+        q_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+        e_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+        c_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+
+        # Ensure kernels exist
+        projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__CSR")
+        backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__CSR")
+        ratio_kernel = SMatrix.sparse_mod.get_function("ratio_kernel")
+        update_kernel = SMatrix.sparse_mod.get_function("update_theta_kernel")
+        block_size = 256
+
+        # prepare save indices once
+        if numIterations <= max_saves:
+            save_indices = list(range(numIterations))
+        else:
+            step = max(1, numIterations // max_saves)
+            save_indices = list(range(0, numIterations, step))
+            if save_indices[-1] != numIterations - 1:
+                save_indices.append(numIterations - 1)
+
+        saved_theta = []
+        saved_indices = []
+
+        description = f"AOT-BioMaps -- ML-EM (CSR-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+        # grid sizes
+        grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
+        grid_cols = ((ZX + block_size - 1) // block_size, 1, 1)
+
+        for it in iterator:
+            # projection: q = A * theta
+            projection_kernel(
+                q_flat_gpu,
+                SMatrix.values_gpu,
+                SMatrix.row_ptr_gpu,
+                SMatrix.col_ind_gpu,
+                theta_flat_gpu,
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )
+
+            # ratio: e = y / max(q, threshold)
+            ratio_kernel(
+                e_flat_gpu,
+                y_gpu,
+                q_flat_gpu,
+                np.float32(denominator_threshold),
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )
+
+            # backprojection: c = A^T * e (zero c first)
+            drv.memset_d32_async(c_flat_gpu, 0, ZX, stream)
+            backprojection_kernel(
+                c_flat_gpu,
+                SMatrix.values_gpu,
+                SMatrix.row_ptr_gpu,
+                SMatrix.col_ind_gpu,
+                e_flat_gpu,
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )
+
+            # update: theta *= norm_factor_inv * c
+            update_kernel(
+                theta_flat_gpu,
+                c_flat_gpu,
+                norm_factor_inv_gpu,
+                np.int32(ZX),
+                block=(block_size, 1, 1),
+                grid=grid_cols,
+                stream=stream,
+            )
+
+            # periodic synchronization for stability / logging
+            if show_logs and (it % 10 == 0 or it == numIterations - 1):
+                stream.synchronize()
+
+            # save snapshot if required
+            if isSavingEachIteration and it in save_indices:
+                # ensure kernels finished
+                stream.synchronize()
+                theta_host = np.empty(ZX, dtype=dtype)
+                drv.memcpy_dtoh(theta_host, theta_flat_gpu)
+                saved_theta.append(theta_host.reshape(Z, X))
+                saved_indices.append(int(it))
+
+        # make sure everything finished
+        stream.synchronize()
+        final_theta_host = np.empty(ZX, dtype=dtype)
+        drv.memcpy_dtoh(final_theta_host, theta_flat_gpu)
+        final_result = final_theta_host.reshape(Z, X)
+
+        # free local allocations (will also be freed in finally if exception)
+        try:
+            y_gpu.free()
+            q_flat_gpu.free()
+            e_flat_gpu.free()
+            c_flat_gpu.free()
+            theta_flat_gpu.free()
+        except Exception:
+            pass
+
+        return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
+    except Exception as e:
+        print(f"Error in MLEM_sparseCSR_pycuda: {type(e).__name__}: {e}")
+        gc.collect()
+        return None, None
+
+    finally:
+        # free buffers if still allocated
+        for buf in ("y_gpu", "q_flat_gpu", "e_flat_gpu", "c_flat_gpu", "theta_flat_gpu"):
+            try:
+                val = locals().get(buf, None)
+                if val is not None:
+                    val.free()
+            except Exception:
+                pass
+        # pop context safely
+        try:
+            if SMatrix and hasattr(SMatrix, "ctx") and SMatrix.ctx and popped_ctx:
+                SMatrix.ctx.pop()
+        except Exception:
+            pass
+
+def MLEM_sparseSELL_pycuda(
+    SMatrix,
+    y,
+    numIterations,
+    isSavingEachIteration,
+    tumor_str,
+    max_saves,
+    denominator_threshold,
+    show_logs=True,
+):
+    """
+    MLEM using SELL-C-σ kernels already present on device.
+    y must be float32 length TN.
+
+    Clean version: diagnostics removed.
+    """
+    final_result = None
+
+    try:
+        if not isinstance(SMatrix, SparseSMatrix_SELL):
+            raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
+        if SMatrix.sell_values_gpu is None:
+            raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")
+
+        # Context
+        if SMatrix.ctx:
+            SMatrix.ctx.push()
+
+        TN = int(SMatrix.N * SMatrix.T)
+        ZX = int(SMatrix.Z * SMatrix.X)
+        dtype = np.float32
+        block_size = 256
+
+        proj = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+        backproj = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+        ratio = SMatrix.sparse_mod.get_function("ratio_kernel")
+        update = SMatrix.sparse_mod.get_function("update_theta_kernel")
+
+        stream = drv.Stream()
+
+        # Device buffers
+        y = y.T.flatten().astype(np.float32)
+        y_gpu = drv.mem_alloc(y.nbytes)
+        drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+        theta_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+        drv.memcpy_htod_async(theta_gpu, np.full(ZX, 0.1, dtype=dtype), stream)
+
+        q_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+        e_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+        c_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+
+        slice_ptr_gpu = SMatrix.slice_ptr_gpu
+        slice_len_gpu = SMatrix.slice_len_gpu
+        slice_height = np.int32(SMatrix.slice_height)
+
+        grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
+        grid_cols = ((ZX + block_size - 1) // block_size, 1, 1)
+
+        # Prepare save indices
+        saved_theta, saved_indices = [], []
+        if numIterations <= max_saves:
+            save_indices = list(range(numIterations))
+        else:
+            save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+            if save_indices[-1] != numIterations - 1:
+                save_indices.append(numIterations - 1)
+
+        description = f"AOT-BioMaps -- ML-EM (SELL-c-σ-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+        # --- MLEM Loop ---
+        for it in iterator:
+
+            proj(q_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu,
+                 slice_ptr_gpu, slice_len_gpu,
+                 theta_gpu, np.int32(TN), slice_height,
+                 block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+            ratio(e_gpu, y_gpu, q_gpu, np.float32(denominator_threshold), np.int32(TN),
+                  block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+            drv.memset_d32_async(c_gpu, 0, ZX, stream)
+
+            backproj(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu,
+                     slice_ptr_gpu, slice_len_gpu,
+                     e_gpu, c_gpu, np.int32(TN), slice_height,
+                     block=(block_size, 1, 1), grid=grid_rows, stream=stream)
+
+            update(theta_gpu, c_gpu, SMatrix.norm_factor_inv_gpu, np.int32(ZX),
+                   block=(block_size, 1, 1), grid=grid_cols, stream=stream)
+
+            if isSavingEachIteration and it in save_indices:
+                out = np.empty(ZX, dtype=np.float32)
+                drv.memcpy_dtoh(out, theta_gpu)
+                saved_theta.append(out.reshape((SMatrix.Z, SMatrix.X)))
+                saved_indices.append(it)
+
+        stream.synchronize()
+        res = np.empty(ZX, dtype=np.float32)
+        drv.memcpy_dtoh(res, theta_gpu)
+
+        # free
+        try:
+            y_gpu.free()
+            q_gpu.free()
+            e_gpu.free()
+            c_gpu.free()
+            theta_gpu.free()
+        except Exception:
+            pass
+
+        final_result = res.reshape((SMatrix.Z, SMatrix.X))
+        return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
+    except Exception as e:
+        print(f"Error in MLEM_sparseSELL_pycuda: {type(e).__name__}: {e}")
+        gc.collect()
+        return None, None

+    finally:
+        if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+            try:
+                SMatrix.ctx.pop()
+            except Exception:
+                pass