AOT-biomaps 2.9.176-py3-none-any.whl → 2.9.279-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of AOT-biomaps might be problematic.
- AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
- AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +11 -6
- AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
- AOT_biomaps/AOT_Experiment/_mainExperiment.py +95 -55
- AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +48 -13
- AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +409 -13
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +118 -38
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +306 -102
- AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +1 -1
- AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +281 -0
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +328 -0
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
- AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
- AOT_biomaps/AOT_Recon/AlgebraicRecon.py +265 -153
- AOT_biomaps/AOT_Recon/AnalyticRecon.py +27 -42
- AOT_biomaps/AOT_Recon/BayesianRecon.py +84 -151
- AOT_biomaps/AOT_Recon/DeepLearningRecon.py +1 -1
- AOT_biomaps/AOT_Recon/PrimalDualRecon.py +69 -62
- AOT_biomaps/AOT_Recon/ReconEnums.py +27 -2
- AOT_biomaps/AOT_Recon/ReconTools.py +152 -12
- AOT_biomaps/AOT_Recon/__init__.py +1 -0
- AOT_biomaps/AOT_Recon/_mainRecon.py +72 -58
- AOT_biomaps/__init__.py +4 -74
- {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.279.dist-info}/METADATA +2 -1
- aot_biomaps-2.9.279.dist-info/RECORD +47 -0
- aot_biomaps-2.9.176.dist-info/RECORD +0 -43
- {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.279.dist-info}/WHEEL +0 -0
- {aot_biomaps-2.9.176.dist-info → aot_biomaps-2.9.279.dist-info}/top_level.txt +0 -0
AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py

@@ -1,10 +1,18 @@
 from AOT_biomaps.AOT_Recon.ReconTools import _forward_projection, _backward_projection, check_gpu_memory, calculate_memory_requirement
 from AOT_biomaps.Config import config
+from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix.SparseSMatrix_SELL import SparseSMatrix_SELL
+from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix.SparseSMatrix_CSR import SparseSMatrix_CSR
+from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType
 import numba
 import torch
 import numpy as np
 import os
 from tqdm import trange
+import cupy as cp
+import cupyx.scipy.sparse as cpsparse
+import gc
+import pycuda.driver as drv
+
 
 def MLEM(
     SMatrix,
@@ -13,9 +21,12 @@ def MLEM(
     isSavingEachIteration=True,
     withTumor=True,
     device=None,
-    use_multi_gpu=False,
     use_numba=False,
+    denominator_threshold=1e-6,
     max_saves=5000,
+    show_logs=True,
+    smatrixType=SMatrixType.SELL,
+    Z=350,
 ):
     """
     Unified MLEM algorithm for Acousto-Optic Tomography.
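Review note on the signature change above: use_multi_gpu is gone, and denominator_threshold, show_logs, smatrixType and Z are new. The call below is a minimal sketch of how the reworked entry point might be exercised; the toy shapes and the y/numIterations positional names are inferred from the dispatch calls later in this diff rather than from package documentation, and SMatrixType.CSR/SELL would instead expect the new sparse SMatrix objects.

    import numpy as np
    from AOT_biomaps.AOT_Recon.AOT_Optimizers.MLEM import MLEM
    from AOT_biomaps.AOT_Recon.ReconEnums import SMatrixType

    # Hypothetical toy problem: system matrix (T, Z, X, N) and measurements (T, N).
    T, Z, X, N = 100, 64, 64, 16
    SMatrix = np.random.rand(T, Z, X, N).astype(np.float32)
    y = np.random.rand(T, N).astype(np.float32)

    imgs, iters = MLEM(
        SMatrix, y,
        numIterations=200,
        isSavingEachIteration=True,
        withTumor=True,
        device=None,                    # auto-select GPU/CPU
        use_numba=False,
        denominator_threshold=1e-6,     # new: guards the ML-EM ratio denominator
        max_saves=5000,
        show_logs=True,                 # new: silences progress output when False
        smatrixType=SMatrixType.DENSE,  # new: DENSE here; CSR/SELL expect sparse SMatrix objects
        Z=Z,                            # new: forwarded to the CSR path only
    )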
@@ -33,34 +44,38 @@ def MLEM(
     Returns:
         Reconstructed image(s) and iteration indices (if isSavingEachIteration)
     """
-    try:
-    [... old lines 37-42 not captured in the extracted diff]
-        else:
-            device = torch.device("cpu")
-            use_gpu = False
+    # try:
+    tumor_str = "WITH" if withTumor else "WITHOUT"
+    # Auto-select device and method
+    if device is None:
+        if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+            device = torch.device(f"cuda:{config.select_best_gpu()}")
+            use_gpu = True
         else:
-    [... old lines 47-51 not captured in the extracted diff]
+            device = torch.device("cpu")
+            use_gpu = False
+    else:
+        use_gpu = device.type == "cuda"
+    # Dispatch to the appropriate implementation
+    if use_gpu:
+        if smatrixType == SMatrixType.CSR:
+            return MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, Z, show_logs)
+        elif smatrixType == SMatrixType.SELL:
+            return MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+        elif smatrixType == SMatrixType.DENSE:
+            return _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold,show_logs)
         else:
-    [... old line 53 not captured in the extracted diff]
+            raise ValueError("Unsupported SMatrixType for GPU MLEM.")
+    else:
+        if use_numba:
+            return _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
         else:
-    [... old lines 55-58 not captured in the extracted diff]
-    except Exception as e:
-        print(f"Error in MLEM: {type(e).__name__}: {e}")
-        return None, None
+            return _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
+    # except Exception as e:
+    #     print(f"Error in MLEM: {type(e).__name__}: {e}")
+    #     return None, None
 
-def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves= [...]
+def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
     try:
         eps = torch.finfo(torch.float32).eps
         T, Z, X, N = SMatrix.shape
@@ -82,7 +97,6 @@ def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str
             .reshape(-1)
         )
         description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
-
         # Calculate save indices
         if numIterations <= max_saves:
             save_indices = list(range(numIterations))
@@ -91,20 +105,21 @@ def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str
             save_indices = list(range(0, numIterations, step))
             if save_indices[-1] != numIterations - 1:
                 save_indices.append(numIterations - 1)
-
         saved_theta = []
         saved_indices = []
-
         with torch.no_grad():
-            [... old line 99 not captured in the extracted diff]
+            # Use range if show_logs=False, otherwise trange
+            iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+            for it in iterator:
                 q_flat = A_flat @ theta_flat
-                [... old line 101 not captured in the extracted diff]
+                # Apply the threshold: if q_flat < denominator_threshold, set e_flat to 1 (as in the C++ code)
+                mask = q_flat >= denominator_threshold
+                e_flat = torch.where(mask, y_flat / (q_flat + eps), torch.ones_like(q_flat))
                 c_flat = A_flat.T @ e_flat
                 theta_flat = (theta_flat / (norm_factor_flat + eps)) * c_flat
                 if isSavingEachIteration and it in save_indices:
                     saved_theta.append(theta_flat.reshape(Z, X).clone())
                     saved_indices.append(it)
-
         # Free memory
         del A_flat, y_flat, norm_factor_flat
         torch.cuda.empty_cache()
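The loop above is the multiplicative ML-EM update with the new guarded ratio: where the forward projection q falls below denominator_threshold the ratio is forced to 1 (mirroring the referenced C++ code) instead of dividing by a near-zero value. A minimal NumPy sketch of one such iteration, written only to illustrate the update this hunk implements (names are illustrative, not the package API):

    import numpy as np

    def mlem_step(A_flat, y_flat, theta_flat, norm_factor_flat,
                  denominator_threshold=1e-6, eps=np.finfo(np.float32).eps):
        # One thresholded ML-EM update on flattened arrays (illustrative sketch).
        q = A_flat @ theta_flat                           # forward projection
        e = np.where(q >= denominator_threshold,          # guard tiny denominators
                     y_flat / (q + eps), 1.0)
        c = A_flat.T @ e                                  # backprojection of the ratio
        return theta_flat / (norm_factor_flat + eps) * c  # multiplicative update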
@@ -117,74 +132,15 @@ def _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str
         torch.cuda.empty_cache()
         return None, None
 
-def [... removed multi-GPU implementation; signature not captured in the extracted diff]
-    try:
-        num_gpus = torch.cuda.device_count()
-        device = torch.device('cuda:0')
-        T, Z, X, N = SMatrix.shape
-        A_matrix_torch = torch.tensor(SMatrix, dtype=torch.float32).to(device).permute(0, 3, 1, 2).reshape(T * N, Z * X)
-        y_torch = torch.tensor(y, dtype=torch.float32).to(device).reshape(-1)
-        A_split = torch.chunk(A_matrix_torch, num_gpus, dim=0)
-        y_split = torch.chunk(y_torch, num_gpus)
-        theta_0 = torch.ones((Z, X), dtype=torch.float32, device=device)
-        theta_list = [theta_0.clone().to(device) for _ in range(num_gpus)]
-        normalization_factor = A_matrix_torch.sum(dim=0).reshape(Z, X).to(device)
-
-        # Calculate save indices
-        if numIterations <= max_saves:
-            save_indices = list(range(numIterations))
-        else:
-            step = numIterations // max_saves
-            save_indices = list(range(0, numIterations, step))
-            if save_indices[-1] != numIterations - 1:
-                save_indices.append(numIterations - 1)
-
-        saved_theta = [theta_0.cpu().numpy()]
-        saved_indices = [0]
-        description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- processing on multi-GPU ({num_gpus} GPUs) ----"
-
-        for it in trange(numIterations, desc=description):
-            theta_p_list = []
-            for i in range(num_gpus):
-                with torch.cuda.device(f'cuda:{i}'):
-                    theta_p = theta_list[i].to(f'cuda:{i}')
-                    A_i = A_split[i].to(f'cuda:{i}')
-                    y_i = y_split[i].to(f'cuda:{i}')
-                    q_flat = A_i @ theta_p.reshape(-1)
-                    e_flat = y_i / (q_flat + torch.finfo(torch.float32).tiny)
-                    c_flat = A_i.T @ e_flat
-                    theta_p_plus_1_flat = (theta_p.reshape(-1) / (normalization_factor.to(f'cuda:{i}').reshape(-1) + torch.finfo(torch.float32).tiny)) * c_flat
-                    theta_p_plus_1 = theta_p_plus_1_flat.reshape(Z, X)
-                    theta_p_list.append(theta_p_plus_1)
-            for i in range(num_gpus):
-                theta_list[i] = theta_p_list[i].to('cuda:0')
-            if isSavingEachIteration and it in save_indices:
-                saved_theta.append(torch.stack(theta_p_list).mean(dim=0).cpu().numpy())
-                saved_indices.append(it + 1)
-
-        del A_matrix_torch, y_torch, A_split, y_split, theta_0, normalization_factor
-        for i in range(num_gpus):
-            torch.cuda.empty_cache()
-        if not isSavingEachIteration:
-            return torch.stack(theta_p_list).mean(dim=0).cpu().numpy(), None
-        else:
-            return saved_theta, saved_indices
-    except Exception as e:
-        print(f"Error in multi-GPU MLEM: {type(e).__name__}: {e}")
-        del A_matrix_torch, y_torch, A_split, y_split, theta_0, normalization_factor
-        for i in range(num_gpus):
-            torch.cuda.empty_cache()
-        return None, None
-
-def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves=5000):
+def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs=True):
     try:
         numba.set_num_threads(os.cpu_count())
-        q_p = np.zeros((SMatrix.shape[0], SMatrix.shape[3]))
-        c_p = np.zeros((SMatrix.shape[1], SMatrix.shape[2]))
-        theta_p_0 = np.ones((SMatrix.shape[1], SMatrix.shape[2]))
+        q_p = np.zeros((SMatrix.shape[0], SMatrix.shape[3]), dtype=np.float32)
+        c_p = np.zeros((SMatrix.shape[1], SMatrix.shape[2]), dtype=np.float32)
+        theta_p_0 = np.ones((SMatrix.shape[1], SMatrix.shape[2]), dtype=np.float32)
         matrix_theta = [theta_p_0]
         saved_indices = [0]
-        normalization_factor = np.sum(SMatrix, axis=(0, 3))
+        normalization_factor = np.sum(SMatrix, axis=(0, 3)).astype(np.float32)
 
         # Calculate save indices
         if numIterations <= max_saves:
@@ -196,14 +152,20 @@ def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
                 save_indices.append(numIterations - 1)
 
         description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- processing on multithread CPU ({numba.config.NUMBA_DEFAULT_NUM_THREADS} threads) ----"
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
 
-        for it in [...]
+        for it in iterator:
             theta_p = matrix_theta[-1]
             _forward_projection(SMatrix, theta_p, q_p)
-            [... old line 203 not captured in the extracted diff]
+
+            # Apply the threshold: if q_p < denominator_threshold, set e_p to 1
+            mask = q_p >= denominator_threshold
+            e_p = np.where(mask, y / (q_p + 1e-8), 1.0)
+
             _backward_projection(SMatrix, e_p, c_p)
             theta_p_plus_1 = theta_p / (normalization_factor + 1e-8) * c_p
-            [... old line 206 not captured in the extracted diff]
+
+            if isSavingEachIteration and (it + 1) in save_indices:
                 matrix_theta.append(theta_p_plus_1)
                 saved_indices.append(it + 1)
             else:
@@ -217,7 +179,7 @@ def _MLEM_CPU_numba(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
         print(f"Error in Numba CPU MLEM: {type(e).__name__}: {e}")
         return None, None
 
-def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves= [...]
+def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs=True):
     try:
         T, Z, X, N = SMatrix.shape
         A_flat = SMatrix.astype(np.float32).transpose(0, 3, 1, 2).reshape(T * N, Z * X)
@@ -238,16 +200,22 @@ def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
                 save_indices.append(numIterations - 1)
 
         description = f"AOT-BioMaps -- ML-EM ---- {tumor_str} TUMOR ---- processing on single CPU (optimized) ----"
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
 
-        for it in [...]
+        for it in iterator:
             theta_p = matrix_theta[-1]
             theta_p_flat = theta_p.reshape(-1)
             q_flat = A_flat @ theta_p_flat
-            [... old line 246 not captured in the extracted diff]
+
+            # Apply the threshold: if q_flat < denominator_threshold, set e_flat to 1
+            mask = q_flat >= denominator_threshold
+            e_flat = np.where(mask, y_flat / (q_flat + np.finfo(np.float32).tiny), 1.0)
+
             c_flat = A_flat.T @ e_flat
             theta_p_plus_1_flat = theta_p_flat / (normalization_factor_flat + np.finfo(np.float32).tiny) * c_flat
             theta_p_plus_1 = theta_p_plus_1_flat.reshape(Z, X)
-            [... old line 250 not captured in the extracted diff]
+
+            if isSavingEachIteration and (it + 1) in save_indices:
                 matrix_theta.append(theta_p_plus_1)
                 saved_indices.append(it + 1)
             else:
@@ -260,3 +228,239 @@ def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
     except Exception as e:
         print(f"Error in optimized CPU MLEM: {type(e).__name__}: {e}")
         return None, None
+
+def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+    """
+    SMatrix: instance of SparseMatrixGPU (already allocated)
+    y: measured data (1D np.float32 of length TN)
+
+    Assumptions:
+    - SMatrix.values_gpu and SMatrix.col_ind_gpu and SMatrix.row_ptr_gpu are device pointers
+    - SMatrix.norm_factor_inv_gpu exists
+    - SMatrix.ctx is the PyCUDA context for the target GPU.
+    """
+
+    # We use a final_result placeholder to ensure it's defined outside the try block
+    final_result = None
+
+    try:
+        if not isinstance(SMatrix, SparseSMatrix_CSR):
+            raise TypeError("SMatrix must be a SparseSMatrix_CSR object")
+
+        # --- CONTEXT FIX: Push the context associated with SMatrix ---
+        # This ensures all subsequent PyCUDA operations use the correct GPU/context.
+        if SMatrix.ctx:
+            SMatrix.ctx.push()
+        # -----------------------------------------------------------
+
+        dtype = np.float32
+        TN = SMatrix.N * SMatrix.T
+        ZX = SMatrix.Z * SMatrix.X
+        # Ensure Z and X are correctly defined for reshaping
+        Z = SMatrix.Z
+        X = SMatrix.X
+
+        if show_logs:
+            # We assume SMatrix was initialized using the correct device index.
+            print(f"Executing on GPU device index: {SMatrix.device.primary_context.device.name()}")
+            print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")
+
+        # streams
+        stream = drv.Stream()
+
+        # allocate device buffers
+        y = y.T.flatten().astype(np.float32)
+        y_gpu = drv.mem_alloc(y.nbytes)
+        drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+        theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+        initial_theta = np.full(ZX, 0.1, dtype=dtype)
+        drv.memcpy_htod_async(theta_flat_gpu, initial_theta, stream)
+
+        norm_factor_inv_gpu = SMatrix.norm_factor_inv_gpu
+
+        q_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+        e_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+        c_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+
+        # Assuming the cubin file is found globally or managed by the caller
+        mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
+        projection_kernel = mod.get_function('projection_kernel__CSR')
+        backprojection_kernel = mod.get_function('backprojection_kernel__CSR')
+        ratio_kernel = mod.get_function('ratio_kernel')
+        update_kernel = mod.get_function('update_theta_kernel')
+        block_size = 256
+
+        saved_theta, saved_indices = [], []
+        if numIterations <= max_saves:
+            save_indices = list(range(numIterations))
+        else:
+            save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+            if save_indices[-1] != numIterations - 1:
+                save_indices.append(numIterations - 1)
+
+        description = f"AOT-BioMaps -- ML-EM (CSR-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+        for it in iterator:
+            # projection: q = A * theta
+            projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                              theta_flat_gpu, np.int32(TN),
+                              block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1),
+                              stream=stream)
+
+            # ratio: e = y / max(q, threshold)
+            ratio_kernel(e_flat_gpu, y_gpu, q_flat_gpu, np.float32(denominator_threshold), np.int32(TN),
+                         block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+            # backprojection: c = A^T * e
+            drv.memset_d32_async(c_flat_gpu, 0, ZX, stream)
+            backprojection_kernel(c_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
+                                  e_flat_gpu, np.int32(TN),
+                                  block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+
+            # update: theta *= norm_factor_inv * c
+            update_kernel(theta_flat_gpu, c_flat_gpu, norm_factor_inv_gpu, np.int32(ZX),
+                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
+
+            if show_logs and (it % 10 == 0 or it == numIterations - 1):
+                drv.Context.synchronize()
+
+            if isSavingEachIteration and it in save_indices:
+                theta_host = np.empty(ZX, dtype=dtype)
+                drv.memcpy_dtoh(theta_host, theta_flat_gpu)
+                saved_theta.append(theta_host.reshape(Z, X))
+                saved_indices.append(it)
+
+        drv.Context.synchronize()
+
+        final_result = np.empty(ZX, dtype=dtype)
+        drv.memcpy_dtoh(final_result, theta_flat_gpu)
+        final_result = final_result.reshape(Z, X)
+
+        # free local allocations
+        y_gpu.free(); q_flat_gpu.free(); e_flat_gpu.free(); c_flat_gpu.free(); theta_flat_gpu.free()
+
+        return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
+    except Exception as e:
+        print(f"Error in MLEM_sparseCSR_pycuda: {type(e).__name__}: {e}")
+        gc.collect()
+        return None, None
+
+    finally:
+        # --- CONTEXT FIX: Pop the context ---
+        if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+            SMatrix.ctx.pop()
+        # ------------------------------------
+
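For reviewers who prefer not to trace the PyCUDA calls: per iteration, the four kernels loaded from AOT_biomaps_kernels.cubin perform one ML-EM cycle on the CSR-stored system matrix. A host-side SciPy sketch of the same cycle, assuming only the semantics stated in the inline comments above (q = A·theta, e = y / max(q, threshold), c = A^T·e, theta *= norm_factor_inv * c); it is not the CUDA code itself:

    import numpy as np
    import scipy.sparse as sp

    def mlem_csr_cycle_host(A_csr: sp.csr_matrix, y, theta, norm_factor_inv, threshold=1e-6):
        # Host-side equivalent of projection / ratio / backprojection / update.
        q = A_csr @ theta                   # projection_kernel__CSR: q = A * theta
        e = y / np.maximum(q, threshold)    # ratio_kernel: e = y / max(q, threshold)
        c = A_csr.T @ e                     # backprojection_kernel__CSR: c = A^T * e
        return theta * norm_factor_inv * c  # update_theta_kernel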
+def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+    """
+    MLEM using SELL-C-σ kernels already present on device.
+    y must be float32 length TN.
+    """
+    final_result = None
+
+    try:
+        # check if SMatrix is SparseSMatrix_SELL object
+        if not isinstance(SMatrix, SparseSMatrix_SELL):
+            raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
+        if SMatrix.sell_values_gpu is None:
+            raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")
+
+        # --- CONTEXT FIX: Push the context associated with SMatrix ---
+        # This ensures all subsequent PyCUDA operations use the correct GPU/context.
+        if SMatrix.ctx:
+            SMatrix.ctx.push()
+        # -----------------------------------------------------------
+
+        TN = int(SMatrix.N * SMatrix.T)
+        ZX = int(SMatrix.Z * SMatrix.X)
+        dtype = np.float32
+        block_size = 256
+
+        mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
+        proj = mod.get_function("projection_kernel__SELL")
+        backproj = mod.get_function("backprojection_kernel__SELL")
+        ratio = mod.get_function("ratio_kernel")
+        update = mod.get_function("update_theta_kernel")
+
+        stream = drv.Stream()
+
+        # device buffers
+        y = y.T.flatten().astype(np.float32)
+        y_gpu = drv.mem_alloc(y.nbytes)
+        drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+
+        theta_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+        drv.memcpy_htod_async(theta_gpu, np.full(ZX, 0.1, dtype=dtype), stream)
+
+        q_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+        e_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
+        c_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
+
+        slice_ptr_gpu = SMatrix.slice_ptr_gpu
+        slice_len_gpu = SMatrix.slice_len_gpu
+        slice_height = np.int32(SMatrix.slice_height)
+
+        grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
+        grid_cols = ((ZX + block_size - 1) // block_size, 1, 1)
+
+        saved_theta, saved_indices = [], []
+        if numIterations <= max_saves:
+            save_indices = list(range(numIterations))
+        else:
+            save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+            if save_indices[-1] != numIterations - 1:
+                save_indices.append(numIterations - 1)
+
+        description = f"AOT-BioMaps -- ML-EM (SELL-c-σ-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
+        iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+        for it in iterator:
+            # projection
+            proj(q_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, slice_ptr_gpu, slice_len_gpu,
+                 theta_gpu, np.int32(TN), slice_height,
+                 block=(block_size,1,1), grid=grid_rows, stream=stream)
+
+            # ratio
+            ratio(e_gpu, y_gpu, q_gpu, np.float32(denominator_threshold), np.int32(TN),
+                  block=(block_size,1,1), grid=grid_rows, stream=stream)
+
+            # zero c
+            drv.memset_d32_async(c_gpu, 0, ZX, stream)
+
+            # backprojection accumulate
+            backproj(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, slice_ptr_gpu, slice_len_gpu,
+                     e_gpu, c_gpu, np.int32(TN), slice_height,
+                     block=(block_size,1,1), grid=grid_rows, stream=stream)
+
+            # update
+            update(theta_gpu, c_gpu, SMatrix.norm_factor_inv_gpu, np.int32(ZX),
+                   block=(block_size,1,1), grid=grid_cols, stream=stream)
+
+            stream.synchronize()
+            if isSavingEachIteration and it in save_indices:
+                out = np.empty(ZX, dtype=np.float32)
+                drv.memcpy_dtoh(out, theta_gpu)
+                saved_theta.append(out.reshape((SMatrix.Z, SMatrix.X)))
+                saved_indices.append(it)
+
+        # final copy
+        res = np.empty(ZX, dtype=np.float32)
+        drv.memcpy_dtoh(res, theta_gpu)
+
+        # free temporaries
+        y_gpu.free(); q_gpu.free(); e_gpu.free(); c_gpu.free(); theta_gpu.free()
+
+        final_result = res.reshape((SMatrix.Z, SMatrix.X))
+        return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
+
+    except Exception as e:
+        print(f"Error in MLEM_sparseSELL_pycuda: {type(e).__name__}: {e}")
+        gc.collect()
+        return None, None
+
+    finally:
+        # --- CONTEXT FIX: Pop the context ---
+        if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+            SMatrix.ctx.pop()
+        # ------------------------------------
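The SELL routine above consumes buffers prepared by SparseSMatrix_SELL (sell_values_gpu, sell_colinds_gpu, slice_ptr_gpu, slice_len_gpu, slice_height). To clarify what those buffers hold, here is a CPU-side sketch that packs a CSR matrix into a plain SELL-C layout: rows are grouped into slices of slice_height rows, each slice is zero-padded to its longest row, and entries are stored column-major within the slice. The σ row-sorting step is omitted, and this is illustrative only, not the package's allocate_sell_c_sigma_direct():

    import numpy as np
    import scipy.sparse as sp

    def build_sell_c(A_csr: sp.csr_matrix, slice_height=32):
        # Pack a CSR matrix into a zero-padded SELL-C layout (no sigma sorting).
        n_rows = A_csr.shape[0]
        n_slices = (n_rows + slice_height - 1) // slice_height
        row_len = np.diff(A_csr.indptr)                     # nnz per row
        slice_len = np.zeros(n_slices, dtype=np.int32)      # widest row in each slice
        slice_ptr = np.zeros(n_slices + 1, dtype=np.int64)  # offset of each slice in values/colinds
        for s in range(n_slices):
            rows = row_len[s * slice_height:(s + 1) * slice_height]
            slice_len[s] = rows.max() if rows.size else 0
            slice_ptr[s + 1] = slice_ptr[s] + slice_len[s] * slice_height
        values = np.zeros(slice_ptr[-1], dtype=np.float32)
        colinds = np.zeros(slice_ptr[-1], dtype=np.int32)
        for r in range(n_rows):
            s, local = divmod(r, slice_height)
            start = A_csr.indptr[r]
            for k in range(row_len[r]):                     # column-major inside the slice
                dst = slice_ptr[s] + k * slice_height + local
                values[dst] = A_csr.data[start + k]
                colinds[dst] = A_csr.indices[start + k]
        return values, colinds, slice_ptr, slice_len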
AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py

@@ -27,7 +27,7 @@ def CP_TV(
     Works on both CPU and GPU.
     Args:
         SMatrix: System matrix (shape: T, Z, X, N)
-        y: Measurement data (shape: T,
+        y: Measurement data (shape: T, N)
         alpha: Regularization parameter for TV
         theta: Relaxation parameter (1.0 for standard Chambolle-Pock)
         numIterations: Number of iterations
AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py

@@ -9,26 +9,24 @@ def _Omega_RELATIVE_DIFFERENCE_CPU(theta_flat, index, values, gamma):
     theta_k = theta_flat[k_idx]
     diff = theta_k - theta_j
     abs_diff = np.abs(diff)
-
     denom = theta_k + theta_j + gamma * abs_diff + 1e-8
     num = diff ** 2
-
+    psi_pair = num / denom
+    psi_pair = values * psi_pair
     # First derivative ∂U/∂θ_j
     dpsi = (2 * diff * denom - num * (1 + gamma * np.sign(diff))) / (denom ** 2)
     grad_pair = values * (-dpsi)  # Note the negative sign: U contains ψ(θ_k, θ_j), seeking ∂/∂θ_j
-
     # Second derivative ∂²U/∂θ_j² (numerically stable, approximate treatment)
     d2psi = (2 * denom ** 2 - 4 * diff * denom * (1 + gamma * np.sign(diff))
              + 2 * num * (1 + gamma * np.sign(diff)) ** 2) / (denom ** 3 + 1e-8)
     hess_pair = values * d2psi
-
     grad_U = np.zeros_like(theta_flat)
     hess_U = np.zeros_like(theta_flat)
-
     np.add.at(grad_U, j_idx, grad_pair)
     np.add.at(hess_U, j_idx, hess_pair)
-
-    [... old line 31 not captured in the extracted diff]
+    # Compute U_value
+    U_value = 0.5 * np.sum(psi_pair)
+    return grad_U, hess_U, U_value
 
 def _Omega_RELATIVE_DIFFERENCE_GPU(theta_flat, index, values, device, gamma):
     j_idx, k_idx = index
@@ -38,26 +36,24 @@ def _Omega_RELATIVE_DIFFERENCE_GPU(theta_flat, index, values, device, gamma):
     abs_diff = torch.abs(diff)
     denom = theta_k + theta_j + gamma * abs_diff + 1e-8
     num = diff ** 2
-
+    psi_pair = num / denom
+    psi_pair = values * psi_pair
     # Compute gradient contributions
     dpsi = (2 * diff * denom - num * (1 + gamma * torch.sign(diff))) / (denom ** 2)
     grad_pair = values * (-dpsi)
-
     # Compute Hessian contributions
     d2psi = (2 * denom ** 2 - 4 * diff * denom * (1 + gamma * torch.sign(diff))
              + 2 * num * (1 + gamma * torch.sign(diff)) ** 2) / (denom ** 3 + 1e-8)
     hess_pair = values * d2psi
-
     # Initialize gradient and Hessian on the correct device
     grad_U = torch.zeros_like(theta_flat, device=device)
     hess_U = torch.zeros_like(theta_flat, device=device)
-
     # Accumulate gradient contributions
     grad_U.index_add_(0, j_idx, grad_pair)
     grad_U.index_add_(0, k_idx, -grad_pair)
-
     # Accumulate Hessian contributions
     hess_U.index_add_(0, j_idx, hess_pair)
     hess_U.index_add_(0, k_idx, hess_pair)
-
-    [... old line 63 not captured in the extracted diff]
+    # Compute U_value
+    U_value = 0.5 * psi_pair.sum()
+    return grad_U, hess_U, U_value