AOT-biomaps 2.9.138__py3-none-any.whl → 2.9.279__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of AOT-biomaps might be problematic. Click here for more details.
- AOT_biomaps/AOT_Acoustic/AcousticTools.py +35 -115
- AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
- AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +22 -18
- AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
- AOT_biomaps/AOT_Experiment/_mainExperiment.py +102 -68
- AOT_biomaps/AOT_Optic/_mainOptic.py +124 -58
- AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +72 -108
- AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +474 -289
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +173 -68
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +360 -154
- AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +150 -111
- AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +281 -0
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +328 -0
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
- AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
- AOT_biomaps/AOT_Recon/AlgebraicRecon.py +359 -238
- AOT_biomaps/AOT_Recon/AnalyticRecon.py +29 -41
- AOT_biomaps/AOT_Recon/BayesianRecon.py +165 -91
- AOT_biomaps/AOT_Recon/DeepLearningRecon.py +4 -1
- AOT_biomaps/AOT_Recon/PrimalDualRecon.py +175 -31
- AOT_biomaps/AOT_Recon/ReconEnums.py +38 -3
- AOT_biomaps/AOT_Recon/ReconTools.py +184 -77
- AOT_biomaps/AOT_Recon/__init__.py +1 -0
- AOT_biomaps/AOT_Recon/_mainRecon.py +144 -74
- AOT_biomaps/__init__.py +4 -36
- {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/METADATA +2 -1
- aot_biomaps-2.9.279.dist-info/RECORD +47 -0
- aot_biomaps-2.9.138.dist-info/RECORD +0 -43
- {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/WHEEL +0 -0
- {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
from AOT_biomaps.AOT_Recon.ReconTools import power_method,
|
|
1
|
+
from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star
|
|
2
2
|
from AOT_biomaps.Config import config
|
|
3
|
-
|
|
3
|
+
from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType
|
|
4
4
|
import torch
|
|
5
5
|
from tqdm import trange
|
|
6
6
|
|
|
@@ -10,173 +10,212 @@ It includes Chambolle-Pock algorithms for Total Variation (TV) and Kullback-Leib
|
|
|
10
10
|
The methods can run on both CPU and GPU, with configurations set in the AOT_biomaps.Config module.
|
|
11
11
|
'''
|
|
12
12
|
|
|
13
|
-
def
|
|
14
|
-
|
|
13
|
+
def CP_TV(
    SMatrix,
    y,
    alpha=1e-1,
    theta=1.0,
    numIterations=5000,
    isSavingEachIteration=True,
    L=None,
    withTumor=True,
    device=None,
    max_saves=5000,
):
    """
    Chambolle-Pock primal-dual reconstruction with Total Variation (TV) regularization.
    Works on both CPU and GPU.

    The system matrix and the measurements are each divided by their largest
    absolute value before iterating; every returned image is multiplied by
    ``norm_y / norm_A`` to undo that scaling.

    Args:
        SMatrix: System matrix (shape: T, Z, X, N)
        y: Measurement data with T * N elements (e.g. shape: T, N)
        alpha: Regularization parameter for TV (passed to ``proj_l2``)
        theta: Relaxation parameter (1.0 for standard Chambolle-Pock)
        numIterations: Number of iterations
        isSavingEachIteration: If True, returns selected intermediate reconstructions
        L: Lipschitz constant (estimated with ``power_method`` if None; falls
            back to 1.0 if the estimation raises)
        withTumor: Boolean for description only
        device: Torch device (auto-selected if None)
        max_saves: Target number of intermediate saves (default: 5000). The
            stride is ``numIterations // max_saves`` and the last iteration is
            always kept, so slightly more than ``max_saves`` frames can be
            returned when ``numIterations`` is not a multiple of ``max_saves``.

    Returns:
        ``(frames, saved_indices)`` when ``isSavingEachIteration`` is True, where
        ``frames`` is a list of (Z, X) numpy arrays and ``saved_indices`` the
        matching iteration numbers; otherwise ``(final_image, None)`` with
        ``final_image`` a (Z, X) numpy array.

    Raises:
        ValueError: if saving is requested with ``max_saves < 1``.
    """
    # Auto-select device if not provided
    if device is None:
        device = torch.device(f"cuda:{config.select_best_gpu()}" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(device)

    # Convert data to tensors and move to device
    A = torch.tensor(SMatrix, dtype=torch.float32, device=device)
    y = torch.tensor(y, dtype=torch.float32, device=device)
    T, Z, X, N = A.shape
    A_flat = A.permute(0, 3, 1, 2).reshape(T * N, Z * X)
    y_flat = y.reshape(-1)

    # Robust normalization (clamped so an all-zero input cannot divide by zero)
    norm_A = A_flat.abs().max().clamp(min=1e-8)
    norm_y = y_flat.abs().max().clamp(min=1e-8)
    A_flat = A_flat / norm_A
    y_flat = y_flat / norm_y
    rescale = norm_y / norm_A

    # Define forward/backward operators
    P = lambda image: torch.matmul(A_flat, image)
    PT = lambda data: torch.matmul(A_flat.T, data)

    # Estimate Lipschitz constant if needed
    if L is None:
        try:
            L = power_method(P, PT, y_flat, Z, X)
            L = max(L, 1e-3)
        except Exception:
            # Deliberate best-effort fallback; unlike a bare ``except`` this
            # lets KeyboardInterrupt / SystemExit propagate.
            L = 1.0

    sigma = 1.0 / L
    tau = 1.0 / L

    # Initialize variables
    x = torch.zeros(Z * X, device=device)
    p = torch.zeros((2, Z, X), device=device)
    q = torch.zeros_like(y_flat)
    x_tilde = x.clone()

    # Calculate save indices (a set gives O(1) membership tests in the loop)
    save_indices = set()
    if isSavingEachIteration and numIterations > 0:
        if max_saves < 1:
            raise ValueError("max_saves must be >= 1 when isSavingEachIteration is True")
        if numIterations <= max_saves:
            save_indices = set(range(numIterations))
        else:
            step = numIterations // max_saves
            save_indices = set(range(0, numIterations, step))
            save_indices.add(numIterations - 1)

    I_reconMatrix = []
    saved_indices = []

    # Description for progress bar
    tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
    if device.type == "cuda":
        # Report the device the tensors actually live on; fall back to the
        # current device only when the torch.device carries no explicit index.
        gpu_index = device.index if device.index is not None else torch.cuda.current_device()
        device_str = f"GPU no.{gpu_index}"
    else:
        device_str = "CPU"
    description = f"AOT-BioMaps -- Primal/Dual Reconstruction (TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"

    # Main loop
    for iteration in trange(numIterations, desc=description):
        # Update p (TV proximal step)
        grad_x = gradient(x_tilde.reshape(Z, X))
        p = proj_l2(p + sigma * grad_x, alpha)

        # Update q (data fidelity)
        q = (q + sigma * (P(x_tilde) - y_flat)) / (1 + sigma)

        # Update x
        x_old = x.clone()
        div_p = div(p).ravel()  # Divergence calculation
        ATq = PT(q)
        x = (x - tau * (ATq - div_p)) / (1 + tau * 1e-6)  # Light L2 regularization

        # Update x_tilde
        x_tilde = x + theta * (x - x_old)

        # Save intermediate result if needed (the product is a fresh tensor)
        if iteration in save_indices:
            I_reconMatrix.append(x.reshape(Z, X) * rescale)
            saved_indices.append(iteration)

    # Return results
    if isSavingEachIteration:
        return [frame.cpu().numpy() for frame in I_reconMatrix], saved_indices
    return (x.reshape(Z, X) * rescale).cpu().numpy(), None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def CP_KL(
    SMatrix,
    y,
    alpha=1e-9,
    theta=1.0,
    numIterations=5000,
    isSavingEachIteration=True,
    L=None,
    withTumor=True,
    device=None,
    max_saves=5000,
):
    """
    Chambolle-Pock primal-dual reconstruction with a Kullback-Leibler (KL) data term.
    Works on both CPU and GPU.

    Args:
        SMatrix: System matrix (shape: T, Z, X, N)
        y: Measurement data with T * N elements (e.g. shape: T, N), matching
            the rows of the flattened system matrix
        alpha: Regularization parameter. In this function it only appears in
            the progress-bar description.
        theta: Relaxation parameter (1.0 for standard Chambolle-Pock)
        numIterations: Number of iterations
        isSavingEachIteration: If True, returns selected intermediate reconstructions
        L: Lipschitz constant (estimated with ``power_method`` if None)
        withTumor: Boolean for description only
        device: Torch device (auto-selected if None)
        max_saves: Target number of intermediate saves (default: 5000). The
            stride is ``numIterations // max_saves`` and the last iteration is
            always kept.

    Returns:
        ``(frames, saved_indices)`` when ``isSavingEachIteration`` is True.
        ``frames`` starts with the all-zero initial image (recorded under
        index 0) followed by the saved iterates, so index 0 appears twice in
        ``saved_indices`` -- NOTE(review): kept for backward compatibility.
        Otherwise ``(final_image, None)`` where ``final_image`` is the (Z, X)
        numpy array of the last iterate.

    Raises:
        ValueError: if saving is requested with ``max_saves < 1``.
    """
    # Auto-select device if not provided
    if device is None:
        device = torch.device(f"cuda:{config.select_best_gpu()}" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(device)

    # Convert data to tensors and move to device
    A = torch.tensor(SMatrix, dtype=torch.float32, device=device)
    y = torch.tensor(y, dtype=torch.float32, device=device)
    T, Z, X, N = A.shape
    A_flat = A.permute(0, 3, 1, 2).reshape(T * N, Z * X)
    y_flat = y.reshape(-1)

    # Define forward/backward operators
    P = lambda image: torch.matmul(A_flat, image.ravel())
    PT = lambda data: torch.matmul(A_flat.T, data)

    # Estimate Lipschitz constant if needed
    if L is None:
        L = power_method(P, PT, y_flat, Z, X)

    sigma = 1.0 / L
    tau = 1.0 / L

    # Initialize variables
    x = torch.zeros(Z * X, device=device)
    q = torch.zeros_like(y_flat)
    x_tilde = x.clone()

    # Calculate save indices (a set gives O(1) membership tests in the loop)
    save_indices = set()
    if isSavingEachIteration and numIterations > 0:
        if max_saves < 1:
            raise ValueError("max_saves must be >= 1 when isSavingEachIteration is True")
        if numIterations <= max_saves:
            save_indices = set(range(numIterations))
        else:
            step = numIterations // max_saves
            save_indices = set(range(0, numIterations, step))
            save_indices.add(numIterations - 1)

    # The initial (all-zero) image is always the first recorded frame.
    I_reconMatrix = [x.reshape(Z, X).cpu().numpy()]
    saved_indices = [0]

    # Description for progress bar
    tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
    if device.type == "cuda":
        # Report the device the tensors actually live on; fall back to the
        # current device only when the torch.device carries no explicit index.
        gpu_index = device.index if device.index is not None else torch.cuda.current_device()
        device_str = f"GPU no.{gpu_index}"
    else:
        device_str = "CPU"
    description = f"AOT-BioMaps -- Primal/Dual Reconstruction (KL) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"

    # Main loop
    for iteration in trange(numIterations, desc=description):
        # Update q (proximal step for F*)
        q = prox_F_star(q + sigma * P(x_tilde) - sigma * y_flat, sigma, y_flat)

        # Update x (proximal step for G)
        # NOTE(review): PT(ones) is recomputed every iteration; it could be
        # hoisted if prox_G is confirmed not to modify its third argument.
        x_old = x.clone()
        x = prox_G(x - tau * PT(q), tau, PT(torch.ones_like(y_flat)))

        # Update x_tilde
        x_tilde = x + theta * (x - x_old)

        # Save intermediate result if needed
        if iteration in save_indices:
            I_reconMatrix.append(x.reshape(Z, X).cpu().numpy())
            saved_indices.append(iteration)

    # Return results
    if isSavingEachIteration:
        return I_reconMatrix, saved_indices
    # Frames are only appended while saving, so I_reconMatrix[-1] would be the
    # initial zero image here; return the actual final iterate instead.
    return x.reshape(Z, X).cpu().numpy(), None
|
|
@@ -9,26 +9,24 @@ def _Omega_RELATIVE_DIFFERENCE_CPU(theta_flat, index, values, gamma):
|
|
|
9
9
|
theta_k = theta_flat[k_idx]
|
|
10
10
|
diff = theta_k - theta_j
|
|
11
11
|
abs_diff = np.abs(diff)
|
|
12
|
-
|
|
13
12
|
denom = theta_k + theta_j + gamma * abs_diff + 1e-8
|
|
14
13
|
num = diff ** 2
|
|
15
|
-
|
|
14
|
+
psi_pair = num / denom
|
|
15
|
+
psi_pair = values * psi_pair
|
|
16
16
|
# First derivative ∂U/∂θ_j
|
|
17
17
|
dpsi = (2 * diff * denom - num * (1 + gamma * np.sign(diff))) / (denom ** 2)
|
|
18
18
|
grad_pair = values * (-dpsi) # Note the negative sign: U contains ψ(θ_k, θ_j), seeking ∂/∂θ_j
|
|
19
|
-
|
|
20
19
|
# Second derivative ∂²U/∂θ_j² (numerically stable, approximate treatment)
|
|
21
20
|
d2psi = (2 * denom ** 2 - 4 * diff * denom * (1 + gamma * np.sign(diff))
|
|
22
21
|
+ 2 * num * (1 + gamma * np.sign(diff)) ** 2) / (denom ** 3 + 1e-8)
|
|
23
22
|
hess_pair = values * d2psi
|
|
24
|
-
|
|
25
23
|
grad_U = np.zeros_like(theta_flat)
|
|
26
24
|
hess_U = np.zeros_like(theta_flat)
|
|
27
|
-
|
|
28
25
|
np.add.at(grad_U, j_idx, grad_pair)
|
|
29
26
|
np.add.at(hess_U, j_idx, hess_pair)
|
|
30
|
-
|
|
31
|
-
|
|
27
|
+
# Compute U_value
|
|
28
|
+
U_value = 0.5 * np.sum(psi_pair)
|
|
29
|
+
return grad_U, hess_U, U_value
|
|
32
30
|
|
|
33
31
|
def _Omega_RELATIVE_DIFFERENCE_GPU(theta_flat, index, values, device, gamma):
|
|
34
32
|
j_idx, k_idx = index
|
|
@@ -38,26 +36,24 @@ def _Omega_RELATIVE_DIFFERENCE_GPU(theta_flat, index, values, device, gamma):
|
|
|
38
36
|
abs_diff = torch.abs(diff)
|
|
39
37
|
denom = theta_k + theta_j + gamma * abs_diff + 1e-8
|
|
40
38
|
num = diff ** 2
|
|
41
|
-
|
|
39
|
+
psi_pair = num / denom
|
|
40
|
+
psi_pair = values * psi_pair
|
|
42
41
|
# Compute gradient contributions
|
|
43
42
|
dpsi = (2 * diff * denom - num * (1 + gamma * torch.sign(diff))) / (denom ** 2)
|
|
44
43
|
grad_pair = values * (-dpsi)
|
|
45
|
-
|
|
46
44
|
# Compute Hessian contributions
|
|
47
45
|
d2psi = (2 * denom ** 2 - 4 * diff * denom * (1 + gamma * torch.sign(diff))
|
|
48
46
|
+ 2 * num * (1 + gamma * torch.sign(diff)) ** 2) / (denom ** 3 + 1e-8)
|
|
49
47
|
hess_pair = values * d2psi
|
|
50
|
-
|
|
51
48
|
# Initialize gradient and Hessian on the correct device
|
|
52
49
|
grad_U = torch.zeros_like(theta_flat, device=device)
|
|
53
50
|
hess_U = torch.zeros_like(theta_flat, device=device)
|
|
54
|
-
|
|
55
51
|
# Accumulate gradient contributions
|
|
56
52
|
grad_U.index_add_(0, j_idx, grad_pair)
|
|
57
53
|
grad_U.index_add_(0, k_idx, -grad_pair)
|
|
58
|
-
|
|
59
54
|
# Accumulate Hessian contributions
|
|
60
55
|
hess_U.index_add_(0, j_idx, hess_pair)
|
|
61
56
|
hess_U.index_add_(0, k_idx, hess_pair)
|
|
62
|
-
|
|
63
|
-
|
|
57
|
+
# Compute U_value
|
|
58
|
+
U_value = 0.5 * psi_pair.sum()
|
|
59
|
+
return grad_U, hess_U, U_value
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
import pycuda.driver as drv
|
|
2
|
+
import numpy as np
|
|
3
|
+
from pycuda.compiler import SourceModule
|
|
4
|
+
from tqdm import trange
|
|
5
|
+
import gc
|
|
6
|
+
import os
|
|
7
|
+
|
|
8
|
+
class SparseSMatrix_CSR:
    """Build a CSR (compressed sparse row) system matrix on the GPU from a `manip` object.

    Usage:
        S = SparseSMatrix_CSR(manip)
        S.allocate()

    After allocate() the instance holds: row_ptr (host np.int64 array),
    row_ptr_gpu (device pointer), h_col_ind, h_values, col_ind_gpu,
    values_gpu, norm_factor_inv and norm_factor_inv_gpu.

    The instance can be used as a context manager; free() is called on exit.
    """

    # Names of the attributes that own device memory, in release order.
    _DEVICE_BUFFERS = ("col_ind_gpu", "values_gpu", "row_ptr_gpu", "norm_factor_inv_gpu")

    def __init__(self, manip, block_rows=64, relative_threshold=0.3, device=0):
        """Record the matrix dimensions from `manip` and create a CUDA context.

        Args:
            manip: object exposing `AcousticFields`, a sequence whose items
                have a `.field` array indexed as (T, Z, X).
            block_rows: number of dense rows staged on the GPU per kernel launch.
            relative_threshold: threshold forwarded to the counting/filling kernels.
            device: CUDA device ordinal.
        """
        # Read everything that can fail on a malformed `manip` BEFORE creating
        # the CUDA context, so a failure cannot leave a context pushed.
        self.manip = manip
        self.N = len(manip.AcousticFields)
        first_shape = manip.AcousticFields[0].field.shape
        self.T = first_shape[0]
        self.Z = first_shape[1]
        self.X = first_shape[2]
        self.block_rows = block_rows
        self.relative_threshold = relative_threshold

        # This file lives in AOT_Recon/AOT_SparseSMatrix/; the .cubin sits one
        # directory up, in AOT_Recon/.
        cubin_parent_dir = os.path.dirname(os.path.dirname(__file__))
        self.module_path = os.path.join(cubin_parent_dir, "AOT_biomaps_kernels.cubin")

        self.h_dense = None
        self.row_ptr = None
        self.row_ptr_gpu = None
        self.h_col_ind = None
        self.h_values = None
        self.col_ind_gpu = None
        self.values_gpu = None
        self.total_nnz = 0
        self.norm_factor_inv = None
        self.norm_factor_inv_gpu = None
        self.sparse_mod = None
        self.ctx = None

        drv.init()
        self.device = drv.Device(device)
        self.ctx = self.device.make_context()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.free()

    def load_precompiled_module(self):
        """
        Load the pre-compiled CUDA module (.cubin) from the path resolved in __init__.
        No JIT compilation is attempted.

        Raises:
            FileNotFoundError: if the .cubin file does not exist.
            RuntimeError: if PyCUDA fails to load the file.
        """
        so_path = self.module_path  # path resolved in __init__

        if not os.path.exists(so_path):
            raise FileNotFoundError(
                f"Le module CUDA {os.path.basename(so_path)} est introuvable au chemin: {so_path}. "
                "Assurez-vous qu'il est compilé et bien placé."
            )

        try:
            self.sparse_mod = drv.module_from_file(so_path)
            print(f"✅ Module CUDA chargé depuis {so_path}")
        except Exception as e:
            raise RuntimeError(f"Le fichier {os.path.basename(so_path)} a été trouvé, mais PyCUDA n'a pas pu le charger. Vérifiez la compatibilité.") from e

    def estimate_nnz_cpu(self):
        """Quick host-side estimate of the number of non-zeros.

        For every (field, time) row, counts the entries whose absolute value
        exceeds ``relative_threshold`` times the row's maximum absolute value.
        The GPU counting pass in allocate() is the authoritative count.
        """
        total = 0
        for n in range(self.N):
            field = self.manip.AcousticFields[n].field
            for t in range(self.T):
                row = field[t].flatten()
                row_max = np.max(np.abs(row))
                thr = row_max * self.relative_threshold
                total += np.count_nonzero(np.abs(row) > thr)
        return int(total)

    def _release_device_buffers(self):
        """Free every owned device buffer and reset its attribute to None."""
        for name in self._DEVICE_BUFFERS:
            buf = getattr(self, name, None)
            if buf is not None:
                # Clear the attribute first so a failing free() is never retried.
                setattr(self, name, None)
                buf.free()

    def _stage_dense_block(self, first_row, num_rows, dense_block_host, dense_block_gpu):
        """Copy up to `block_rows` dense rows starting at `first_row` to the GPU; return the row count."""
        current_rows = min(self.block_rows, num_rows - first_row)
        for r in range(current_rows):
            # Global row index = n * T + t
            n_idx, t_idx = divmod(first_row + r, self.T)
            dense_block_host[r, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
        drv.memcpy_htod(dense_block_gpu, dense_block_host)
        return current_rows

    def allocate(self, kernel_module_path=None):
        """Build the CSR arrays block by block on the GPU.

        Two passes over the dense rows: the first counts non-zeros per row to
        build `row_ptr`, the second fills `col_ind` / `values`. The results are
        copied back to the host and the normalization factor is computed.

        Args:
            kernel_module_path: accepted for backward compatibility and
                ignored; the module is always loaded from `self.module_path`.

        On any error the message is printed, all GPU resources are released
        through free() and the exception is re-raised.
        """
        dense_block_gpu = None
        row_nnz_gpu = None
        try:
            try:
                num_rows = self.N * self.T
                num_cols = self.Z * self.X
                bytes_float = np.dtype(np.float32).itemsize

                # Drop buffers from a previous allocate() call, if any.
                self._release_device_buffers()

                # Always load the kernels from the resolved self.module_path.
                self.load_precompiled_module()

                count_nnz_kernel = self.sparse_mod.get_function('count_nnz_rows_kernel')
                fill_csr_kernel = self.sparse_mod.get_function('fill_kernel__CSR')

                # Host row_ptr
                self.row_ptr = np.zeros(num_rows + 1, dtype=np.int64)

                # Temporary staging buffers
                dense_block_host = np.empty((self.block_rows, num_cols), dtype=np.float32)
                dense_block_gpu = drv.mem_alloc(self.block_rows * num_cols * bytes_float)
                row_nnz_gpu = drv.mem_alloc(self.block_rows * np.dtype(np.int32).itemsize)

                block_size = 128

                # --- Pass 1: count NNZ per row on the GPU (same logic as the fill pass) ---
                for b in trange(0, num_rows, self.block_rows, desc='Comptage NNZ'):
                    current_rows = self._stage_dense_block(b, num_rows, dense_block_host, dense_block_gpu)

                    grid = ((current_rows + block_size - 1) // block_size, 1, 1)
                    # count_nnz_kernel is the 'count_nnz_rows_kernel' symbol fetched above.
                    count_nnz_kernel(dense_block_gpu, row_nnz_gpu,
                                     np.int32(current_rows), np.int32(num_cols),
                                     np.float32(self.relative_threshold),
                                     block=(block_size, 1, 1), grid=grid)

                    row_nnz_host = np.empty(current_rows, dtype=np.int32)
                    drv.memcpy_dtoh(row_nnz_host, row_nnz_gpu)
                    # Running prefix sum: each block continues from row_ptr[b].
                    self.row_ptr[b + 1:b + current_rows + 1] = self.row_ptr[b] + np.cumsum(row_nnz_host, dtype=np.int64)

                # Total nnz
                self.total_nnz = int(self.row_ptr[-1])
                print(f"NNZ total : {self.total_nnz}")

                # Final host arrays
                self.h_col_ind = np.zeros(self.total_nnz, dtype=np.uint32)
                self.h_values = np.zeros(self.total_nnz, dtype=np.float32)

                # Copy row_ptr to the device once
                self.row_ptr_gpu = drv.mem_alloc(self.row_ptr.nbytes)
                drv.memcpy_htod(self.row_ptr_gpu, self.row_ptr)

                # Device arrays for the final CSR
                self.col_ind_gpu = drv.mem_alloc(self.h_col_ind.nbytes)
                self.values_gpu = drv.mem_alloc(self.h_values.nbytes)

                # --- Pass 2: fill the CSR arrays block by block ---
                for b in trange(0, num_rows, self.block_rows, desc='Remplissage CSR'):
                    current_rows = self._stage_dense_block(b, num_rows, dense_block_host, dense_block_gpu)

                    grid = ((current_rows + block_size - 1) // block_size, 1, 1)
                    fill_csr_kernel(dense_block_gpu,
                                    self.row_ptr_gpu,
                                    self.col_ind_gpu,
                                    self.values_gpu,
                                    np.int32(b),
                                    np.int32(current_rows),
                                    np.int32(num_cols),
                                    np.float32(self.relative_threshold),
                                    np.int64(self.total_nnz),
                                    block=(block_size, 1, 1), grid=grid)
                drv.Context.synchronize()

                # Copy back
                drv.memcpy_dtoh(self.h_col_ind, self.col_ind_gpu)
                drv.memcpy_dtoh(self.h_values, self.values_gpu)
                print('CSR généré ✔')

                # Normalization factor from the CSR (per-column accumulation)
                self.compute_norm_factor_from_csr()
            finally:
                # Release the temporaries on success AND on failure, before
                # free() gets a chance to pop the context.
                for tmp in (dense_block_gpu, row_nnz_gpu):
                    if tmp is not None:
                        tmp.free()

        except Exception as e:
            print(f"❌ Erreur détaillée : {e}")
            self.free()
            raise

    def compute_norm_factor_from_csr(self):
        """Compute `norm_factor_inv` (host) and `norm_factor_inv_gpu` (device).

        Launches the 'accumulate_columns_atomic' kernel over the CSR values and
        column indices into a zero-initialised vector of length Z * X, clamps
        the result to at least 1e-6 and stores its reciprocal as float32.
        """
        ZX = self.Z * self.X

        # 1) Column-accumulator vector on the GPU, zero-initialised
        col_sum_gpu = drv.mem_alloc(ZX * np.dtype(np.float32).itemsize)
        try:
            drv.memset_d32(col_sum_gpu, 0, ZX)

            # 2) Fetch the kernel by its generic (non __CSR-suffixed) name.
            # NOTE(review): confirm this matches the symbol exported by the cubin.
            acc_kernel = self.sparse_mod.get_function("accumulate_columns_atomic")

            # 3) Launch: one thread per stored non-zero
            threads = 256
            blocks = (self.total_nnz + threads - 1) // threads

            acc_kernel(
                self.values_gpu,
                self.col_ind_gpu,
                np.int64(self.total_nnz),
                col_sum_gpu,
                block=(threads, 1, 1),
                grid=(blocks, 1, 1)
            )
            drv.Context.synchronize()

            # 4) Fetch the result
            norm = np.empty(ZX, dtype=np.float32)
            drv.memcpy_dtoh(norm, col_sum_gpu)
        finally:
            col_sum_gpu.free()

        # Clamp in float64 so the reciprocal cannot blow up on empty columns.
        norm = np.maximum(norm.astype(np.float64), 1e-6)
        self.norm_factor_inv = (1.0 / norm).astype(np.float32)

        # Release a buffer left by a previous call before replacing it.
        previous = getattr(self, 'norm_factor_inv_gpu', None)
        if previous is not None:
            self.norm_factor_inv_gpu = None
            previous.free()
        self.norm_factor_inv_gpu = drv.mem_alloc(self.norm_factor_inv.nbytes)
        drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)

    def getMatrixSize(self):
        """
        Return the total size of the CSR matrix in GiB (sum of the GPU buffers).

        Returns an ``{"error": ...}`` dict when the matrix is not allocated yet.
        Each device buffer is allocated from the `nbytes` of its host mirror,
        so the sizes are read from the host arrays rather than from the
        device-allocation objects.
        """
        if self.row_ptr is None:
            return {"error": "La matrice sparse n'est pas encore allouée."}

        # (device attribute, host mirror attribute)
        mirrors = (
            ("row_ptr_gpu", "row_ptr"),
            ("col_ind_gpu", "h_col_ind"),
            ("values_gpu", "h_values"),
            ("norm_factor_inv_gpu", "norm_factor_inv"),
        )
        total_bytes = 0
        for device_name, host_name in mirrors:
            host_array = getattr(self, host_name, None)
            if getattr(self, device_name, None) is not None and host_array is not None:
                total_bytes += host_array.nbytes

        return total_bytes / (1024**3)

    def free(self):
        """Release all GPU buffers and pop the CUDA context. Safe to call more than once."""
        try:
            self._release_device_buffers()
            ctx = getattr(self, 'ctx', None)
            if ctx is not None:
                try:
                    ctx.pop()
                except Exception:
                    # Best effort: the context may already be inactive.
                    pass
                self.ctx = None
            print('✅ Mémoire GPU libérée.')
        except Exception as e:
            print(f"❌ Erreur lors de la libération de la mémoire GPU : {e}")

    def compute_density(self):
        """
        Return the actual CSR density = NNZ / (num_rows * num_cols).
        Requires self.h_values and self.row_ptr (host) to exist.

        Returns 0.0 for a matrix with no rows or no columns.

        Raises:
            RuntimeError: if row_ptr or h_values is missing.
        """
        if self.row_ptr is None or self.h_values is None:
            raise RuntimeError("row_ptr et h_values requis pour calculer la densité")
        num_rows = int(self.N * self.T)
        num_cols = int(self.Z * self.X)
        total_nnz = int(self.row_ptr[-1])
        cells = num_rows * num_cols
        if cells == 0:
            return 0.0
        density = total_nnz / cells
        return density
|