AOT-biomaps 2.9.138-py3-none-any.whl → 2.9.279-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release.


This version of AOT-biomaps might be problematic.

Files changed (31)
  1. AOT_biomaps/AOT_Acoustic/AcousticTools.py +35 -115
  2. AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
  3. AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +22 -18
  4. AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
  5. AOT_biomaps/AOT_Experiment/_mainExperiment.py +102 -68
  6. AOT_biomaps/AOT_Optic/_mainOptic.py +124 -58
  7. AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +72 -108
  8. AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +474 -289
  9. AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +173 -68
  10. AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +360 -154
  11. AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +150 -111
  12. AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
  13. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +281 -0
  14. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +328 -0
  15. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
  16. AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
  17. AOT_biomaps/AOT_Recon/AlgebraicRecon.py +359 -238
  18. AOT_biomaps/AOT_Recon/AnalyticRecon.py +29 -41
  19. AOT_biomaps/AOT_Recon/BayesianRecon.py +165 -91
  20. AOT_biomaps/AOT_Recon/DeepLearningRecon.py +4 -1
  21. AOT_biomaps/AOT_Recon/PrimalDualRecon.py +175 -31
  22. AOT_biomaps/AOT_Recon/ReconEnums.py +38 -3
  23. AOT_biomaps/AOT_Recon/ReconTools.py +184 -77
  24. AOT_biomaps/AOT_Recon/__init__.py +1 -0
  25. AOT_biomaps/AOT_Recon/_mainRecon.py +144 -74
  26. AOT_biomaps/__init__.py +4 -36
  27. {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/METADATA +2 -1
  28. aot_biomaps-2.9.279.dist-info/RECORD +47 -0
  29. aot_biomaps-2.9.138.dist-info/RECORD +0 -43
  30. {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/WHEEL +0 -0
  31. {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/top_level.txt +0 -0
AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py
@@ -1,6 +1,6 @@
- from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient_cpu, gradient_gpu, div_cpu, div_gpu, proj_l2, prox_G, prox_F_star
+ from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star
  from AOT_biomaps.Config import config
-
+ from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType
  import torch
  from tqdm import trange

@@ -10,173 +10,212 @@ It includes Chambolle-Pock algorithms for Total Variation (TV) and Kullback-Leibler
  The methods can run on both CPU and GPU, with configurations set in the AOT_biomaps.Config module.
  '''

- def chambolle_pock_TV_cpu(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, withTumor):
-     device = torch.device("cpu")
+ def CP_TV(
+     SMatrix,
+     y,
+     alpha=1e-1,
+     theta=1.0,
+     numIterations=5000,
+     isSavingEachIteration=True,
+     L=None,
+     withTumor=True,
+     device=None,
+     max_saves=5000,
+ ):
+     """
+     Chambolle-Pock algorithm for Total Variation (TV) regularization.
+     Works on both CPU and GPU.
+     Args:
+         SMatrix: System matrix (shape: T, Z, X, N)
+         y: Measurement data (shape: T, N)
+         alpha: Regularization parameter for TV
+         theta: Relaxation parameter (1.0 for standard Chambolle-Pock)
+         numIterations: Number of iterations
+         isSavingEachIteration: If True, returns selected intermediate reconstructions
+         L: Lipschitz constant (estimated if None)
+         withTumor: Boolean for description only
+         device: Torch device (auto-selected if None)
+         max_saves: Maximum number of intermediate saves (default: 5000)
+     """
+     # Auto-select device if not provided
+     if device is None:
+         device = torch.device(f"cuda:{config.select_best_gpu()}" if torch.cuda.is_available() else "cpu")
+     else:
+         device = torch.device(device)
+
+     # Convert data to tensors and move to device
      A = torch.tensor(SMatrix, dtype=torch.float32, device=device)
      y = torch.tensor(y, dtype=torch.float32, device=device)
-
      T, Z, X, N = SMatrix.shape
      A_flat = A.permute(0, 3, 1, 2).reshape(T * N, Z * X)
      y_flat = y.reshape(-1)

-     P = lambda x: torch.matmul(A_flat, x.ravel())
+     # Robust normalization
+     norm_A = A_flat.abs().max().clamp(min=1e-8)
+     norm_y = y_flat.abs().max().clamp(min=1e-8)
+     A_flat = A_flat / norm_A
+     y_flat = y_flat / norm_y
+
+     # Define forward/backward operators
+     P = lambda x: torch.matmul(A_flat, x)
      PT = lambda y: torch.matmul(A_flat.T, y)

+     # Estimate Lipschitz constant if needed
      if L is None:
-         L = power_method(P, PT, y_flat, Z, X, isGPU=True)
+         try:
+             L = power_method(P, PT, y_flat, Z, X)
+             L = max(L, 1e-3)
+         except:
+             L = 1.0

      sigma = 1.0 / L
      tau = 1.0 / L

+     # Initialize variables
      x = torch.zeros(Z * X, device=device)
      p = torch.zeros((2, Z, X), device=device)
      q = torch.zeros_like(y_flat)
      x_tilde = x.clone()

-     I_reconMatrix = [x.reshape(Z, X).cpu().numpy()]
-
-     if withTumor:
-         description = f"AOT-BioMaps -- Primal/Dual Recontruction Tomography : Chambolle-Pock (TV : Gaussian Noise) α:{alpha:.4f} L: {L:.4f} ---- WITH TUMOR ---- processing on single CPU ----"
+     # Calculate save indices
+     if numIterations <= max_saves:
+         save_indices = list(range(numIterations))
      else:
-         description = f"AOT-BioMaps -- Primal/Dual Recontruction Tomography : Chambolle-Pock (TV : Gaussian Noise) α:{alpha:.4f} L: {L:.4f} ---- WITHOUT TUMOR ---- processing on single CPU ----"
+         step = numIterations // max_saves
+         save_indices = list(range(0, numIterations, step))
+         if save_indices[-1] != numIterations - 1:
+             save_indices.append(numIterations - 1)

-     for iteration in trange(numIterations, desc=description):
-         p = proj_l2(p + sigma * gradient_cpu(x_tilde.reshape(Z, X)), alpha)
-         q = (q + sigma * P(x_tilde) - sigma * y_flat) / (1.0 + sigma)
+     I_reconMatrix = []
+     saved_indices = []

-         x_old = x
-         x = x + tau * div_cpu(p).ravel() - tau * PT(q)
-         x_tilde = x + theta * (x - x_old)
+     # Description for progress bar
+     tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
+     device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
+     description = f"AOT-BioMaps -- Primal/Dual Reconstruction (TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"

-         if iteration % 1 == 0:
-             I_reconMatrix.append(x.reshape(Z, X).cpu().numpy())
+     # Main loop
+     for iteration in trange(numIterations, desc=description):
+         # Update p (TV proximal step)
+         grad_x = gradient(x_tilde.reshape(Z, X))
+         p = proj_l2(p + sigma * grad_x, alpha)

-     return I_reconMatrix if isSavingEachIteration else I_reconMatrix[-1]
+         # Update q (data fidelity)
+         q = (q + sigma * (P(x_tilde) - y_flat)) / (1 + sigma)

- def chambolle_pock_TV_gpu(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, withTumor):
+         # Update x
+         x_old = x.clone()
+         div_p = div(p).ravel()  # Divergence calculation
+         ATq = PT(q)
+         x = (x - tau * (ATq - div_p)) / (1 + tau * 1e-6)  # Light L2 regularization

-     device = torch.device(f"cuda:{config.select_best_gpu()}")
-     A = torch.tensor(SMatrix, dtype=torch.float32, device=device)
-     y = torch.tensor(y, dtype=torch.float32, device=device)
-     T, Z, X, N = SMatrix.shape
-     A_flat = A.permute(0, 3, 1, 2).reshape(T * N, Z * X)
-     y_flat = y.reshape(-1)
-     P = lambda x: torch.matmul(A_flat, x.ravel())
-     PT = lambda y: torch.matmul(A_flat.T, y)
+         # Update x_tilde
+         x_tilde = x + theta * (x - x_old)

-     if L is None:
-         L = power_method(P, PT, y_flat, Z, X, isGPU=True)
+         # Save intermediate result if needed
+         if isSavingEachIteration and iteration in save_indices:
+             I_reconMatrix.append(x.reshape(Z, X).clone() * (norm_y / norm_A))
+             saved_indices.append(iteration)

-     sigma = 1.0 / L
-     tau = 1.0 / L
-     x = torch.zeros(Z * X, device=device)
-     p = torch.zeros((2, Z, X), device=device)
-     q = torch.zeros_like(y_flat)
-     x_tilde = x.clone()
-     I_reconMatrix = [x.reshape(Z, X).cpu().numpy()]
-
-     if withTumor:
-         description = f"AOT-BioMaps -- Primal/Dual Recontruction Tomography : Chambolle-Pock (TV : Gaussian Noise) α:{alpha:.4f} L: {L:.4f} ---- WITH TUMOR ---- processing on GPU no.{torch.cuda.current_device()} ----"
+     # Return results
+     if isSavingEachIteration:
+         return [tensor.cpu().numpy() for tensor in I_reconMatrix], saved_indices
      else:
-         description = f"AOT-BioMaps -- Primal/Dual Recontruction Tomography : Chambolle-Pock (TV: Gaussian Noise) α:{alpha:.4f} L: {L:.4f} ---- WITHOUT TUMOR ---- processing on GPU no.{torch.cuda.current_device()} ----"
-
-     for iteration in trange(numIterations, desc=description):
-         p = proj_l2(p + sigma * gradient_gpu(x_tilde.reshape(Z, X)), alpha)
-         q = (q + sigma * P(x_tilde) - sigma * y_flat) / (1.0 + sigma)
-         x_old = x
-         x = x + tau * div_gpu(p).ravel() - tau * PT(q)
-         x_tilde = x + theta * (x - x_old)
-         if iteration % 1 == 0:
-             I_reconMatrix.append(x.reshape(Z, X).cpu().numpy())
-     return I_reconMatrix if isSavingEachIteration else I_reconMatrix[-1]
+         return (x.reshape(Z, X) * (norm_y / norm_A)).cpu().numpy(), None
+
+
+ def CP_KL(
+     SMatrix,
+     y,
+     alpha=1e-9,
+     theta=1.0,
+     numIterations=5000,
+     isSavingEachIteration=True,
+     L=None,
+     withTumor=True,
+     device=None,
+     max_saves=5000,
+ ):
+     """
+     Chambolle-Pock algorithm for Kullback-Leibler (KL) divergence regularization.
+     Works on both CPU and GPU.
+     Args:
+         SMatrix: System matrix (shape: T, Z, X, N)
+         y: Measurement data (shape: T, X, N)
+         alpha: Regularization parameter
+         theta: Relaxation parameter (1.0 for standard Chambolle-Pock)
+         numIterations: Number of iterations
+         isSavingEachIteration: If True, returns selected intermediate reconstructions
+         L: Lipschitz constant (estimated if None)
+         withTumor: Boolean for description only
+         device: Torch device (auto-selected if None)
+         max_saves: Maximum number of intermediate saves (default: 5000)
+     """
+     # Auto-select device if not provided
+     if device is None:
+         device = torch.device(f"cuda:{config.select_best_gpu()}" if torch.cuda.is_available() else "cpu")
+     else:
+         device = torch.device(device)

- def chambolle_pock_KL_cpu(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, withTumor):
-     device = torch.device("cpu")
+     # Convert data to tensors and move to device
      A = torch.tensor(SMatrix, dtype=torch.float32, device=device)
      y = torch.tensor(y, dtype=torch.float32, device=device)
-
      T, Z, X, N = SMatrix.shape
      A_flat = A.permute(0, 3, 1, 2).reshape(T * N, Z * X)
      y_flat = y.reshape(-1)

+     # Define forward/backward operators
      P = lambda x: torch.matmul(A_flat, x.ravel())
      PT = lambda y: torch.matmul(A_flat.T, y)

+     # Estimate Lipschitz constant if needed
      if L is None:
-         L = power_method(P, PT, y_flat, Z, X, isGPU=False)
+         L = power_method(P, PT, y_flat, Z, X)

      sigma = 1.0 / L
      tau = 1.0 / L

+     # Initialize variables
      x = torch.zeros(Z * X, device=device)
      q = torch.zeros_like(y_flat)
      x_tilde = x.clone()

-     I_reconMatrix = [x.reshape(Z, X).cpu().numpy()]
-
-     if withTumor:
-         description = f"AOT-BioMaps -- Primal/Dual Reconstruction Tomography: Chambolle-Pock (KL) α:{alpha:.4f} L: {L:.4f} ---- WITH TUMOR ---- processing on single CPU ----"
+     # Calculate save indices
+     if numIterations <= max_saves:
+         save_indices = list(range(numIterations))
      else:
-         description = f"AOT-BioMaps -- Primal/Dual Reconstruction Tomography: Chambolle-Pock (KL) α:{alpha:.4f} L: {L:.4f} ---- WITHOUT TUMOR ---- processing on single CPU ----"
-
-     for iteration in trange(numIterations, desc=description):
-         # Update q with the proximal operator for F*
-         q = prox_F_star(q + sigma * P(x_tilde) - sigma * y_flat, sigma, y_flat)
-
-         # Update x with the proximal operator for G
-         x_old = x
-         x = prox_G(x - tau * PT(q), tau, PT(torch.ones_like(y_flat)))
-
-         x_tilde = x + theta * (x - x_old)
-
-         if iteration % 1 == 0:
-             I_reconMatrix.append(x.reshape(Z, X).cpu().numpy())
-
-     return I_reconMatrix if isSavingEachIteration else I_reconMatrix[-1]
-
- def chambolle_pock_KL_gpu(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, withTumor):
-     # Select the GPU
-     device = torch.device(f"cuda:{config.select_best_gpu()}")
-
-     # Convert the data to tensors and move them to the GPU
-     A = torch.tensor(SMatrix, dtype=torch.float32, device=device)
-     y = torch.tensor(y, dtype=torch.float32, device=device)
-
-     T, Z, X, N = SMatrix.shape
-     A_flat = A.permute(0, 3, 1, 2).reshape(T * N, Z * X)
-     y_flat = y.reshape(-1)
-
-     P = lambda x: torch.matmul(A_flat, x.ravel())
-     PT = lambda y: torch.matmul(A_flat.T, y)
-
-     if L is None:
-         L = power_method(P, PT, y_flat, Z, X, isGPU=True)
-
-     sigma = 1.0 / L
-     tau = 1.0 / L
-
-     x = torch.zeros(Z * X, device=device)
-     q = torch.zeros_like(y_flat)
-     x_tilde = x.clone()
+         step = numIterations // max_saves
+         save_indices = list(range(0, numIterations, step))
+         if save_indices[-1] != numIterations - 1:
+             save_indices.append(numIterations - 1)

      I_reconMatrix = [x.reshape(Z, X).cpu().numpy()]
+     saved_indices = [0]

-     if withTumor:
-         description = f"AOT-BioMaps -- Primal/Dual Reconstruction Tomography: Chambolle-Pock (KL) α:{alpha:.4f} L: {L:.4f} ---- WITH TUMOR ---- processing on GPU no.{torch.cuda.current_device()} ----"
-     else:
-         description = f"AOT-BioMaps -- Primal/Dual Reconstruction Tomography: Chambolle-Pock (KL) α:{alpha:.4f} L: {L:.4f} ---- WITHOUT TUMOR ---- processing on GPU no.{torch.cuda.current_device()} ----"
+     # Description for progress bar
+     tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
+     device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
+     description = f"AOT-BioMaps -- Primal/Dual Reconstruction (KL) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"

+     # Main loop
      for iteration in trange(numIterations, desc=description):
-         # Update q with the proximal operator for F*
+         # Update q (proximal step for F*)
          q = prox_F_star(q + sigma * P(x_tilde) - sigma * y_flat, sigma, y_flat)

-         # Update x with the proximal operator for G
-         x_old = x
+         # Update x (proximal step for G)
+         x_old = x.clone()
          x = prox_G(x - tau * PT(q), tau, PT(torch.ones_like(y_flat)))

+         # Update x_tilde
          x_tilde = x + theta * (x - x_old)

-         if iteration % 1 == 0:
+         # Save intermediate result if needed
+         if isSavingEachIteration and iteration in save_indices:
              I_reconMatrix.append(x.reshape(Z, X).cpu().numpy())
+             saved_indices.append(iteration)

-     return I_reconMatrix if isSavingEachIteration else I_reconMatrix[-1]
+     # Return results
+     if isSavingEachIteration:
+         return I_reconMatrix, saved_indices
+     else:
+         return I_reconMatrix[-1], None
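The unified CP_TV and CP_KL entry points above replace the four device-specific chambolle_pock_{TV,KL}_{cpu,gpu} functions and change the return convention: callers now receive a (reconstructions, saved_indices) tuple rather than a bare list of frames. A minimal calling sketch on toy data, assuming only that the functions are importable from AOT_biomaps.AOT_Recon.AOT_Optimizers.PDHG; the shapes and random inputs below are illustrative, not taken from the package:

    import numpy as np
    from AOT_biomaps.AOT_Recon.AOT_Optimizers.PDHG import CP_TV

    # Hypothetical toy sizes: T time samples, Z*X pixels, N scan positions
    T, Z, X, N = 8, 4, 4, 3
    SMatrix = np.random.rand(T, Z, X, N).astype(np.float32)  # system matrix
    y = np.random.rand(T, N).astype(np.float32)              # measurements

    # device=None auto-selects a GPU when available, otherwise the CPU
    frames, saved_iters = CP_TV(SMatrix, y, alpha=1e-1, numIterations=200, max_saves=10)
    recon = frames[-1]  # final (Z, X) image, rescaled by norm_y / norm_A

With isSavingEachIteration=False the first element of the tuple is a single (Z, X) array and the second is None, so the tuple shape is stable either way.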
AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py
@@ -9,26 +9,24 @@ def _Omega_RELATIVE_DIFFERENCE_CPU(theta_flat, index, values, gamma):
      theta_k = theta_flat[k_idx]
      diff = theta_k - theta_j
      abs_diff = np.abs(diff)
-
      denom = theta_k + theta_j + gamma * abs_diff + 1e-8
      num = diff ** 2
-
+     psi_pair = num / denom
+     psi_pair = values * psi_pair
      # First derivative ∂U/∂θ_j
      dpsi = (2 * diff * denom - num * (1 + gamma * np.sign(diff))) / (denom ** 2)
      grad_pair = values * (-dpsi)  # Note the negative sign: U contains ψ(θ_k, θ_j), seeking ∂/∂θ_j
-
      # Second derivative ∂²U/∂θ_j² (numerically stable, approximate treatment)
      d2psi = (2 * denom ** 2 - 4 * diff * denom * (1 + gamma * np.sign(diff))
               + 2 * num * (1 + gamma * np.sign(diff)) ** 2) / (denom ** 3 + 1e-8)
      hess_pair = values * d2psi
-
      grad_U = np.zeros_like(theta_flat)
      hess_U = np.zeros_like(theta_flat)
-
      np.add.at(grad_U, j_idx, grad_pair)
      np.add.at(hess_U, j_idx, hess_pair)
-
-     return grad_U, hess_U
+     # Compute U_value
+     U_value = 0.5 * np.sum(psi_pair)
+     return grad_U, hess_U, U_value

  def _Omega_RELATIVE_DIFFERENCE_GPU(theta_flat, index, values, device, gamma):
      j_idx, k_idx = index
@@ -38,26 +36,24 @@ def _Omega_RELATIVE_DIFFERENCE_GPU(theta_flat, index, values, device, gamma):
      abs_diff = torch.abs(diff)
      denom = theta_k + theta_j + gamma * abs_diff + 1e-8
      num = diff ** 2
-
+     psi_pair = num / denom
+     psi_pair = values * psi_pair
      # Compute gradient contributions
      dpsi = (2 * diff * denom - num * (1 + gamma * torch.sign(diff))) / (denom ** 2)
      grad_pair = values * (-dpsi)
-
      # Compute Hessian contributions
      d2psi = (2 * denom ** 2 - 4 * diff * denom * (1 + gamma * torch.sign(diff))
               + 2 * num * (1 + gamma * torch.sign(diff)) ** 2) / (denom ** 3 + 1e-8)
      hess_pair = values * d2psi
-
      # Initialize gradient and Hessian on the correct device
      grad_U = torch.zeros_like(theta_flat, device=device)
      hess_U = torch.zeros_like(theta_flat, device=device)
-
      # Accumulate gradient contributions
      grad_U.index_add_(0, j_idx, grad_pair)
      grad_U.index_add_(0, k_idx, -grad_pair)
-
      # Accumulate Hessian contributions
      hess_U.index_add_(0, j_idx, hess_pair)
      hess_U.index_add_(0, k_idx, hess_pair)
-
-     return grad_U, hess_U
+     # Compute U_value
+     U_value = 0.5 * psi_pair.sum()
+     return grad_U, hess_U, U_value
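Both variants now return a third value alongside the gradient and Hessian: the value of the prior itself, U = 0.5 · Σ values · ψ, where ψ(θ_k, θ_j) = (θ_k − θ_j)² / (θ_k + θ_j + γ|θ_k − θ_j| + 1e-8) is the relative-difference potential, presumably so the optimizers can track the penalized objective. A self-contained numeric check of one neighbor pair, mirroring the psi_pair computation above with toy values:

    # Toy values, not taken from the package
    theta_j, theta_k, w, gamma = 1.0, 3.0, 1.0, 2.0
    diff = theta_k - theta_j                                           # 2.0
    psi = diff ** 2 / (theta_k + theta_j + gamma * abs(diff) + 1e-8)   # 4 / 8 = 0.5 (up to the 1e-8 guard)
    U_value = 0.5 * w * psi                                            # 0.25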
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py (new file)
@@ -0,0 +1,281 @@
+ import pycuda.driver as drv
+ import numpy as np
+ from pycuda.compiler import SourceModule
+ from tqdm import trange
+ import gc
+ import os
+
+ class SparseSMatrix_CSR:
+     """Builds a CSR matrix from a `manip` object.
+     Usage:
+         S = SparseMatrixGPU(manip)
+         S.allocate()
+     After allocate(), the following are available: row_ptr (host np.int64 array), row_ptr_gpu (device ptr),
+     h_col_ind, h_values, col_ind_gpu, values_gpu, norm_factor_inv.
+     """
+
+     def __init__(self, manip, block_rows=64, relative_threshold=0.3, device=0):
+         drv.init()
+         self.device = drv.Device(device)
+         self.ctx = self.device.make_context()
+         self.manip = manip
+         self.N = len(manip.AcousticFields)
+         self.T = manip.AcousticFields[0].field.shape[0]
+         self.Z = manip.AcousticFields[0].field.shape[1]
+         self.X = manip.AcousticFields[0].field.shape[2]
+         self.block_rows = block_rows
+         self.relative_threshold = relative_threshold
+
+         # --- FIX: resolve the .cubin path (in AOT_Recon/) ---
+         # SparseSMatrix_CSR.py lives in AOT_Recon/AOT_SparseSMatrix/
+         # Go up one directory to reach AOT_Recon/
+         cubin_parent_dir = os.path.dirname(os.path.dirname(__file__))
+         self.module_path = os.path.join(cubin_parent_dir, "AOT_biomaps_kernels.cubin")
+         # --- END FIX ---
+
+         self.h_dense = None
+         self.row_ptr = None
+         self.row_ptr_gpu = None
+         self.h_col_ind = None
+         self.h_values = None
+         self.total_nnz = 0
+         self.norm_factor_inv = None
+         self.sparse_mod = None
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc, tb):
+         self.free()
+
+     def load_precompiled_module(self):
+         """
+         Loads the precompiled CUDA module (.cubin) from the resolved path.
+         Removes the JIT compilation logic.
+         """
+         so_path = self.module_path  # Uses the path resolved in __init__
+
+         if not os.path.exists(so_path):
+             raise FileNotFoundError(
+                 f"CUDA module {os.path.basename(so_path)} not found at path: {so_path}. "
+                 "Make sure it is compiled and in the right place."
+             )
+
+         try:
+             self.sparse_mod = drv.module_from_file(so_path)
+             print(f"✅ CUDA module loaded from {so_path}")
+         except Exception as e:
+             raise RuntimeError(f"File {os.path.basename(so_path)} was found, but PyCUDA could not load it. Check compatibility.") from e
+
+     def estimate_nnz_cpu(self):
+         """Fast (non-exact) estimate, useful if you only need a cheap rough figure.
+         Recommended: use the exact GPU pass (count_nnz_per_row_kernel) instead.
+         """
+         total = 0
+         for n in range(self.N):
+             field = self.manip.AcousticFields[n].field
+             for t in range(self.T):
+                 row = field[t].flatten()
+                 row_max = np.max(np.abs(row))
+                 thr = row_max * self.relative_threshold
+                 total += np.count_nonzero(np.abs(row) > thr)
+         return int(total)
+
+     def allocate(self, kernel_module_path=None):
+         try:
+             # --- 1. Block-by-block construction (without keeping the whole dense matrix if possible) ---
+             num_rows = self.N * self.T
+             num_cols = self.Z * self.X
+             bytes_float = np.dtype(np.float32).itemsize
+
+             # Load module
+             # FIX: always load from self.module_path (resolved)
+             self.load_precompiled_module()
+
+             count_nnz_kernel = self.sparse_mod.get_function('count_nnz_rows_kernel')
+             fill_csr_kernel = self.sparse_mod.get_function('fill_kernel__CSR')
+
+             # allocate host row_ptr
+             self.row_ptr = np.zeros(num_rows + 1, dtype=np.int64)
+
+             # GPU temp buffers
+             dense_block_host = np.empty((self.block_rows, num_cols), dtype=np.float32)
+             dense_block_gpu = drv.mem_alloc(self.block_rows * num_cols * bytes_float)
+             row_nnz_gpu = drv.mem_alloc(self.block_rows * np.dtype(np.int32).itemsize)
+
+             block_size = 128
+
+             # --- Count NNZ per row using GPU kernel to be consistent with filling logic ---
+             for b in trange(0, num_rows, self.block_rows, desc='Counting NNZ'):
+                 current_rows = min(self.block_rows, num_rows - b)
+                 # Fill dense_block_host from manip
+                 for r in range(current_rows):
+                     global_row = b + r
+                     n_idx = global_row // self.T
+                     t_idx = global_row % self.T
+                     dense_block_host[r, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+                 drv.memcpy_htod(dense_block_gpu, dense_block_host)
+
+                 grid = ((current_rows + block_size - 1) // block_size, 1, 1)
+                 # Note: Assuming 'count_nnz_per_row_kernel' is the correct name (verified by user in prior steps)
+                 count_nnz_kernel(dense_block_gpu, row_nnz_gpu,
+                                  np.int32(current_rows), np.int32(num_cols),
+                                  np.float32(self.relative_threshold),
+                                  block=(block_size, 1, 1), grid=grid)
+
+                 row_nnz_host = np.empty(current_rows, dtype=np.int32)
+                 drv.memcpy_dtoh(row_nnz_host, row_nnz_gpu)
+                 self.row_ptr[b + 1:b + current_rows + 1] = self.row_ptr[b] + np.cumsum(row_nnz_host, dtype=np.int64)
+
+             # total nnz
+             self.total_nnz = int(self.row_ptr[-1])
+             print(f"Total NNZ: {self.total_nnz}")
+
+             # allocate final arrays
+             self.h_col_ind = np.zeros(self.total_nnz, dtype=np.uint32)
+             self.h_values = np.zeros(self.total_nnz, dtype=np.float32)
+
+             # copy row_ptr to device once
+             self.row_ptr_gpu = drv.mem_alloc(self.row_ptr.nbytes)
+             drv.memcpy_htod(self.row_ptr_gpu, self.row_ptr)
+
+             # allocate device arrays for final csr
+             self.col_ind_gpu = drv.mem_alloc(self.h_col_ind.nbytes)
+             self.values_gpu = drv.mem_alloc(self.h_values.nbytes)
+
+             # --- Fill CSR per-block ---
+             for b in trange(0, num_rows, self.block_rows, desc='Filling CSR'):
+                 current_rows = min(self.block_rows, num_rows - b)
+                 for r in range(current_rows):
+                     global_row = b + r
+                     n_idx = global_row // self.T
+                     t_idx = global_row % self.T
+                     dense_block_host[r, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+                 drv.memcpy_htod(dense_block_gpu, dense_block_host)
+
+                 grid = ((current_rows + block_size - 1) // block_size, 1, 1)
+                 fill_csr_kernel(dense_block_gpu,
+                                 self.row_ptr_gpu,
+                                 self.col_ind_gpu,
+                                 self.values_gpu,
+                                 np.int32(b),
+                                 np.int32(current_rows),
+                                 np.int32(num_cols),
+                                 np.float32(self.relative_threshold),
+                                 np.int64(self.total_nnz),
+                                 block=(block_size, 1, 1), grid=grid)
+                 drv.Context.synchronize()
+
+             # copy back
+             drv.memcpy_dtoh(self.h_col_ind, self.col_ind_gpu)
+             drv.memcpy_dtoh(self.h_values, self.values_gpu)
+             print('CSR generated ✔')
+
+             # compute normalization factor from CSR (sum per column)
+             self.compute_norm_factor_from_csr()
+
+             # free temporaries
+             dense_block_gpu.free(); row_nnz_gpu.free()
+
+         except Exception as e:
+             print(f"❌ Detailed error: {e}")
+             self.free()
+             raise
+
+     def compute_norm_factor_from_csr(self):
+         ZX = self.Z * self.X
+
+         # 1) Allocate a column-sum vector on the GPU
+         col_sum_gpu = drv.mem_alloc(ZX * np.dtype(np.float32).itemsize)
+         drv.memset_d32(col_sum_gpu, 0, ZX)
+
+         # 2) Fetch the kernel
+         # FIX: use the generic name 'accumulate_columns_atomic' as in SELL (if the binary is shared).
+         # If the developer uses the __CSR convention, keep it.
+         # Based on our SELL history, the name is probably the generic 'accumulate_columns_atomic'.
+         # We assume the generic name to avoid a LogicError here as well.
+         acc_kernel = self.sparse_mod.get_function("accumulate_columns_atomic")
+
+         # 3) Launch the kernel
+         threads = 256
+         blocks = (self.total_nnz + threads - 1) // threads
+
+         acc_kernel(
+             self.values_gpu,
+             self.col_ind_gpu,
+             np.int64(self.total_nnz),
+             col_sum_gpu,
+             block=(threads, 1, 1),
+             grid=(blocks, 1, 1)
+         )
+         drv.Context.synchronize()
+
+         # 4) Fetch the result
+         norm = np.empty(ZX, dtype=np.float32)
+         drv.memcpy_dtoh(norm, col_sum_gpu)
+         col_sum_gpu.free()
+
+         norm = np.maximum(norm.astype(np.float64), 1e-6)
+         self.norm_factor_inv = (1.0 / norm).astype(np.float32)
+
+         self.norm_factor_inv_gpu = drv.mem_alloc(self.norm_factor_inv.nbytes)
+         drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)
+
+     def getMatrixSize(self):
+         """
+         Returns the total size of the CSR matrix in GB (summing GPU memory).
+         """
+         if self.row_ptr is None:
+             return {"error": "The sparse matrix is not allocated yet."}
+
+         total_bytes = 0
+
+         # GPU memory (row_ptr_gpu, col_ind_gpu, values_gpu, norm_factor_inv_gpu)
+         if hasattr(self, 'row_ptr_gpu') and self.row_ptr_gpu:
+             total_bytes += self.row_ptr_gpu.size
+         if hasattr(self, 'col_ind_gpu') and self.col_ind_gpu:
+             total_bytes += self.col_ind_gpu.size
+         if hasattr(self, 'values_gpu') and self.values_gpu:
+             total_bytes += self.values_gpu.size
+         if hasattr(self, 'norm_factor_inv_gpu') and self.norm_factor_inv_gpu:
+             total_bytes += self.norm_factor_inv_gpu.size
+
+         # NOTE: Previous versions used .size on the DeviceAllocation object,
+         # which was problematic. If the error shows up again here, the size
+         # in bytes will have to be stored explicitly, as was done for SELL.
+         # For now, the original CSR getMatrixSize method is kept.
+
+         return total_bytes / (1024**3)
+
+     def free(self):
+         try:
+             if hasattr(self, 'col_ind_gpu') and self.col_ind_gpu:
+                 self.col_ind_gpu.free()
+             if hasattr(self, 'values_gpu') and self.values_gpu:
+                 self.values_gpu.free()
+             if hasattr(self, 'row_ptr_gpu') and self.row_ptr_gpu:
+                 self.row_ptr_gpu.free()
+             if hasattr(self, 'norm_factor_inv_gpu') and self.norm_factor_inv_gpu:
+                 self.norm_factor_inv_gpu.free()
+             if hasattr(self, 'ctx') and self.ctx:
+                 try:
+                     self.ctx.pop()
+                 except Exception:
+                     pass
+                 self.ctx = None
+             print('✅ GPU memory freed.')
+         except Exception as e:
+             print(f"❌ Error while freeing GPU memory: {e}")
+
+     def compute_density(self):
+         """
+         Returns the actual density of the CSR = NNZ / (num_rows * num_cols).
+         Requires self.h_values and self.row_ptr to exist (host).
+         """
+         if self.row_ptr is None or self.h_values is None:
+             raise RuntimeError("row_ptr and h_values are required to compute density")
+         num_rows = int(self.N * self.T)
+         num_cols = int(self.Z * self.X)
+         total_nnz = int(self.row_ptr[-1])
+         density = total_nnz / (num_rows * num_cols)
+         return density
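SparseSMatrix_CSR owns a PyCUDA context and device buffers and implements __enter__/__exit__, so the context-manager form guarantees free() runs even if allocation fails partway. A usage sketch, assuming the class is re-exported from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix (its __init__.py gains two lines in this release) and that manip is an AOT_biomaps experiment object exposing AcousticFields whose field arrays have shape (T, Z, X):

    from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix import SparseSMatrix_CSR

    def build_sparse_system_matrix(manip):
        # allocate() makes two GPU passes: count NNZ per row, then fill the CSR arrays
        with SparseSMatrix_CSR(manip, block_rows=64, relative_threshold=0.3, device=0) as S:
            S.allocate()
            print(f"density : {S.compute_density():.4%}")
            print(f"GPU size: {S.getMatrixSize():.3f} GB")
            # The host-side copies survive free() on exit from the with-block
            return S.row_ptr, S.h_col_ind, S.h_values, S.norm_factor_inv

The companion SparseSMatrix_SELL class added in the same release presumably follows the same allocate/free lifecycle.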