AOT-biomaps 2.9.138__py3-none-any.whl → 2.9.279__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of AOT-biomaps might be problematic. Click here for more details.
- AOT_biomaps/AOT_Acoustic/AcousticTools.py +35 -115
- AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
- AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +22 -18
- AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
- AOT_biomaps/AOT_Experiment/_mainExperiment.py +102 -68
- AOT_biomaps/AOT_Optic/_mainOptic.py +124 -58
- AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +72 -108
- AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +474 -289
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +173 -68
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +360 -154
- AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +150 -111
- AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +281 -0
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +328 -0
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
- AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
- AOT_biomaps/AOT_Recon/AlgebraicRecon.py +359 -238
- AOT_biomaps/AOT_Recon/AnalyticRecon.py +29 -41
- AOT_biomaps/AOT_Recon/BayesianRecon.py +165 -91
- AOT_biomaps/AOT_Recon/DeepLearningRecon.py +4 -1
- AOT_biomaps/AOT_Recon/PrimalDualRecon.py +175 -31
- AOT_biomaps/AOT_Recon/ReconEnums.py +38 -3
- AOT_biomaps/AOT_Recon/ReconTools.py +184 -77
- AOT_biomaps/AOT_Recon/__init__.py +1 -0
- AOT_biomaps/AOT_Recon/_mainRecon.py +144 -74
- AOT_biomaps/__init__.py +4 -36
- {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/METADATA +2 -1
- aot_biomaps-2.9.279.dist-info/RECORD +47 -0
- aot_biomaps-2.9.138.dist-info/RECORD +0 -43
- {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/WHEEL +0 -0
- {aot_biomaps-2.9.138.dist-info → aot_biomaps-2.9.279.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
import pycuda.driver as drv
|
|
2
|
+
import numpy as np
|
|
3
|
+
from tqdm import trange
|
|
4
|
+
import os
|
|
5
|
+
import gc
|
|
6
|
+
|
|
7
|
+
class SparseSMatrix_SELL:
    """GPU-backed sparse system matrix stored in SELL-C-σ (sliced ELLPACK) format.

    The matrix is built by `allocate()` from `manip.AcousticFields`: logical
    rows are (field index, time step) pairs (N*T rows) and logical columns are
    image pixels (Z*X columns), with entries kept only above a relative
    threshold. CUDA kernels are loaded from a pre-compiled .cubin module.
    """

    def __init__(self, manip, block_rows=64, relative_threshold=0.3, device=0,
                 module_path="AOT_biomaps_kernels.cubin", slice_height=32):
        """Create a CUDA context and record matrix geometry; no SELL data is built yet.

        Parameters
        ----------
        manip : object exposing ``AcousticFields``, a sequence whose elements
            have a ``field`` array of shape (T, Z, X) — assumed from usage here
            and in ``allocate()``; confirm against the caller.
        block_rows : number of dense rows staged per streaming block.
        relative_threshold : threshold passed to the count/fill kernels.
        device : CUDA device ordinal.
        module_path : .cubin filename, resolved relative to the parent package dir.
        slice_height : SELL slice height C.
        """
        # Create a dedicated CUDA context on the requested device.
        drv.init()
        self.device = drv.Device(device)
        self.ctx = self.device.make_context()
        self.manip = manip
        # Matrix logical shape: rows = N*T (field x time), cols = Z*X (pixels).
        self.N = len(manip.AcousticFields)
        self.T = manip.AcousticFields[0].field.shape[0]
        self.Z = manip.AcousticFields[0].field.shape[1]
        self.X = manip.AcousticFields[0].field.shape[2]
        self.block_rows = block_rows
        self.relative_threshold = relative_threshold

        # --- PATH RESOLUTION FIX ---
        # The cubin file is located in the parent directory (AOT_Recon/).
        # os.path.dirname(os.path.dirname(__file__)) goes up one directory level.
        cubin_parent_dir = os.path.dirname(os.path.dirname(__file__))
        self.module_path = os.path.join(cubin_parent_dir, module_path)
        # --- END FIX ---

        self.slice_height = slice_height

        # SELL arrays (device) and host metadata; populated by allocate().
        self.sell_values_gpu = None
        self.sell_colinds_gpu = None
        self.slice_ptr = None
        self.slice_len = None
        self.slice_ptr_gpu = None
        self.slice_len_gpu = None

        # Allocated sizes in bytes, tracked manually instead of relying on the
        # DeviceAllocation .size attribute.
        self.sell_values_gpu_size = 0
        self.sell_colinds_gpu_size = 0
        self.slice_ptr_gpu_size = 0
        self.slice_len_gpu_size = 0

        # Total padded element count of the SELL storage (set by allocate()).
        self.total_storage = 0

        self.norm_factor_inv = None
        self.norm_factor_inv_gpu = None
        self.norm_factor_inv_gpu_size = 0

        self.sparse_mod = None
        self.load_module()
|
|
52
|
+
|
|
53
|
+
def load_module(self):
|
|
54
|
+
"""Loads the pre-compiled CUDA module (.cubin file)."""
|
|
55
|
+
|
|
56
|
+
# Check if the file exists at the calculated absolute path
|
|
57
|
+
if not os.path.exists(self.module_path):
|
|
58
|
+
# The path is now correctly calculated to the parent directory.
|
|
59
|
+
raise FileNotFoundError(f"CUDA module {os.path.basename(self.module_path)} not found at path: {self.module_path}")
|
|
60
|
+
|
|
61
|
+
# Try to load the module
|
|
62
|
+
try:
|
|
63
|
+
self.sparse_mod = drv.module_from_file(self.module_path)
|
|
64
|
+
print(f"Loaded CUDA module {os.path.basename(self.module_path)}")
|
|
65
|
+
except Exception as e:
|
|
66
|
+
print(f"❌ Error loading CUDA module {os.path.basename(self.module_path)}: {e}")
|
|
67
|
+
raise RuntimeError(f"File {os.path.basename(self.module_path)} was found, but PyCUDA could not load it. Check compatibility.") from e
|
|
68
|
+
|
|
69
|
+
def free(self):
|
|
70
|
+
try:
|
|
71
|
+
# Free device allocations
|
|
72
|
+
attrs = ["sell_values_gpu","sell_colinds_gpu","slice_ptr_gpu","slice_len_gpu","norm_factor_inv_gpu"]
|
|
73
|
+
for a in attrs:
|
|
74
|
+
if hasattr(self, a) and getattr(self, a) is not None:
|
|
75
|
+
getattr(self, a).free()
|
|
76
|
+
setattr(self, a, None)
|
|
77
|
+
|
|
78
|
+
# Reset stored sizes
|
|
79
|
+
self.sell_values_gpu_size = 0
|
|
80
|
+
self.sell_colinds_gpu_size = 0
|
|
81
|
+
self.slice_ptr_gpu_size = 0
|
|
82
|
+
self.slice_len_gpu_size = 0
|
|
83
|
+
self.norm_factor_inv_gpu_size = 0
|
|
84
|
+
|
|
85
|
+
if hasattr(self, 'ctx') and self.ctx:
|
|
86
|
+
try: self.ctx.pop()
|
|
87
|
+
except Exception: pass
|
|
88
|
+
self.ctx = None
|
|
89
|
+
except Exception as e:
|
|
90
|
+
print("Error freeing GPU memory:", e)
|
|
91
|
+
|
|
92
|
+
    def allocate(self):
        """
        Build SELL-C-σ directly from manip AcousticFields in streaming blocks.

        Three passes: (1) count non-zeros per row with a GPU kernel, streaming
        ``block_rows`` dense rows at a time; (2) derive per-slice widths and
        padded offsets on the host; (3) stream the dense rows again and let a
        fill kernel write values/column indices into the padded SELL arrays.
        Finally the column-sum normalization factors are computed.

        NOTE: This is the logic of allocate_sell_c_sigma_direct from the working class.
        """
        if self.sparse_mod is None:
            raise RuntimeError("CUDA module not loaded. Check compilation.")

        # NOTE: these kernel names (count_nnz_rows_kernel, fill_kernel__SELL)
        # are the ones present in the working cubin.
        count_kernel = self.sparse_mod.get_function("count_nnz_rows_kernel")
        fill_kernel = self.sparse_mod.get_function("fill_kernel__SELL")

        num_rows = int(self.N * self.T)
        num_cols = int(self.Z * self.X)
        C = int(self.slice_height)  # SELL slice height

        # Host staging buffer for one block of dense rows.
        br = int(self.block_rows)
        bytes_per_elem = np.dtype(np.float32).itemsize  # NOTE(review): unused below
        dense_host = np.empty((br, num_cols), dtype=np.float32)

        # Allocation 1: dense staging block on the GPU.
        dense_gpu_size = dense_host.nbytes
        dense_gpu = drv.mem_alloc(dense_gpu_size)

        # 1) count nnz per row (streamed in small blocks through the GPU kernel)
        row_nnz = np.zeros(num_rows, dtype=np.int32)
        row_nnz_gpu_block_size = br * np.dtype(np.int32).itemsize
        row_nnz_gpu_block = drv.mem_alloc(row_nnz_gpu_block_size)

        block = 256  # threads per CUDA block
        for b in trange(0, num_rows, br, desc="Count NNZ per row"):
            R = min(br, num_rows - b)
            # Stage R rows: global row rg maps to (field n_idx, time t_idx).
            for i in range(R):
                rg = b + i
                n_idx = rg // self.T
                t_idx = rg % self.T
                dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
            # Copies the whole staging buffer (the kernel only reads the first R rows).
            drv.memcpy_htod(dense_gpu, dense_host)
            grid = ((R + block - 1) // block, 1, 1)
            count_kernel(dense_gpu, row_nnz_gpu_block, np.int32(R), np.int32(num_cols), np.float32(self.relative_threshold),
                         block=(block,1,1), grid=grid)
            tmp = np.empty(R, dtype=np.int32)
            drv.memcpy_dtoh(tmp, row_nnz_gpu_block)
            row_nnz[b:b+R] = tmp

        row_nnz_gpu_block.free()
        dense_gpu.free()

        # 2) per-slice max row length and exclusive prefix offsets (slice_ptr)
        num_slices = (num_rows + C - 1) // C
        slice_len = np.zeros(num_slices, dtype=np.int32)
        for s in range(num_slices):
            r0 = s * C
            r1 = min(num_rows, r0 + C)
            slice_len[s] = int(np.max(row_nnz[r0:r1])) if (r1>r0) else 0
        # slice_ptr (int64): each slice stores slice_len[s] * C padded elements.
        slice_ptr = np.zeros(num_slices + 1, dtype=np.int64)
        for s in range(num_slices):
            slice_ptr[s+1] = slice_ptr[s] + (slice_len[s] * C)
        total_storage = int(slice_ptr[-1])
        self.total_storage = total_storage
        print(f"SELL: num_rows={num_rows}, num_slices={num_slices}, total_storage(padded)={total_storage}")

        # Allocate the device SELL arrays (values float32, colinds uint32).
        self.sell_values_gpu_size = total_storage * np.dtype(np.float32).itemsize
        self.sell_colinds_gpu_size = total_storage * np.dtype(np.uint32).itemsize

        self.sell_values_gpu = drv.mem_alloc(self.sell_values_gpu_size)
        self.sell_colinds_gpu = drv.mem_alloc(self.sell_colinds_gpu_size)

        # Keep slice metadata on host and mirror it on the device.
        self.slice_ptr = slice_ptr
        self.slice_len = slice_len

        self.slice_ptr_gpu_size = self.slice_ptr.nbytes
        self.slice_len_gpu_size = self.slice_len.nbytes

        self.slice_ptr_gpu = drv.mem_alloc(self.slice_ptr_gpu_size)
        self.slice_len_gpu = drv.mem_alloc(self.slice_len_gpu_size)

        drv.memcpy_htod(self.slice_ptr_gpu, self.slice_ptr)
        drv.memcpy_htod(self.slice_len_gpu, self.slice_len)

        # 3) fill SELL arrays by streaming the dense blocks again (GPU fill kernel)
        # Reuse a fresh staging buffer and a new device block.
        dense_host = np.empty((br, num_cols), dtype=np.float32)

        dense_gpu_2_size = dense_host.nbytes
        dense_gpu = drv.mem_alloc(dense_gpu_2_size)

        # NOTE(review): this buffer is allocated but never passed to the kernel
        # (a literal null pointer is used instead); kept for API symmetry.
        row_nnz_host_gpu_size = br * np.dtype(np.int32).itemsize
        row_nnz_host_gpu = drv.mem_alloc(row_nnz_host_gpu_size)

        for b in trange(0, num_rows, br, desc="Fill SELL"):
            R = min(br, num_rows - b)
            for i in range(R):
                rg = b + i
                n_idx = rg // self.T
                t_idx = rg % self.T
                dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
            drv.memcpy_htod(dense_gpu, dense_host)
            # The kernel recomputes the threshold itself; the row_nnz argument
            # is a placeholder. rows_global_offset tells it where to write.
            grid = ((R + block - 1) // block, 1, 1)
            fill_kernel(dense_gpu,
                        np.intp(0),  # placeholder for row_nnz pointer (not used)
                        self.slice_ptr_gpu,
                        self.slice_len_gpu,
                        self.sell_colinds_gpu,
                        self.sell_values_gpu,
                        np.int32(R),
                        np.int32(num_cols),
                        np.int32(b),  # rows_global_offset
                        np.int32(C),
                        np.float32(self.relative_threshold),
                        block=(block,1,1), grid=grid)
        dense_gpu.free()
        row_nnz_host_gpu.free()

        # 4) compute norm_factor_inv via GPU accumulation of column sums.
        self.compute_norm_factor()
|
|
218
|
+
|
|
219
|
+
def apply_apodization_gpu(self, window_vector_gpu):
|
|
220
|
+
"""
|
|
221
|
+
Applique le fenêtrage directement sur self.sell_values_gpu
|
|
222
|
+
en utilisant les indices de colonnes (pixels) pour référencer
|
|
223
|
+
la fenêtre. Opération : A_values[i] *= W_vec[A_colinds[i]].
|
|
224
|
+
"""
|
|
225
|
+
if self.sparse_mod is None:
|
|
226
|
+
raise RuntimeError("Le module CUDA n'a pas été chargé.")
|
|
227
|
+
|
|
228
|
+
try:
|
|
229
|
+
apodize_kernel = self.sparse_mod.get_function("apply_apodisation_kernel__SELL")
|
|
230
|
+
except drv.LogicError as e:
|
|
231
|
+
raise RuntimeError(
|
|
232
|
+
f"Le kernel CUDA 'multiply_sell_by_window_kernel' est manquant dans le .cubin. "
|
|
233
|
+
f"Veuillez le compiler et l'ajouter. Erreur : {e}"
|
|
234
|
+
)
|
|
235
|
+
|
|
236
|
+
# Le total_storage inclut les éléments non-nuls et le padding SELL.
|
|
237
|
+
threads = 256
|
|
238
|
+
blocks = (self.total_storage + threads - 1) // threads
|
|
239
|
+
|
|
240
|
+
# Lancement du kernel. Il travaille sur total_storage éléments.
|
|
241
|
+
apodize_kernel(
|
|
242
|
+
self.sell_values_gpu,
|
|
243
|
+
self.sell_colinds_gpu,
|
|
244
|
+
window_vector_gpu,
|
|
245
|
+
np.int64(self.total_storage),
|
|
246
|
+
block=(threads, 1, 1),
|
|
247
|
+
grid=(blocks, 1, 1)
|
|
248
|
+
)
|
|
249
|
+
drv.Context.synchronize()
|
|
250
|
+
print("✅ Multiplication par le fenêtrage effectuée in-place sur GPU (SELL-C-σ).")
|
|
251
|
+
    # --- Normalization helper (known-working implementation) ---
    def compute_norm_factor(self):
        """
        Accumulate column sums on GPU using accumulate_columns_atomic, then compute inverse.

        Produces ``self.norm_factor_inv`` (host float32, length Z*X) and a
        device copy ``self.norm_factor_inv_gpu``. Column sums are clamped to
        1e-6 (in float64) before inversion to avoid division by zero for
        empty columns.
        """
        if self.total_storage == 0:
            raise RuntimeError("sell not built")
        ZX = int(self.Z * self.X)

        # Zero-initialized per-column accumulator on the device.
        col_sum_gpu_size = ZX * np.dtype(np.float32).itemsize
        col_sum_gpu = drv.mem_alloc(col_sum_gpu_size)
        drv.memset_d32(col_sum_gpu, 0, ZX)

        # FIX: Kernel name is "accumulate_columns_atomic"
        acc_kernel = self.sparse_mod.get_function("accumulate_columns_atomic")

        # One thread per stored element; the kernel scatters into col_sum_gpu.
        threads = 256
        blocks = (self.total_storage + threads - 1) // threads
        acc_kernel(self.sell_values_gpu, self.sell_colinds_gpu, np.int64(self.total_storage), col_sum_gpu,
                   block=(threads,1,1), grid=(blocks,1,1))
        drv.Context.synchronize()

        # Copy the column sums back to the host and release the accumulator.
        norm_host = np.empty(ZX, dtype=np.float32)
        drv.memcpy_dtoh(norm_host, col_sum_gpu)
        col_sum_gpu.free()

        # Clamp, invert in float64, then store as float32.
        norm = np.maximum(norm_host.astype(np.float64), 1e-6)
        self.norm_factor_inv = (1.0 / norm).astype(np.float32)
        # Replace any previous device copy before re-uploading.
        if self.norm_factor_inv_gpu is not None:
            self.norm_factor_inv_gpu.free()

        self.norm_factor_inv_gpu_size = self.norm_factor_inv.nbytes
        self.norm_factor_inv_gpu = drv.mem_alloc(self.norm_factor_inv_gpu_size)
        drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)
|
|
287
|
+
|
|
288
|
+
def compute_density(self):
|
|
289
|
+
"""
|
|
290
|
+
Returns only the density of the SELL-C-σ matrix.
|
|
291
|
+
"""
|
|
292
|
+
if not hasattr(self, 'slice_ptr') or self.slice_ptr is None:
|
|
293
|
+
raise RuntimeError("The SELL-C-σ matrix is not allocated.")
|
|
294
|
+
|
|
295
|
+
num_rows = self.N * self.T
|
|
296
|
+
num_cols = self.Z * self.X
|
|
297
|
+
total_elements = num_rows * num_cols
|
|
298
|
+
|
|
299
|
+
# Conservative estimate of non-zeros (excluding padding)
|
|
300
|
+
nnz_ell_estimated = int(0.9 * self.total_storage)
|
|
301
|
+
|
|
302
|
+
return nnz_ell_estimated / total_elements # Returns only the density
|
|
303
|
+
|
|
304
|
+
def getMatrixSize(self):
|
|
305
|
+
"""
|
|
306
|
+
Returns the total size of the SELL-C-σ matrix in Gigabytes (GB).
|
|
307
|
+
"""
|
|
308
|
+
if self.sell_values_gpu is None:
|
|
309
|
+
return {"error": "The SELL-C-σ matrix is not yet allocated."}
|
|
310
|
+
|
|
311
|
+
total_bytes = 0
|
|
312
|
+
|
|
313
|
+
# Host-side arrays (using .nbytes which works for NumPy arrays)
|
|
314
|
+
if hasattr(self, 'slice_ptr') and self.slice_ptr is not None:
|
|
315
|
+
total_bytes += self.slice_ptr.nbytes
|
|
316
|
+
if hasattr(self, 'slice_len') and self.slice_len is not None:
|
|
317
|
+
total_bytes += self.slice_len.nbytes
|
|
318
|
+
if hasattr(self, 'norm_factor_inv') and self.norm_factor_inv is not None:
|
|
319
|
+
total_bytes += self.norm_factor_inv.nbytes
|
|
320
|
+
|
|
321
|
+
# GPU-side arrays (using the stored size attributes instead of the problematic .size)
|
|
322
|
+
total_bytes += self.sell_values_gpu_size
|
|
323
|
+
total_bytes += self.sell_colinds_gpu_size
|
|
324
|
+
total_bytes += self.slice_ptr_gpu_size
|
|
325
|
+
total_bytes += self.slice_len_gpu_size
|
|
326
|
+
total_bytes += self.norm_factor_inv_gpu_size
|
|
327
|
+
|
|
328
|
+
return total_bytes / (1024 ** 3) # Returns only the size in GB
|
|
Binary file
|