AOT-biomaps 2.9.261__py3-none-any.whl → 2.9.318__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of AOT-biomaps might be problematic. Click here for more details.
- AOT_biomaps/AOT_Experiment/Tomography.py +124 -0
- AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +400 -10
- AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +207 -84
- AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +442 -11
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +48 -26
- AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +172 -134
- AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
- AOT_biomaps/AOT_Recon/AlgebraicRecon.py +27 -20
- AOT_biomaps/AOT_Recon/PrimalDualRecon.py +94 -41
- AOT_biomaps/AOT_Recon/ReconTools.py +164 -18
- AOT_biomaps/__init__.py +58 -1
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.318.dist-info}/METADATA +1 -1
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.318.dist-info}/RECORD +15 -14
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.318.dist-info}/WHEEL +0 -0
- {aot_biomaps-2.9.261.dist-info → aot_biomaps-2.9.318.dist-info}/top_level.txt +0 -0
|
@@ -3,17 +3,6 @@ import numpy as np
|
|
|
3
3
|
from tqdm import trange
|
|
4
4
|
import os
|
|
5
5
|
import gc
|
|
6
|
-
import subprocess
|
|
7
|
-
import sys
|
|
8
|
-
|
|
9
|
-
# ==============================================================================
|
|
10
|
-
# ATTENTION: MANUAL CONFIGURATION REQUIRED
|
|
11
|
-
# ==============================================================================
|
|
12
|
-
# The 'sm_XX' value must correspond to the Compute Capability of your NVIDIA GPU.
|
|
13
|
-
# Examples: Kepler (sm_35), Maxwell (sm_50), Pascal (sm_61), Turing (sm_75), Ampere (sm_86).
|
|
14
|
-
# THIS LINE MUST BE MODIFIED BY THE USER to target their specific architecture.
|
|
15
|
-
GPU_COMPUTE_CAPABILITY = "sm_86"
|
|
16
|
-
# ==============================================================================
|
|
17
6
|
|
|
18
7
|
class SparseSMatrix_SELL:
|
|
19
8
|
def __init__(self, manip, block_rows=64, relative_threshold=0.3, device=0,
|
|
@@ -28,104 +17,71 @@ class SparseSMatrix_SELL:
|
|
|
28
17
|
self.X = manip.AcousticFields[0].field.shape[2]
|
|
29
18
|
self.block_rows = block_rows
|
|
30
19
|
self.relative_threshold = relative_threshold
|
|
31
|
-
|
|
32
|
-
|
|
20
|
+
|
|
21
|
+
# --- PATH RESOLUTION FIX ---
|
|
22
|
+
# The cubin file is located in the parent directory (AOT_Recon/)
|
|
23
|
+
# We use os.path.dirname(os.path.dirname(__file__)) to go up one directory level.
|
|
24
|
+
cubin_parent_dir = os.path.dirname(os.path.dirname(__file__))
|
|
25
|
+
self.module_path = os.path.join(cubin_parent_dir, module_path)
|
|
26
|
+
# --- END FIX ---
|
|
27
|
+
|
|
33
28
|
self.slice_height = slice_height
|
|
34
29
|
|
|
35
|
-
# SELL arrays (device)
|
|
30
|
+
# SELL arrays (device) & Size Tracking (CRITICAL FIX: Initialized attributes)
|
|
36
31
|
self.sell_values_gpu = None
|
|
37
32
|
self.sell_colinds_gpu = None
|
|
38
33
|
self.slice_ptr = None
|
|
39
34
|
self.slice_len = None
|
|
40
35
|
self.slice_ptr_gpu = None
|
|
41
36
|
self.slice_len_gpu = None
|
|
37
|
+
|
|
38
|
+
# Attributes to store allocated size in bytes (bypassing the problematic .size attribute)
|
|
39
|
+
self.sell_values_gpu_size = 0
|
|
40
|
+
self.sell_colinds_gpu_size = 0
|
|
41
|
+
self.slice_ptr_gpu_size = 0
|
|
42
|
+
self.slice_len_gpu_size = 0
|
|
43
|
+
|
|
42
44
|
self.total_storage = 0
|
|
43
45
|
|
|
44
46
|
self.norm_factor_inv = None
|
|
45
47
|
self.norm_factor_inv_gpu = None
|
|
48
|
+
self.norm_factor_inv_gpu_size = 0
|
|
46
49
|
|
|
47
50
|
self.sparse_mod = None
|
|
48
51
|
self.load_module()
|
|
49
|
-
|
|
50
|
-
def _compile_cubin(self, source_file="AOT_biomaps_kernels.cu"):
|
|
51
|
-
"""
|
|
52
|
-
Tries to compile the .cu file into .cubin using nvcc.
|
|
53
|
-
"""
|
|
54
|
-
print("="*60)
|
|
55
|
-
print("🛠️ CUDA COMPILATION REQUIRED")
|
|
56
|
-
print(f"Attempting to compile {source_file} to {os.path.basename(self.module_path)}...")
|
|
57
|
-
|
|
58
|
-
# The source file is assumed to be in the same directory as this Python file.
|
|
59
|
-
source_path = os.path.join(os.path.dirname(__file__), source_file)
|
|
60
|
-
cubin_path = self.module_path
|
|
61
|
-
|
|
62
|
-
if not os.path.exists(source_path):
|
|
63
|
-
print(f"❌ CRITICAL ERROR: CUDA source file {source_file} not found at {source_path}.")
|
|
64
|
-
raise FileNotFoundError(f"Could not find source file {source_file} for compilation. AOT_biomaps installation might be incomplete.")
|
|
65
|
-
|
|
66
|
-
# Construction of the nvcc command
|
|
67
|
-
command = [
|
|
68
|
-
'nvcc',
|
|
69
|
-
'-cubin',
|
|
70
|
-
f'-arch={GPU_COMPUTE_CAPABILITY}', # USES THE VARIABLE DEFINED ABOVE
|
|
71
|
-
source_path,
|
|
72
|
-
'-o',
|
|
73
|
-
cubin_path
|
|
74
|
-
]
|
|
75
|
-
|
|
76
|
-
print(f"Executing command: {' '.join(command)}")
|
|
77
|
-
|
|
78
|
-
try:
|
|
79
|
-
# Executes the command and waits for completion
|
|
80
|
-
result = subprocess.run(
|
|
81
|
-
command,
|
|
82
|
-
check=True,
|
|
83
|
-
capture_output=True,
|
|
84
|
-
text=True
|
|
85
|
-
)
|
|
86
|
-
print(f"🎉 Compilation successful! File created: {os.path.basename(cubin_path)}")
|
|
87
|
-
# print("Output nvcc:\n", result.stdout) # Uncomment to see detailed output
|
|
88
|
-
print("="*60)
|
|
89
|
-
return True
|
|
90
|
-
|
|
91
|
-
except subprocess.CalledProcessError as e:
|
|
92
|
-
print("❌ NVCC COMPILATION ERROR:")
|
|
93
|
-
print(f"Check GPU architecture: {GPU_COMPUTE_CAPABILITY}")
|
|
94
|
-
print(f"Standard error:\n{e.stderr}")
|
|
95
|
-
print("="*60)
|
|
96
|
-
return False
|
|
97
|
-
|
|
98
|
-
except FileNotFoundError:
|
|
99
|
-
print("❌ ERROR: The 'nvcc' command was not found.")
|
|
100
|
-
print("Ensure that the CUDA Toolkit is installed and 'nvcc' is in your PATH (or your Conda environment).")
|
|
101
|
-
print("="*60)
|
|
102
|
-
return False
|
|
103
52
|
|
|
104
53
|
def load_module(self):
|
|
105
|
-
"""
|
|
54
|
+
"""Loads the pre-compiled CUDA module (.cubin file)."""
|
|
106
55
|
|
|
56
|
+
# Check if the file exists at the calculated absolute path
|
|
107
57
|
if not os.path.exists(self.module_path):
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
if not self._compile_cubin():
|
|
111
|
-
# If compilation fails, re-raise the original error.
|
|
112
|
-
raise FileNotFoundError(f"{os.path.basename(self.module_path)} not found and compilation failed. Check nvcc and GPU architecture ({GPU_COMPUTE_CAPABILITY}).")
|
|
58
|
+
# The path is now correctly calculated to the parent directory.
|
|
59
|
+
raise FileNotFoundError(f"CUDA module {os.path.basename(self.module_path)} not found at path: {self.module_path}")
|
|
113
60
|
|
|
114
|
-
# Try to load
|
|
61
|
+
# Try to load the module
|
|
115
62
|
try:
|
|
116
63
|
self.sparse_mod = drv.module_from_file(self.module_path)
|
|
117
64
|
print(f"Loaded CUDA module {os.path.basename(self.module_path)}")
|
|
118
65
|
except Exception as e:
|
|
119
66
|
print(f"❌ Error loading CUDA module {os.path.basename(self.module_path)}: {e}")
|
|
120
|
-
raise RuntimeError(f"File {os.path.basename(self.module_path)} was found, but PyCUDA could not load it.") from e
|
|
67
|
+
raise RuntimeError(f"File {os.path.basename(self.module_path)} was found, but PyCUDA could not load it. Check compatibility.") from e
|
|
121
68
|
|
|
122
69
|
def free(self):
|
|
123
70
|
try:
|
|
71
|
+
# Free device allocations
|
|
124
72
|
attrs = ["sell_values_gpu","sell_colinds_gpu","slice_ptr_gpu","slice_len_gpu","norm_factor_inv_gpu"]
|
|
125
73
|
for a in attrs:
|
|
126
74
|
if hasattr(self, a) and getattr(self, a) is not None:
|
|
127
75
|
getattr(self, a).free()
|
|
128
76
|
setattr(self, a, None)
|
|
77
|
+
|
|
78
|
+
# Reset stored sizes
|
|
79
|
+
self.sell_values_gpu_size = 0
|
|
80
|
+
self.sell_colinds_gpu_size = 0
|
|
81
|
+
self.slice_ptr_gpu_size = 0
|
|
82
|
+
self.slice_len_gpu_size = 0
|
|
83
|
+
self.norm_factor_inv_gpu_size = 0
|
|
84
|
+
|
|
129
85
|
if hasattr(self, 'ctx') and self.ctx:
|
|
130
86
|
try: self.ctx.pop()
|
|
131
87
|
except Exception: pass
|
|
@@ -136,11 +92,11 @@ class SparseSMatrix_SELL:
|
|
|
136
92
|
def allocate(self):
|
|
137
93
|
"""
|
|
138
94
|
Build SELL-C-σ directly from manip AcousticFields in streaming blocks.
|
|
95
|
+
Corrected: per-block row_nnz copy, zeroing of host block, proper sync.
|
|
139
96
|
"""
|
|
140
|
-
# Ensures the module is loaded before attempting to retrieve functions
|
|
141
97
|
if self.sparse_mod is None:
|
|
142
|
-
|
|
143
|
-
|
|
98
|
+
raise RuntimeError("CUDA module not loaded. Check compilation.")
|
|
99
|
+
|
|
144
100
|
count_kernel = self.sparse_mod.get_function("count_nnz_rows_kernel")
|
|
145
101
|
fill_kernel = self.sparse_mod.get_function("fill_kernel__SELL")
|
|
146
102
|
|
|
@@ -148,30 +104,34 @@ class SparseSMatrix_SELL:
|
|
|
148
104
|
num_cols = int(self.Z * self.X)
|
|
149
105
|
C = int(self.slice_height)
|
|
150
106
|
|
|
151
|
-
# host temporary block
|
|
152
107
|
br = int(self.block_rows)
|
|
153
|
-
bytes_per_elem = np.dtype(np.float32).itemsize
|
|
154
108
|
dense_host = np.empty((br, num_cols), dtype=np.float32)
|
|
155
|
-
|
|
109
|
+
|
|
110
|
+
# Allocation dense buffer on device (size = br * num_cols)
|
|
111
|
+
dense_gpu_size = dense_host.nbytes
|
|
112
|
+
dense_gpu = drv.mem_alloc(dense_gpu_size)
|
|
156
113
|
|
|
157
|
-
# 1) count nnz per row (
|
|
114
|
+
# 1) count nnz per row (per block)
|
|
158
115
|
row_nnz = np.zeros(num_rows, dtype=np.int32)
|
|
159
|
-
|
|
116
|
+
row_nnz_gpu_block_size = br * np.dtype(np.int32).itemsize
|
|
117
|
+
row_nnz_gpu_block = drv.mem_alloc(row_nnz_gpu_block_size)
|
|
160
118
|
|
|
161
|
-
block =
|
|
119
|
+
block = 128
|
|
162
120
|
for b in trange(0, num_rows, br, desc="Count NNZ per row"):
|
|
163
121
|
R = min(br, num_rows - b)
|
|
164
|
-
#
|
|
122
|
+
# zero the host block to avoid garbage in tail when R < br
|
|
123
|
+
dense_host.fill(0.0)
|
|
165
124
|
for i in range(R):
|
|
166
125
|
rg = b + i
|
|
167
126
|
n_idx = rg // self.T
|
|
168
127
|
t_idx = rg % self.T
|
|
169
128
|
dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
|
|
170
|
-
# copy
|
|
129
|
+
# copy whole buffer (safe because we zeroed tail)
|
|
171
130
|
drv.memcpy_htod(dense_gpu, dense_host)
|
|
172
131
|
grid = ((R + block - 1) // block, 1, 1)
|
|
173
132
|
count_kernel(dense_gpu, row_nnz_gpu_block, np.int32(R), np.int32(num_cols), np.float32(self.relative_threshold),
|
|
174
|
-
|
|
133
|
+
block=(block,1,1), grid=grid)
|
|
134
|
+
drv.Context.synchronize()
|
|
175
135
|
tmp = np.empty(R, dtype=np.int32)
|
|
176
136
|
drv.memcpy_dtoh(tmp, row_nnz_gpu_block)
|
|
177
137
|
row_nnz[b:b+R] = tmp
|
|
@@ -186,7 +146,6 @@ class SparseSMatrix_SELL:
|
|
|
186
146
|
r0 = s * C
|
|
187
147
|
r1 = min(num_rows, r0 + C)
|
|
188
148
|
slice_len[s] = int(np.max(row_nnz[r0:r1])) if (r1>r0) else 0
|
|
189
|
-
# slice_ptr (int64)
|
|
190
149
|
slice_ptr = np.zeros(num_slices + 1, dtype=np.int64)
|
|
191
150
|
for s in range(num_slices):
|
|
192
151
|
slice_ptr[s+1] = slice_ptr[s] + (slice_len[s] * C)
|
|
@@ -195,85 +154,169 @@ class SparseSMatrix_SELL:
|
|
|
195
154
|
print(f"SELL: num_rows={num_rows}, num_slices={num_slices}, total_storage(padded)={total_storage}")
|
|
196
155
|
|
|
197
156
|
# allocate device SELL arrays (values float32, colinds uint32)
|
|
198
|
-
self.
|
|
199
|
-
self.
|
|
157
|
+
self.sell_values_gpu_size = total_storage * np.dtype(np.float32).itemsize
|
|
158
|
+
self.sell_colinds_gpu_size = total_storage * np.dtype(np.uint32).itemsize
|
|
159
|
+
|
|
160
|
+
# allocate and optionally zero them
|
|
161
|
+
self.sell_values_gpu = drv.mem_alloc(self.sell_values_gpu_size)
|
|
162
|
+
# It's good practice to zero the values buffer to avoid leftover memory
|
|
163
|
+
drv.memset_d32(self.sell_values_gpu, 0, total_storage)
|
|
164
|
+
|
|
165
|
+
self.sell_colinds_gpu = drv.mem_alloc(self.sell_colinds_gpu_size)
|
|
166
|
+
drv.memset_d32(self.sell_colinds_gpu, 0, total_storage)
|
|
167
|
+
|
|
200
168
|
# allocate slice metadata on device
|
|
201
169
|
self.slice_ptr = slice_ptr
|
|
202
170
|
self.slice_len = slice_len
|
|
203
|
-
|
|
204
|
-
self.
|
|
171
|
+
|
|
172
|
+
self.slice_ptr_gpu_size = self.slice_ptr.nbytes
|
|
173
|
+
self.slice_len_gpu_size = self.slice_len.nbytes
|
|
174
|
+
|
|
175
|
+
self.slice_ptr_gpu = drv.mem_alloc(self.slice_ptr_gpu_size)
|
|
176
|
+
self.slice_len_gpu = drv.mem_alloc(self.slice_len_gpu_size)
|
|
177
|
+
|
|
205
178
|
drv.memcpy_htod(self.slice_ptr_gpu, self.slice_ptr)
|
|
206
179
|
drv.memcpy_htod(self.slice_len_gpu, self.slice_len)
|
|
207
180
|
|
|
208
181
|
# 3) fill SELL arrays by streaming blocks again (use GPU fill kernel)
|
|
209
|
-
# reuse dense_host and dense_gpu
|
|
210
182
|
dense_host = np.empty((br, num_cols), dtype=np.float32)
|
|
211
183
|
dense_gpu = drv.mem_alloc(dense_host.nbytes)
|
|
212
|
-
|
|
184
|
+
|
|
185
|
+
# For per-block row_nnz pointer we allocate a buffer of max block size once, then reuse
|
|
213
186
|
row_nnz_host_gpu = drv.mem_alloc(br * np.dtype(np.int32).itemsize)
|
|
214
187
|
|
|
215
188
|
for b in trange(0, num_rows, br, desc="Fill SELL"):
|
|
216
189
|
R = min(br, num_rows - b)
|
|
190
|
+
dense_host.fill(0.0)
|
|
217
191
|
for i in range(R):
|
|
218
192
|
rg = b + i
|
|
219
193
|
n_idx = rg // self.T
|
|
220
194
|
t_idx = rg % self.T
|
|
221
195
|
dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
|
|
196
|
+
# copy host block
|
|
222
197
|
drv.memcpy_htod(dense_gpu, dense_host)
|
|
223
|
-
#
|
|
224
|
-
|
|
198
|
+
# copy corresponding row_nnz slice (only R entries)
|
|
199
|
+
drv.memcpy_htod(row_nnz_host_gpu, row_nnz[b:b+R])
|
|
200
|
+
|
|
225
201
|
grid = ((R + block - 1) // block, 1, 1)
|
|
226
202
|
fill_kernel(dense_gpu,
|
|
227
|
-
|
|
203
|
+
row_nnz_host_gpu,
|
|
228
204
|
self.slice_ptr_gpu,
|
|
229
205
|
self.slice_len_gpu,
|
|
230
206
|
self.sell_colinds_gpu,
|
|
231
207
|
self.sell_values_gpu,
|
|
232
208
|
np.int32(R),
|
|
233
209
|
np.int32(num_cols),
|
|
234
|
-
np.int32(b),
|
|
210
|
+
np.int32(b), # rows_global_offset
|
|
235
211
|
np.int32(C),
|
|
236
212
|
np.float32(self.relative_threshold),
|
|
237
213
|
block=(block,1,1), grid=grid)
|
|
214
|
+
drv.Context.synchronize()
|
|
215
|
+
|
|
238
216
|
dense_gpu.free()
|
|
239
217
|
row_nnz_host_gpu.free()
|
|
240
218
|
|
|
241
|
-
# At this point sell_values_gpu and sell_colinds_gpu are filled.
|
|
242
|
-
|
|
243
219
|
# 4) compute norm_factor_inv via GPU accumulate (col sums)
|
|
244
220
|
self.compute_norm_factor()
|
|
245
221
|
|
|
222
|
+
def apply_apodization_gpu(self, window_vector_gpu):
|
|
223
|
+
"""
|
|
224
|
+
Applique le fenêtrage directement sur self.sell_values_gpu
|
|
225
|
+
en utilisant les indices de colonnes (pixels) pour référencer
|
|
226
|
+
la fenêtre. Opération : A_values[i] *= W_vec[A_colinds[i]].
|
|
227
|
+
"""
|
|
228
|
+
if self.sparse_mod is None:
|
|
229
|
+
raise RuntimeError("Le module CUDA n'a pas été chargé.")
|
|
230
|
+
|
|
231
|
+
try:
|
|
232
|
+
apodize_kernel = self.sparse_mod.get_function("apply_apodisation_kernel__SELL")
|
|
233
|
+
except drv.LogicError as e:
|
|
234
|
+
raise RuntimeError(
|
|
235
|
+
f"Le kernel CUDA 'multiply_sell_by_window_kernel' est manquant dans le .cubin. "
|
|
236
|
+
f"Veuillez le compiler et l'ajouter. Erreur : {e}"
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
# Le total_storage inclut les éléments non-nuls et le padding SELL.
|
|
240
|
+
threads = 128
|
|
241
|
+
blocks = (self.total_storage + threads - 1) // threads
|
|
242
|
+
|
|
243
|
+
# Lancement du kernel. Il travaille sur total_storage éléments.
|
|
244
|
+
apodize_kernel(
|
|
245
|
+
self.sell_values_gpu,
|
|
246
|
+
self.sell_colinds_gpu,
|
|
247
|
+
window_vector_gpu,
|
|
248
|
+
np.int64(self.total_storage),
|
|
249
|
+
block=(threads, 1, 1),
|
|
250
|
+
grid=(blocks, 1, 1)
|
|
251
|
+
)
|
|
252
|
+
drv.Context.synchronize()
|
|
253
|
+
print("✅ Multiplication par le fenêtrage effectuée in-place sur GPU (SELL-C-σ).")
|
|
254
|
+
|
|
246
255
|
def compute_norm_factor(self):
|
|
247
256
|
"""
|
|
248
|
-
|
|
257
|
+
Compute the TRUE MLEM normalization norm_factor_inv = 1 / (A^T * 1)
|
|
258
|
+
by performing a SELL backprojection of a vector of ones.
|
|
259
|
+
This is the ONLY correct normalization for MLEM.
|
|
249
260
|
"""
|
|
250
|
-
if self.total_storage == 0:
|
|
251
|
-
raise RuntimeError("sell not built")
|
|
252
261
|
ZX = int(self.Z * self.X)
|
|
262
|
+
TN = int(self.T * self.N)
|
|
253
263
|
|
|
254
|
-
#
|
|
255
|
-
|
|
256
|
-
drv.memset_d32(
|
|
264
|
+
# Allocate device vector of ones (projections)
|
|
265
|
+
ones_gpu = drv.mem_alloc(TN * np.dtype(np.float32).itemsize)
|
|
266
|
+
drv.memset_d32(ones_gpu, 0x3f800000, TN) # 1.0f bit pattern
|
|
257
267
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
268
|
+
# Allocate output for backprojection (ZX pixels)
|
|
269
|
+
c_gpu = drv.mem_alloc(ZX * np.dtype(np.float32).itemsize)
|
|
270
|
+
drv.memset_d32(c_gpu, 0, ZX)
|
|
271
|
+
|
|
272
|
+
# Get SELL backprojection kernel
|
|
273
|
+
try:
|
|
274
|
+
bp_kernel = self.sparse_mod.get_function("backprojection_kernel__SELL")
|
|
275
|
+
except Exception as e:
|
|
276
|
+
raise RuntimeError("Missing kernel backprojection_kernel__SELL in the cubin") from e
|
|
277
|
+
|
|
278
|
+
threads = 256
|
|
279
|
+
blocks = (TN + threads - 1) // threads
|
|
280
|
+
|
|
281
|
+
# Launch GPU backprojection
|
|
282
|
+
bp_kernel(
|
|
283
|
+
self.sell_values_gpu,
|
|
284
|
+
self.sell_colinds_gpu,
|
|
285
|
+
self.slice_ptr_gpu,
|
|
286
|
+
self.slice_len_gpu,
|
|
287
|
+
ones_gpu,
|
|
288
|
+
c_gpu,
|
|
289
|
+
np.int32(TN),
|
|
290
|
+
# np.int32(ZX),
|
|
291
|
+
np.int32(self.slice_height),
|
|
292
|
+
# np.int64(self.total_storage),
|
|
293
|
+
block=(threads, 1, 1), # Utilise le nouveau nombre de threads
|
|
294
|
+
grid=(blocks, 1, 1)
|
|
295
|
+
)
|
|
263
296
|
drv.Context.synchronize()
|
|
264
297
|
|
|
265
|
-
#
|
|
266
|
-
|
|
267
|
-
drv.memcpy_dtoh(
|
|
268
|
-
|
|
298
|
+
# Copy back to host
|
|
299
|
+
c_host = np.empty(ZX, dtype=np.float32)
|
|
300
|
+
drv.memcpy_dtoh(c_host, c_gpu)
|
|
301
|
+
ones_gpu.free()
|
|
302
|
+
c_gpu.free()
|
|
303
|
+
|
|
304
|
+
# Avoid divide-by-zero
|
|
305
|
+
c_host = np.maximum(c_host, 1e-6)
|
|
306
|
+
|
|
307
|
+
# Compute inverse (stored for use in MLEM)
|
|
308
|
+
self.norm_factor_inv = (1.0 / c_host).astype(np.float32)
|
|
269
309
|
|
|
270
|
-
|
|
271
|
-
self.norm_factor_inv = (1.0 / norm).astype(np.float32)
|
|
310
|
+
# Upload to GPU
|
|
272
311
|
if self.norm_factor_inv_gpu is not None:
|
|
273
312
|
self.norm_factor_inv_gpu.free()
|
|
274
|
-
|
|
313
|
+
|
|
314
|
+
self.norm_factor_inv_gpu_size = self.norm_factor_inv.nbytes
|
|
315
|
+
self.norm_factor_inv_gpu = drv.mem_alloc(self.norm_factor_inv_gpu_size)
|
|
275
316
|
drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)
|
|
276
317
|
|
|
318
|
+
print("✓ Normalization (A^T*1) computed for MLEM.")
|
|
319
|
+
|
|
277
320
|
def compute_density(self):
|
|
278
321
|
"""
|
|
279
322
|
Returns only the density of the SELL-C-σ matrix.
|
|
@@ -288,7 +331,7 @@ class SparseSMatrix_SELL:
|
|
|
288
331
|
# Conservative estimate of non-zeros (excluding padding)
|
|
289
332
|
nnz_ell_estimated = int(0.9 * self.total_storage)
|
|
290
333
|
|
|
291
|
-
return nnz_ell_estimated / total_elements
|
|
334
|
+
return nnz_ell_estimated / total_elements # Returns only the density
|
|
292
335
|
|
|
293
336
|
def getMatrixSize(self):
|
|
294
337
|
"""
|
|
@@ -299,7 +342,7 @@ class SparseSMatrix_SELL:
|
|
|
299
342
|
|
|
300
343
|
total_bytes = 0
|
|
301
344
|
|
|
302
|
-
# Host-side arrays
|
|
345
|
+
# Host-side arrays (using .nbytes which works for NumPy arrays)
|
|
303
346
|
if hasattr(self, 'slice_ptr') and self.slice_ptr is not None:
|
|
304
347
|
total_bytes += self.slice_ptr.nbytes
|
|
305
348
|
if hasattr(self, 'slice_len') and self.slice_len is not None:
|
|
@@ -307,16 +350,11 @@ class SparseSMatrix_SELL:
|
|
|
307
350
|
if hasattr(self, 'norm_factor_inv') and self.norm_factor_inv is not None:
|
|
308
351
|
total_bytes += self.norm_factor_inv.nbytes
|
|
309
352
|
|
|
310
|
-
# GPU-side arrays
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
total_bytes += self.slice_len_gpu.size
|
|
319
|
-
if hasattr(self, 'norm_factor_inv_gpu') and self.norm_factor_inv_gpu:
|
|
320
|
-
total_bytes += self.norm_factor_inv_gpu.size
|
|
321
|
-
|
|
322
|
-
return total_bytes / (1024 ** 3) # Returns only the size in GB
|
|
353
|
+
# GPU-side arrays (using the stored size attributes instead of the problematic .size)
|
|
354
|
+
total_bytes += self.sell_values_gpu_size
|
|
355
|
+
total_bytes += self.sell_colinds_gpu_size
|
|
356
|
+
total_bytes += self.slice_ptr_gpu_size
|
|
357
|
+
total_bytes += self.slice_len_gpu_size
|
|
358
|
+
total_bytes += self.norm_factor_inv_gpu_size
|
|
359
|
+
|
|
360
|
+
return total_bytes / (1024 ** 3) # Returns only the size in GB
|
|
Binary file
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import concurrent
|
|
2
|
+
|
|
3
|
+
from AOT_biomaps.AOT_Recon.ReconTools import get_apodization_vector_gpu
|
|
2
4
|
from ._mainRecon import Recon
|
|
3
5
|
from .ReconEnums import ReconType, OptimizerType, ProcessType, SMatrixType
|
|
4
6
|
from .AOT_Optimizers import MLEM, LS
|
|
@@ -43,8 +45,6 @@ class AlgebraicRecon(Recon):
|
|
|
43
45
|
|
|
44
46
|
self.sparseThreshold = sparseThreshold
|
|
45
47
|
|
|
46
|
-
self.Z_dim = None # Used for sparse matrix reconstruction
|
|
47
|
-
|
|
48
48
|
if self.numIterations <= 0:
|
|
49
49
|
raise ValueError("Number of iterations must be greater than 0.")
|
|
50
50
|
if self.numSubsets <= 0:
|
|
@@ -729,6 +729,8 @@ class AlgebraicRecon(Recon):
|
|
|
729
729
|
"""
|
|
730
730
|
sparse_matrix = SparseSMatrix_SELL(self.experiment,relative_threshold=self.sparseThreshold)
|
|
731
731
|
sparse_matrix.allocate()
|
|
732
|
+
# fenetre_gpu = get_apodization_vector_gpu(sparse_matrix)
|
|
733
|
+
# sparse_matrix.apply_apodization_gpu(fenetre_gpu)
|
|
732
734
|
if isShowLogs:
|
|
733
735
|
print(f" Sparse matrix size: {sparse_matrix.getMatrixSize()} GB")
|
|
734
736
|
print(f"Sparse matrix density: {sparse_matrix.compute_density()}")
|
|
@@ -756,7 +758,6 @@ class AlgebraicRecon(Recon):
|
|
|
756
758
|
max_saves=self.maxSaves,
|
|
757
759
|
show_logs=show_logs,
|
|
758
760
|
smatrixType=self.smatrixType,
|
|
759
|
-
Z=self.Z_dim
|
|
760
761
|
)
|
|
761
762
|
else:
|
|
762
763
|
self.reconLaser, self.indices = MLEM(SMatrix=self.SMatrix,
|
|
@@ -770,30 +771,36 @@ class AlgebraicRecon(Recon):
|
|
|
770
771
|
max_saves=self.maxSaves,
|
|
771
772
|
show_logs=show_logs,
|
|
772
773
|
smatrixType=self.smatrixType,
|
|
773
|
-
Z=self.Z_dim
|
|
774
774
|
)
|
|
775
775
|
elif self.optimizer.value == OptimizerType.LS.value:
|
|
776
776
|
if self.alpha is None:
|
|
777
777
|
raise ValueError("Alpha (regularization parameter) must be set for LS reconstruction.")
|
|
778
778
|
if withTumor:
|
|
779
|
-
self.reconPhantom, self.indices = LS(SMatrix=self.SMatrix,
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
779
|
+
self.reconPhantom, self.indices = LS(SMatrix=self.SMatrix,
|
|
780
|
+
y=self.experiment.AOsignal_withTumor,
|
|
781
|
+
numIterations=self.numIterations,
|
|
782
|
+
isSavingEachIteration=self.isSavingEachIteration,
|
|
783
|
+
withTumor=withTumor,
|
|
784
|
+
device=self.device,
|
|
785
|
+
use_numba=self.isMultiCPU,
|
|
786
|
+
denominator_threshold=self.denominatorThreshold,
|
|
787
|
+
max_saves=self.maxSaves,
|
|
788
|
+
show_logs=show_logs,
|
|
789
|
+
smatrixType=self.smatrixType
|
|
787
790
|
)
|
|
788
791
|
else:
|
|
789
|
-
self.reconLaser, self.indices = LS(SMatrix=self.SMatrix,
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
792
|
+
self.reconLaser, self.indices = LS(SMatrix=self.SMatrix,
|
|
793
|
+
y=self.experiment.AOsignal_withoutTumor,
|
|
794
|
+
numIterations=self.numIterations,
|
|
795
|
+
isSavingEachIteration=self.isSavingEachIteration,
|
|
796
|
+
withTumor=withTumor,
|
|
797
|
+
alpha=self.alpha,
|
|
798
|
+
device=self.device,
|
|
799
|
+
use_numba=self.isMultiCPU,
|
|
800
|
+
denominator_threshold=self.denominatorThreshold,
|
|
801
|
+
max_saves=self.maxSaves,
|
|
802
|
+
show_logs=show_logs,
|
|
803
|
+
smatrixType=self.smatrixType
|
|
797
804
|
)
|
|
798
805
|
else:
|
|
799
806
|
raise ValueError(f"Only MLEM and LS are supported for simple algebraic reconstruction. {self.optimizer.value} need Bayesian reconstruction")
|