AOT-biomaps 2.9.167__py3-none-any.whl → 2.9.270__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of AOT-biomaps might be problematic.

Files changed (29)
  1. AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
  2. AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +14 -7
  3. AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
  4. AOT_biomaps/AOT_Experiment/_mainExperiment.py +95 -55
  5. AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +48 -13
  6. AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +9 -6
  7. AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +118 -38
  8. AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +305 -102
  9. AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +1 -1
  10. AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
  11. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +281 -0
  12. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +295 -0
  13. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
  14. AOT_biomaps/AOT_Recon/AOT_biomaps_kernels.cubin +0 -0
  15. AOT_biomaps/AOT_Recon/AlgebraicRecon.py +262 -149
  16. AOT_biomaps/AOT_Recon/AnalyticRecon.py +27 -42
  17. AOT_biomaps/AOT_Recon/BayesianRecon.py +84 -151
  18. AOT_biomaps/AOT_Recon/DeepLearningRecon.py +1 -1
  19. AOT_biomaps/AOT_Recon/PrimalDualRecon.py +69 -62
  20. AOT_biomaps/AOT_Recon/ReconEnums.py +27 -2
  21. AOT_biomaps/AOT_Recon/ReconTools.py +120 -12
  22. AOT_biomaps/AOT_Recon/__init__.py +1 -0
  23. AOT_biomaps/AOT_Recon/_mainRecon.py +73 -59
  24. AOT_biomaps/__init__.py +4 -74
  25. {aot_biomaps-2.9.167.dist-info → aot_biomaps-2.9.270.dist-info}/METADATA +2 -1
  26. aot_biomaps-2.9.270.dist-info/RECORD +47 -0
  27. aot_biomaps-2.9.167.dist-info/RECORD +0 -43
  28. {aot_biomaps-2.9.167.dist-info → aot_biomaps-2.9.270.dist-info}/WHEEL +0 -0
  29. {aot_biomaps-2.9.167.dist-info → aot_biomaps-2.9.270.dist-info}/top_level.txt +0 -0
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py
@@ -0,0 +1,281 @@
+ import pycuda.driver as drv
+ import numpy as np
+ from pycuda.compiler import SourceModule
+ from tqdm import trange
+ import gc
+ import os
+
+ class SparseSMatrix_CSR:
+     """Builds a CSR matrix from a `manip` object.
+     Usage:
+         S = SparseSMatrix_CSR(manip)
+         S.allocate()
+     After allocate(), the following are available: row_ptr (host np.int64 array), row_ptr_gpu (device ptr),
+     h_col_ind, h_values, col_ind_gpu, values_gpu, norm_factor_inv.
+     """
+
+     def __init__(self, manip, block_rows=64, relative_threshold=0.3, device=0):
+         drv.init()
+         self.device = drv.Device(device)
+         self.ctx = self.device.make_context()
+         self.manip = manip
+         self.N = len(manip.AcousticFields)
+         self.T = manip.AcousticFields[0].field.shape[0]
+         self.Z = manip.AcousticFields[0].field.shape[1]
+         self.X = manip.AcousticFields[0].field.shape[2]
+         self.block_rows = block_rows
+         self.relative_threshold = relative_threshold
+
+         # --- FIX: resolve the path to the .cubin (located in AOT_Recon/) ---
+         # SparseSMatrix_CSR.py lives in AOT_Recon/AOT_SparseSMatrix/,
+         # so we go up one directory level to reach AOT_Recon/.
+         cubin_parent_dir = os.path.dirname(os.path.dirname(__file__))
+         self.module_path = os.path.join(cubin_parent_dir, "AOT_biomaps_kernels.cubin")
+         # --- END FIX ---
+
+         self.h_dense = None
+         self.row_ptr = None
+         self.row_ptr_gpu = None
+         self.h_col_ind = None
+         self.h_values = None
+         self.total_nnz = 0
+         self.norm_factor_inv = None
+         self.sparse_mod = None
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc, tb):
+         self.free()
+
+     def load_precompiled_module(self):
+         """
+         Loads the pre-compiled CUDA module (.cubin) using the resolved path.
+         Removes the JIT compilation logic.
+         """
+         so_path = self.module_path  # use the path resolved in __init__
+
+         if not os.path.exists(so_path):
+             raise FileNotFoundError(
+                 f"CUDA module {os.path.basename(so_path)} not found at path: {so_path}. "
+                 "Make sure it is compiled and placed correctly."
+             )
+
+         try:
+             self.sparse_mod = drv.module_from_file(so_path)
+             print(f"✅ CUDA module loaded from {so_path}")
+         except Exception as e:
+             raise RuntimeError(f"File {os.path.basename(so_path)} was found, but PyCUDA could not load it. Check compatibility.") from e
+
+     def estimate_nnz_cpu(self):
+         """Fast (non-exact) estimate — useful if a rough lower bound is enough.
+         Recommended: use the exact GPU pass (count_nnz_rows_kernel) instead.
+         """
+         total = 0
+         for n in range(self.N):
+             field = self.manip.AcousticFields[n].field
+             for t in range(self.T):
+                 row = field[t].flatten()
+                 row_max = np.max(np.abs(row))
+                 thr = row_max * self.relative_threshold
+                 total += np.count_nonzero(np.abs(row) > thr)
+         return int(total)
+
+     def allocate(self, kernel_module_path=None):
+         try:
+             # --- 1. Block-by-block construction (without keeping the whole dense matrix when possible) ---
+             num_rows = self.N * self.T
+             num_cols = self.Z * self.X
+             bytes_float = np.dtype(np.float32).itemsize
+
+             # Load the module
+             # FIX: always load from self.module_path (resolved in __init__)
+             self.load_precompiled_module()
+
+             count_nnz_kernel = self.sparse_mod.get_function('count_nnz_rows_kernel')
+             fill_csr_kernel = self.sparse_mod.get_function('fill_kernel__CSR')
+
+             # allocate host row_ptr
+             self.row_ptr = np.zeros(num_rows + 1, dtype=np.int64)
+
+             # GPU temp buffers
+             dense_block_host = np.empty((self.block_rows, num_cols), dtype=np.float32)
+             dense_block_gpu = drv.mem_alloc(self.block_rows * num_cols * bytes_float)
+             row_nnz_gpu = drv.mem_alloc(self.block_rows * np.dtype(np.int32).itemsize)
+
+             block_size = 128
+
+             # --- Count NNZ per row with the GPU kernel, to stay consistent with the filling logic ---
+             for b in trange(0, num_rows, self.block_rows, desc='Counting NNZ'):
+                 current_rows = min(self.block_rows, num_rows - b)
+                 # Fill dense_block_host from manip
+                 for r in range(current_rows):
+                     global_row = b + r
+                     n_idx = global_row // self.T
+                     t_idx = global_row % self.T
+                     dense_block_host[r, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+                 drv.memcpy_htod(dense_block_gpu, dense_block_host)
+
+                 grid = ((current_rows + block_size - 1) // block_size, 1, 1)
+                 # Note: 'count_nnz_rows_kernel' must match the symbol exported by the .cubin
+                 count_nnz_kernel(dense_block_gpu, row_nnz_gpu,
+                                  np.int32(current_rows), np.int32(num_cols),
+                                  np.float32(self.relative_threshold),
+                                  block=(block_size, 1, 1), grid=grid)
+
+                 row_nnz_host = np.empty(current_rows, dtype=np.int32)
+                 drv.memcpy_dtoh(row_nnz_host, row_nnz_gpu)
+                 self.row_ptr[b + 1:b + current_rows + 1] = self.row_ptr[b] + np.cumsum(row_nnz_host, dtype=np.int64)
+
+             # total nnz
+             self.total_nnz = int(self.row_ptr[-1])
+             print(f"Total NNZ: {self.total_nnz}")
+
+             # allocate final arrays
+             self.h_col_ind = np.zeros(self.total_nnz, dtype=np.uint32)
+             self.h_values = np.zeros(self.total_nnz, dtype=np.float32)
+
+             # copy row_ptr to device once
+             self.row_ptr_gpu = drv.mem_alloc(self.row_ptr.nbytes)
+             drv.memcpy_htod(self.row_ptr_gpu, self.row_ptr)
+
+             # allocate device arrays for final csr
+             self.col_ind_gpu = drv.mem_alloc(self.h_col_ind.nbytes)
+             self.values_gpu = drv.mem_alloc(self.h_values.nbytes)
+
+             # --- Fill CSR per-block ---
+             for b in trange(0, num_rows, self.block_rows, desc='Filling CSR'):
+                 current_rows = min(self.block_rows, num_rows - b)
+                 for r in range(current_rows):
+                     global_row = b + r
+                     n_idx = global_row // self.T
+                     t_idx = global_row % self.T
+                     dense_block_host[r, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+                 drv.memcpy_htod(dense_block_gpu, dense_block_host)
+
+                 grid = ((current_rows + block_size - 1) // block_size, 1, 1)
+                 fill_csr_kernel(dense_block_gpu,
+                                 self.row_ptr_gpu,
+                                 self.col_ind_gpu,
+                                 self.values_gpu,
+                                 np.int32(b),
+                                 np.int32(current_rows),
+                                 np.int32(num_cols),
+                                 np.float32(self.relative_threshold),
+                                 np.int64(self.total_nnz),
+                                 block=(block_size, 1, 1), grid=grid)
+                 drv.Context.synchronize()
+
+             # copy back
+             drv.memcpy_dtoh(self.h_col_ind, self.col_ind_gpu)
+             drv.memcpy_dtoh(self.h_values, self.values_gpu)
+             print('CSR built ✔')
+
+             # compute normalization factor from CSR (sum per column)
+             self.compute_norm_factor_from_csr()
+
+             # free temporaries
+             dense_block_gpu.free(); row_nnz_gpu.free()
+
+         except Exception as e:
+             print(f"❌ Detailed error: {e}")
+             self.free()
+             raise
+
+     def compute_norm_factor_from_csr(self):
+         ZX = self.Z * self.X
+
+         # 1) Allocate a column-sum vector on the GPU
+         col_sum_gpu = drv.mem_alloc(ZX * np.dtype(np.float32).itemsize)
+         drv.memset_d32(col_sum_gpu, 0, ZX)
+
+         # 2) Retrieve the kernel
+         # FIX: use the generic kernel name 'accumulate_columns_atomic', as in SELL
+         # (the .cubin is shared); assuming the generic name here avoids a LogicError
+         # from looking up a missing __CSR-suffixed symbol.
+         acc_kernel = self.sparse_mod.get_function("accumulate_columns_atomic")
+
+         # 3) Launch the kernel
+         threads = 256
+         blocks = (self.total_nnz + threads - 1) // threads
+
+         acc_kernel(
+             self.values_gpu,
+             self.col_ind_gpu,
+             np.int64(self.total_nnz),
+             col_sum_gpu,
+             block=(threads, 1, 1),
+             grid=(blocks, 1, 1)
+         )
+         drv.Context.synchronize()
+
+         # 4) Retrieve the result
+         norm = np.empty(ZX, dtype=np.float32)
+         drv.memcpy_dtoh(norm, col_sum_gpu)
+         col_sum_gpu.free()
+
+         norm = np.maximum(norm.astype(np.float64), 1e-6)
+         self.norm_factor_inv = (1.0 / norm).astype(np.float32)
+
+         self.norm_factor_inv_gpu = drv.mem_alloc(self.norm_factor_inv.nbytes)
+         drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)
+
+     def getMatrixSize(self):
+         """
+         Returns the total size of the CSR matrix in GB (summing the GPU memory).
+         """
+         if self.row_ptr is None:
+             return {"error": "The sparse matrix is not allocated yet."}
+
+         total_bytes = 0
+
+         # GPU memory (row_ptr_gpu, col_ind_gpu, values_gpu, norm_factor_inv_gpu)
+         if hasattr(self, 'row_ptr_gpu') and self.row_ptr_gpu:
+             total_bytes += self.row_ptr_gpu.size
+         if hasattr(self, 'col_ind_gpu') and self.col_ind_gpu:
+             total_bytes += self.col_ind_gpu.size
+         if hasattr(self, 'values_gpu') and self.values_gpu:
+             total_bytes += self.values_gpu.size
+         if hasattr(self, 'norm_factor_inv_gpu') and self.norm_factor_inv_gpu:
+             total_bytes += self.norm_factor_inv_gpu.size
+
+         # NOTE: earlier versions relied on the .size attribute of the DeviceAllocation
+         # object, which was problematic. If the error shows up again here, the allocated
+         # size in bytes should be stored explicitly, as was done for SELL.
+         # For now the original CSR getMatrixSize method is kept.
+
+         return total_bytes / (1024**3)
+
+     def free(self):
+         try:
+             if hasattr(self, 'col_ind_gpu') and self.col_ind_gpu:
+                 self.col_ind_gpu.free()
+             if hasattr(self, 'values_gpu') and self.values_gpu:
+                 self.values_gpu.free()
+             if hasattr(self, 'row_ptr_gpu') and self.row_ptr_gpu:
+                 self.row_ptr_gpu.free()
+             if hasattr(self, 'norm_factor_inv_gpu') and self.norm_factor_inv_gpu:
+                 self.norm_factor_inv_gpu.free()
+             if hasattr(self, 'ctx') and self.ctx:
+                 try:
+                     self.ctx.pop()
+                 except Exception:
+                     pass
+                 self.ctx = None
+             print('✅ GPU memory freed.')
+         except Exception as e:
+             print(f"❌ Error while freeing GPU memory: {e}")
+
+     def compute_density(self):
+         """
+         Returns the actual density of the CSR matrix = NNZ / (num_rows * num_cols).
+         Requires self.h_values and self.row_ptr to exist (host).
+         """
+         if self.row_ptr is None or self.h_values is None:
+             raise RuntimeError("row_ptr and h_values are required to compute the density")
+         num_rows = int(self.N * self.T)
+         num_cols = int(self.Z * self.X)
+         total_nnz = int(self.row_ptr[-1])
+         density = total_nnz / (num_rows * num_cols)
+         return density
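
Below is a minimal usage sketch for the new CSR builder, based only on the API visible in this diff. `load_experiment` is a hypothetical placeholder for however a `manip` object is obtained; the class only assumes it exposes `AcousticFields`, with each `field` shaped (T, Z, X), and that the pre-compiled `AOT_biomaps_kernels.cubin` ships in `AOT_Recon/` as the path resolution above expects.

    from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix import SparseSMatrix_CSR

    manip = load_experiment(...)  # hypothetical: any object exposing .AcousticFields

    # The class is a context manager, so free() runs even if allocate() raises,
    # releasing the GPU buffers and popping the CUDA context.
    with SparseSMatrix_CSR(manip, block_rows=64, relative_threshold=0.3, device=0) as S:
        S.allocate()                    # builds row_ptr / col_ind / values on host and device
        print("density:", S.compute_density())
        print("size (GB):", S.getMatrixSize())
        inv_norm = S.norm_factor_inv    # per-column 1/sum, used as a normalization factor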
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py
@@ -0,0 +1,295 @@
+ import pycuda.driver as drv
+ import numpy as np
+ from tqdm import trange
+ import os
+ import gc
+
+ class SparseSMatrix_SELL:
+     def __init__(self, manip, block_rows=64, relative_threshold=0.3, device=0,
+                  module_path="AOT_biomaps_kernels.cubin", slice_height=32):
+         drv.init()
+         self.device = drv.Device(device)
+         self.ctx = self.device.make_context()
+         self.manip = manip
+         self.N = len(manip.AcousticFields)
+         self.T = manip.AcousticFields[0].field.shape[0]
+         self.Z = manip.AcousticFields[0].field.shape[1]
+         self.X = manip.AcousticFields[0].field.shape[2]
+         self.block_rows = block_rows
+         self.relative_threshold = relative_threshold
+
+         # --- PATH RESOLUTION FIX ---
+         # The cubin file is located in the parent directory (AOT_Recon/).
+         # We use os.path.dirname(os.path.dirname(__file__)) to go up one directory level.
+         cubin_parent_dir = os.path.dirname(os.path.dirname(__file__))
+         self.module_path = os.path.join(cubin_parent_dir, module_path)
+         # --- END FIX ---
+
+         self.slice_height = slice_height
+
+         # SELL arrays (device) & size tracking (CRITICAL FIX: attributes initialized here)
+         self.sell_values_gpu = None
+         self.sell_colinds_gpu = None
+         self.slice_ptr = None
+         self.slice_len = None
+         self.slice_ptr_gpu = None
+         self.slice_len_gpu = None
+
+         # Attributes to store allocated size in bytes (bypassing the problematic .size attribute)
+         self.sell_values_gpu_size = 0
+         self.sell_colinds_gpu_size = 0
+         self.slice_ptr_gpu_size = 0
+         self.slice_len_gpu_size = 0
+
+         self.total_storage = 0
+
+         self.norm_factor_inv = None
+         self.norm_factor_inv_gpu = None
+         self.norm_factor_inv_gpu_size = 0
+
+         self.sparse_mod = None
+         self.load_module()
+
+     def load_module(self):
+         """Loads the pre-compiled CUDA module (.cubin file)."""
+
+         # Check if the file exists at the calculated absolute path
+         if not os.path.exists(self.module_path):
+             # The path is now correctly calculated to the parent directory.
+             raise FileNotFoundError(f"CUDA module {os.path.basename(self.module_path)} not found at path: {self.module_path}")
+
+         # Try to load the module
+         try:
+             self.sparse_mod = drv.module_from_file(self.module_path)
+             print(f"Loaded CUDA module {os.path.basename(self.module_path)}")
+         except Exception as e:
+             print(f"❌ Error loading CUDA module {os.path.basename(self.module_path)}: {e}")
+             raise RuntimeError(f"File {os.path.basename(self.module_path)} was found, but PyCUDA could not load it. Check compatibility.") from e
+
+     def free(self):
+         try:
+             # Free device allocations
+             attrs = ["sell_values_gpu", "sell_colinds_gpu", "slice_ptr_gpu", "slice_len_gpu", "norm_factor_inv_gpu"]
+             for a in attrs:
+                 if hasattr(self, a) and getattr(self, a) is not None:
+                     getattr(self, a).free()
+                     setattr(self, a, None)
+
+             # Reset stored sizes
+             self.sell_values_gpu_size = 0
+             self.sell_colinds_gpu_size = 0
+             self.slice_ptr_gpu_size = 0
+             self.slice_len_gpu_size = 0
+             self.norm_factor_inv_gpu_size = 0
+
+             if hasattr(self, 'ctx') and self.ctx:
+                 try: self.ctx.pop()
+                 except Exception: pass
+                 self.ctx = None
+         except Exception as e:
+             print("Error freeing GPU memory:", e)
+
+     def allocate(self):
+         """
+         Build SELL-C-σ directly from manip AcousticFields in streaming blocks.
+         """
+         # Ensures the module is loaded before attempting to retrieve functions
+         if self.sparse_mod is None:
+             raise RuntimeError("CUDA module not loaded. Check compilation.")
+
+         count_kernel = self.sparse_mod.get_function("count_nnz_rows_kernel")
+         fill_kernel = self.sparse_mod.get_function("fill_kernel__SELL")
+
+         num_rows = int(self.N * self.T)
+         num_cols = int(self.Z * self.X)
+         C = int(self.slice_height)
+
+         # host temporary block
+         br = int(self.block_rows)
+         bytes_per_elem = np.dtype(np.float32).itemsize
+         dense_host = np.empty((br, num_cols), dtype=np.float32)
+
+         # Allocation 1: Dense block GPU memory
+         dense_gpu_size = dense_host.nbytes
+         dense_gpu = drv.mem_alloc(dense_gpu_size)
+
+         # 1) count nnz per row (on host via small blocks with GPU kernel)
+         row_nnz = np.zeros(num_rows, dtype=np.int32)
+         row_nnz_gpu_block_size = br * np.dtype(np.int32).itemsize
+         row_nnz_gpu_block = drv.mem_alloc(row_nnz_gpu_block_size)
+
+         block = 256
+         for b in trange(0, num_rows, br, desc="Count NNZ per row"):
+             R = min(br, num_rows - b)
+             # fill dense_host
+             for i in range(R):
+                 rg = b + i
+                 n_idx = rg // self.T
+                 t_idx = rg % self.T
+                 dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+             # copy only R rows
+             drv.memcpy_htod(dense_gpu, dense_host)
+             grid = ((R + block - 1) // block, 1, 1)
+             count_kernel(dense_gpu, row_nnz_gpu_block, np.int32(R), np.int32(num_cols), np.float32(self.relative_threshold),
+                          block=(block, 1, 1), grid=grid)
+             tmp = np.empty(R, dtype=np.int32)
+             drv.memcpy_dtoh(tmp, row_nnz_gpu_block)
+             row_nnz[b:b+R] = tmp
+
+         row_nnz_gpu_block.free()
+         dense_gpu.free()
+
+         # 2) compute per-slice maxlen and slice_ptr
+         num_slices = (num_rows + C - 1) // C
+         slice_len = np.zeros(num_slices, dtype=np.int32)
+         for s in range(num_slices):
+             r0 = s * C
+             r1 = min(num_rows, r0 + C)
+             slice_len[s] = int(np.max(row_nnz[r0:r1])) if (r1 > r0) else 0
+         # slice_ptr (int64)
+         slice_ptr = np.zeros(num_slices + 1, dtype=np.int64)
+         for s in range(num_slices):
+             slice_ptr[s+1] = slice_ptr[s] + (slice_len[s] * C)
+         total_storage = int(slice_ptr[-1])
+         self.total_storage = total_storage
+         print(f"SELL: num_rows={num_rows}, num_slices={num_slices}, total_storage(padded)={total_storage}")
+
+         # allocate device SELL arrays (values float32, colinds uint32)
+         self.sell_values_gpu_size = total_storage * np.dtype(np.float32).itemsize
+         self.sell_colinds_gpu_size = total_storage * np.dtype(np.uint32).itemsize
+
+         self.sell_values_gpu = drv.mem_alloc(self.sell_values_gpu_size)
+         self.sell_colinds_gpu = drv.mem_alloc(self.sell_colinds_gpu_size)
+
+         # allocate slice metadata on device
+         self.slice_ptr = slice_ptr
+         self.slice_len = slice_len
+
+         self.slice_ptr_gpu_size = self.slice_ptr.nbytes
+         self.slice_len_gpu_size = self.slice_len.nbytes
+
+         self.slice_ptr_gpu = drv.mem_alloc(self.slice_ptr_gpu_size)
+         self.slice_len_gpu = drv.mem_alloc(self.slice_len_gpu_size)
+
+         drv.memcpy_htod(self.slice_ptr_gpu, self.slice_ptr)
+         drv.memcpy_htod(self.slice_len_gpu, self.slice_len)
+
+         # 3) fill SELL arrays by streaming blocks again (use GPU fill kernel)
+         # reuse dense_host and allocate new dense_gpu
+         dense_host = np.empty((br, num_cols), dtype=np.float32)
+
+         dense_gpu_2_size = dense_host.nbytes
+         dense_gpu = drv.mem_alloc(dense_gpu_2_size)
+
+         # we also need row_nnz on device per-block; supply global row_nnz on host but the kernel recomputes threshold
+         row_nnz_host_gpu_size = br * np.dtype(np.int32).itemsize
+         row_nnz_host_gpu = drv.mem_alloc(row_nnz_host_gpu_size)
+
+         for b in trange(0, num_rows, br, desc="Fill SELL"):
+             R = min(br, num_rows - b)
+             for i in range(R):
+                 rg = b + i
+                 n_idx = rg // self.T
+                 t_idx = rg % self.T
+                 dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+             drv.memcpy_htod(dense_gpu, dense_host)
+             # We pass a dummy row_nnz pointer (not used in this kernel; left for API)
+             # Kernel expects rows_in_block, rows_global_offset to know where to write.
+             grid = ((R + block - 1) // block, 1, 1)
+             fill_kernel(dense_gpu,
+                         np.intp(0),  # placeholder for row_nnz pointer (not used)
+                         self.slice_ptr_gpu,
+                         self.slice_len_gpu,
+                         self.sell_colinds_gpu,
+                         self.sell_values_gpu,
+                         np.int32(R),
+                         np.int32(num_cols),
+                         np.int32(b),  # rows_global_offset
+                         np.int32(C),
+                         np.float32(self.relative_threshold),
+                         block=(block, 1, 1), grid=grid)
+         dense_gpu.free()
+         row_nnz_host_gpu.free()
+
+         # At this point sell_values_gpu and sell_colinds_gpu are filled.
+
+         # 4) compute norm_factor_inv via GPU accumulate (col sums)
+         self.compute_norm_factor()
+
+     def compute_norm_factor(self):
+         """
+         Accumulate column sums on GPU using accumulate_columns_atomic, then compute inverse.
+         """
+         if self.total_storage == 0:
+             raise RuntimeError("SELL matrix not built")
+         ZX = int(self.Z * self.X)
+
+         # allocate col sum on device
+         col_sum_gpu_size = ZX * np.dtype(np.float32).itemsize
+         col_sum_gpu = drv.mem_alloc(col_sum_gpu_size)
+         drv.memset_d32(col_sum_gpu, 0, ZX)
+
+         # FIX: kernel name is "accumulate_columns_atomic"
+         acc_kernel = self.sparse_mod.get_function("accumulate_columns_atomic")
+
+         threads = 256
+         blocks = (self.total_storage + threads - 1) // threads
+         acc_kernel(self.sell_values_gpu, self.sell_colinds_gpu, np.int64(self.total_storage), col_sum_gpu,
+                    block=(threads, 1, 1), grid=(blocks, 1, 1))
+         drv.Context.synchronize()
+
+         # copy back
+         norm_host = np.empty(ZX, dtype=np.float32)
+         drv.memcpy_dtoh(norm_host, col_sum_gpu)
+         col_sum_gpu.free()
+
+         norm = np.maximum(norm_host.astype(np.float64), 1e-6)
+         self.norm_factor_inv = (1.0 / norm).astype(np.float32)
+         if self.norm_factor_inv_gpu is not None:
+             self.norm_factor_inv_gpu.free()
+
+         self.norm_factor_inv_gpu_size = self.norm_factor_inv.nbytes
+         self.norm_factor_inv_gpu = drv.mem_alloc(self.norm_factor_inv_gpu_size)
+         drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)
+
+     def compute_density(self):
+         """
+         Returns only the density of the SELL-C-σ matrix.
+         """
+         if not hasattr(self, 'slice_ptr') or self.slice_ptr is None:
+             raise RuntimeError("The SELL-C-σ matrix is not allocated.")
+
+         num_rows = self.N * self.T
+         num_cols = self.Z * self.X
+         total_elements = num_rows * num_cols
+
+         # Conservative estimate of non-zeros (excluding padding)
+         nnz_ell_estimated = int(0.9 * self.total_storage)
+
+         return nnz_ell_estimated / total_elements  # Returns only the density
+
+     def getMatrixSize(self):
+         """
+         Returns the total size of the SELL-C-σ matrix in Gigabytes (GB).
+         """
+         if self.sell_values_gpu is None:
+             return {"error": "The SELL-C-σ matrix is not yet allocated."}
+
+         total_bytes = 0
+
+         # Host-side arrays (using .nbytes, which works for NumPy arrays)
+         if hasattr(self, 'slice_ptr') and self.slice_ptr is not None:
+             total_bytes += self.slice_ptr.nbytes
+         if hasattr(self, 'slice_len') and self.slice_len is not None:
+             total_bytes += self.slice_len.nbytes
+         if hasattr(self, 'norm_factor_inv') and self.norm_factor_inv is not None:
+             total_bytes += self.norm_factor_inv.nbytes
+
+         # GPU-side arrays (using the stored size attributes instead of the problematic .size)
+         total_bytes += self.sell_values_gpu_size
+         total_bytes += self.sell_colinds_gpu_size
+         total_bytes += self.slice_ptr_gpu_size
+         total_bytes += self.slice_len_gpu_size
+         total_bytes += self.norm_factor_inv_gpu_size
+
+         return total_bytes / (1024 ** 3)  # Returns only the size in GB
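
The SELL-C-σ builder is driven the same way, but it is not a context manager, so `free()` must be called explicitly; storage is padded per slice of `slice_height` rows to that slice's maximum row length, so `total_storage` is the sum of `slice_len[s] * C` over slices rather than the exact NNZ. A minimal sketch, reusing the hypothetical `manip` from above:

    from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix import SparseSMatrix_SELL

    S = SparseSMatrix_SELL(manip, block_rows=64, relative_threshold=0.3,
                           device=0, slice_height=32)
    try:
        S.allocate()  # counts NNZ per row, sizes the slices, then fills values/col-indices
        # Padding example: with C = 4 and per-row NNZ [3, 1, 0, 2] in one slice,
        # slice_len = 3, so that slice occupies 3 * 4 = 12 slots (6 of them padding).
        print("padded storage:", S.total_storage)
        print("estimated density:", S.compute_density())
        print("size (GB):", S.getMatrixSize())
    finally:
        S.free()  # releases device arrays and pops the CUDA context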
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py
@@ -0,0 +1,2 @@
+ from .SparseSMatrix_CSR import *
+ from .SparseSMatrix_SELL import *
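
Because neither module defines `__all__`, these star imports also re-export module-level names such as `np`, `drv`, and `os` alongside the two classes; importing the classes explicitly keeps the caller's namespace clean:

    from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix import SparseSMatrix_CSR, SparseSMatrix_SELL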