AOT-biomaps 2.9.177-py3-none-any.whl → 2.9.261-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of AOT-biomaps might be problematic.
Files changed (28)
  1. AOT_biomaps/AOT_Acoustic/StructuredWave.py +2 -2
  2. AOT_biomaps/AOT_Acoustic/_mainAcoustic.py +11 -6
  3. AOT_biomaps/AOT_Experiment/Tomography.py +74 -4
  4. AOT_biomaps/AOT_Experiment/_mainExperiment.py +95 -55
  5. AOT_biomaps/AOT_Recon/AOT_Optimizers/DEPIERRO.py +48 -13
  6. AOT_biomaps/AOT_Recon/AOT_Optimizers/LS.py +9 -6
  7. AOT_biomaps/AOT_Recon/AOT_Optimizers/MAPEM.py +118 -38
  8. AOT_biomaps/AOT_Recon/AOT_Optimizers/MLEM.py +268 -102
  9. AOT_biomaps/AOT_Recon/AOT_Optimizers/PDHG.py +1 -1
  10. AOT_biomaps/AOT_Recon/AOT_PotentialFunctions/RelativeDifferences.py +10 -14
  11. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py +252 -0
  12. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py +322 -0
  13. AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py +2 -0
  14. AOT_biomaps/AOT_Recon/AlgebraicRecon.py +248 -141
  15. AOT_biomaps/AOT_Recon/AnalyticRecon.py +27 -42
  16. AOT_biomaps/AOT_Recon/BayesianRecon.py +84 -151
  17. AOT_biomaps/AOT_Recon/DeepLearningRecon.py +1 -1
  18. AOT_biomaps/AOT_Recon/PrimalDualRecon.py +69 -62
  19. AOT_biomaps/AOT_Recon/ReconEnums.py +27 -2
  20. AOT_biomaps/AOT_Recon/ReconTools.py +84 -13
  21. AOT_biomaps/AOT_Recon/__init__.py +1 -0
  22. AOT_biomaps/AOT_Recon/_mainRecon.py +72 -58
  23. AOT_biomaps/__init__.py +4 -93
  24. {aot_biomaps-2.9.177.dist-info → aot_biomaps-2.9.261.dist-info}/METADATA +2 -1
  25. aot_biomaps-2.9.261.dist-info/RECORD +46 -0
  26. aot_biomaps-2.9.177.dist-info/RECORD +0 -43
  27. {aot_biomaps-2.9.177.dist-info → aot_biomaps-2.9.261.dist-info}/WHEEL +0 -0
  28. {aot_biomaps-2.9.177.dist-info → aot_biomaps-2.9.261.dist-info}/top_level.txt +0 -0
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_CSR.py
@@ -0,0 +1,252 @@
+ # SparseSMatrix_CSR.py
+ import pycuda.driver as drv
+ import numpy as np
+ from pycuda.compiler import SourceModule
+ from tqdm import trange
+ import os
+
+ class SparseSMatrix_CSR:
+     """Builds a CSR matrix from a `manip` object.
+     Usage:
+         S = SparseSMatrix_CSR(manip)
+         S.allocate()
+     After allocate(), the following are available: row_ptr (host np.int64 array),
+     row_ptr_gpu (device ptr), h_col_ind, h_values, col_ind_gpu, values_gpu, norm_factor_inv.
+     """
+
+     def __init__(self, manip, block_rows=64, relative_threshold=0.3, device=0):
+         drv.init()
+         self.device = drv.Device(device)
+         self.ctx = self.device.make_context()
+         self.manip = manip
+         self.N = len(manip.AcousticFields)
+         self.T = manip.AcousticFields[0].field.shape[0]
+         self.Z = manip.AcousticFields[0].field.shape[1]
+         self.X = manip.AcousticFields[0].field.shape[2]
+         self.block_rows = block_rows
+         self.relative_threshold = relative_threshold
+         self.h_dense = None
+         self.row_ptr = None
+         self.row_ptr_gpu = None
+         self.h_col_ind = None
+         self.h_values = None
+         self.total_nnz = 0
+         self.norm_factor_inv = None
+         self.sparse_mod = None
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc, tb):
+         self.free()
+
+     def load_precompiled_module(self, so_path="AOT_biomaps_kernels.cubin"):
+         try:
+             # If a PTX or cubin is provided via path
+             self.sparse_mod = drv.module_from_file(so_path)
+             print(f"✅ CUDA module loaded from {so_path}")
+         except Exception:
+             # Fallback: try to compile from bundled source (if available)
+             src_path = os.path.join(os.path.dirname(__file__), 'AOT_biomaps_kernels.cu')
+             if os.path.exists(src_path):
+                 print("JIT-compiling the CUDA kernel from source...")
+                 with open(src_path, 'r') as f:
+                     src = f.read()
+                 self.sparse_mod = SourceModule(src, no_extern_c=True)
+                 print("✅ Module JIT-compiled")
+             else:
+                 raise
+
+     def estimate_nnz_cpu(self):
+         """Fast (non-exact) estimate — useful if a rough figure is enough.
+         Recommended: use the exact GPU pass (count_nnz_per_row_kernel) instead.
+         """
+         total = 0
+         for n in range(self.N):
+             field = self.manip.AcousticFields[n].field
+             for t in range(self.T):
+                 row = field[t].flatten()
+                 row_max = np.max(np.abs(row))
+                 thr = row_max * self.relative_threshold
+                 total += np.count_nonzero(np.abs(row) > thr)
+         return int(total)
+
+     def allocate(self, kernel_module_path=None):
+         try:
+             # --- 1. Block-by-block construction (avoids keeping the full dense matrix when possible) ---
+             num_rows = self.N * self.T
+             num_cols = self.Z * self.X
+             bytes_float = np.dtype(np.float32).itemsize
+
+             # Load module
+             if kernel_module_path:
+                 self.load_precompiled_module(kernel_module_path)
+             else:
+                 self.load_precompiled_module('AOT_biomaps_kernels.cubin')
+
+             count_nnz_kernel = self.sparse_mod.get_function('count_nnz_per_row_kernel')
+             fill_csr_kernel = self.sparse_mod.get_function('fill_kernel__CSR')
+
+             # allocate host row_ptr
+             self.row_ptr = np.zeros(num_rows + 1, dtype=np.int64)
+
+             # GPU temp buffers
+             dense_block_host = np.empty((self.block_rows, num_cols), dtype=np.float32)
+             dense_block_gpu = drv.mem_alloc(self.block_rows * num_cols * bytes_float)
+             row_nnz_gpu = drv.mem_alloc(self.block_rows * np.dtype(np.int32).itemsize)
+
+             block_size = 128
+
+             # --- Count NNZ per row using the GPU kernel, to stay consistent with the filling logic ---
+             for b in trange(0, num_rows, self.block_rows, desc='Counting NNZ'):
+                 current_rows = min(self.block_rows, num_rows - b)
+                 # Fill dense_block_host from manip
+                 for r in range(current_rows):
+                     global_row = b + r
+                     n_idx = global_row // self.T
+                     t_idx = global_row % self.T
+                     dense_block_host[r, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+                 drv.memcpy_htod(dense_block_gpu, dense_block_host)
+
+                 grid = ((current_rows + block_size - 1) // block_size, 1, 1)
+                 count_nnz_kernel(dense_block_gpu, row_nnz_gpu,
+                                  np.int32(current_rows), np.int32(num_cols),
+                                  np.float32(self.relative_threshold),
+                                  block=(block_size, 1, 1), grid=grid)
+
+                 row_nnz_host = np.empty(current_rows, dtype=np.int32)
+                 drv.memcpy_dtoh(row_nnz_host, row_nnz_gpu)
+                 self.row_ptr[b + 1:b + current_rows + 1] = self.row_ptr[b] + np.cumsum(row_nnz_host, dtype=np.int64)
+
+             # total nnz
+             self.total_nnz = int(self.row_ptr[-1])
+             print(f"Total NNZ: {self.total_nnz}")
+
+             # allocate final arrays
+             self.h_col_ind = np.zeros(self.total_nnz, dtype=np.uint32)
+             self.h_values = np.zeros(self.total_nnz, dtype=np.float32)
+
+             # copy row_ptr to device once
+             self.row_ptr_gpu = drv.mem_alloc(self.row_ptr.nbytes)
+             drv.memcpy_htod(self.row_ptr_gpu, self.row_ptr)
+
+             # allocate device arrays for the final CSR
+             self.col_ind_gpu = drv.mem_alloc(self.h_col_ind.nbytes)
+             self.values_gpu = drv.mem_alloc(self.h_values.nbytes)
+
+             # --- Fill CSR per-block ---
+             for b in trange(0, num_rows, self.block_rows, desc='Filling CSR'):
+                 current_rows = min(self.block_rows, num_rows - b)
+                 for r in range(current_rows):
+                     global_row = b + r
+                     n_idx = global_row // self.T
+                     t_idx = global_row % self.T
+                     dense_block_host[r, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+                 drv.memcpy_htod(dense_block_gpu, dense_block_host)
+
+                 grid = ((current_rows + block_size - 1) // block_size, 1, 1)
+                 fill_csr_kernel(dense_block_gpu,
+                                 self.row_ptr_gpu,
+                                 self.col_ind_gpu,
+                                 self.values_gpu,
+                                 np.int32(b),
+                                 np.int32(current_rows),
+                                 np.int32(num_cols),
+                                 np.float32(self.relative_threshold),
+                                 np.int64(self.total_nnz),
+                                 block=(block_size, 1, 1), grid=grid)
+                 drv.Context.synchronize()
+
+             # copy back
+             drv.memcpy_dtoh(self.h_col_ind, self.col_ind_gpu)
+             drv.memcpy_dtoh(self.h_values, self.values_gpu)
+             print('CSR generated ✔')
+
+             # compute normalization factor from CSR (sum per column)
+             self.compute_norm_factor_from_csr()
+
+             # free temporaries
+             dense_block_gpu.free(); row_nnz_gpu.free()
+
+         except Exception as e:
+             print(f"❌ Detailed error: {e}")
+             self.free()
+             raise
+
+     def compute_norm_factor_from_csr(self):
+         ZX = self.Z * self.X
+
+         # 1) Allocate a column-sum vector on the GPU
+         col_sum_gpu = drv.mem_alloc(ZX * np.dtype(np.float32).itemsize)
+         drv.memset_d32(col_sum_gpu, 0, ZX)
+
+         # 2) Retrieve the kernel
+         acc_kernel = self.sparse_mod.get_function("accumulate_columns_atomic__CSR")
+
+         # 3) Launch the kernel
+         threads = 256
+         blocks = (self.total_nnz + threads - 1) // threads
+
+         acc_kernel(
+             self.values_gpu,
+             self.col_ind_gpu,
+             np.int64(self.total_nnz),
+             col_sum_gpu,
+             block=(threads, 1, 1),
+             grid=(blocks, 1, 1)
+         )
+         drv.Context.synchronize()
+
+         # 4) Fetch the result
+         norm = np.empty(ZX, dtype=np.float32)
+         drv.memcpy_dtoh(norm, col_sum_gpu)
+         col_sum_gpu.free()
+
+         norm = np.maximum(norm.astype(np.float64), 1e-6)
+         self.norm_factor_inv = (1.0 / norm).astype(np.float32)
+
+         self.norm_factor_inv_gpu = drv.mem_alloc(self.norm_factor_inv.nbytes)
+         drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)
+
+     def getMatrixSize(self):
+         if self.row_ptr is None:
+             return {"error": "The sparse matrix is not allocated yet."}
+         total = (self.row_ptr.nbytes if self.row_ptr is not None else 0) + \
+                 (self.h_col_ind.nbytes if self.h_col_ind is not None else 0) + \
+                 (self.h_values.nbytes if self.h_values is not None else 0)
+         return total / (1024**3)
+
+     def free(self):
+         try:
+             if hasattr(self, 'col_ind_gpu') and self.col_ind_gpu:
+                 self.col_ind_gpu.free()
+             if hasattr(self, 'values_gpu') and self.values_gpu:
+                 self.values_gpu.free()
+             if hasattr(self, 'row_ptr_gpu') and self.row_ptr_gpu:
+                 self.row_ptr_gpu.free()
+             if hasattr(self, 'norm_factor_inv_gpu') and self.norm_factor_inv_gpu:
+                 self.norm_factor_inv_gpu.free()
+             if hasattr(self, 'ctx') and self.ctx:
+                 try:
+                     self.ctx.pop()
+                 except Exception:
+                     pass
+                 self.ctx = None
+             print('✅ GPU memory freed.')
+         except Exception as e:
+             print(f"❌ Error while freeing GPU memory: {e}")
+
+     def compute_density(self):
+         """
+         Returns the actual density of the CSR matrix = NNZ / (num_rows * num_cols).
+         Requires self.h_values and self.row_ptr to exist (host).
+         """
+         if self.row_ptr is None or self.h_values is None:
+             raise RuntimeError("row_ptr and h_values are required to compute the density")
+         num_rows = int(self.N * self.T)
+         num_cols = int(self.Z * self.X)
+         total_nnz = int(self.row_ptr[-1])
+         density = total_nnz / (num_rows * num_cols)
+         return density
+
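For orientation, a minimal usage sketch of the new CSR builder follows. It assumes a `manip` experiment object exposing an `AcousticFields` list whose entries carry a (T, Z, X) `field` array, which is exactly what the constructor above reads; the `load_experiment` helper is hypothetical and stands in for however `manip` is obtained.

    from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix import SparseSMatrix_CSR

    manip = load_experiment("scan_001")  # hypothetical loader for the `manip` object

    # The context-manager form guarantees free() runs, releasing GPU buffers
    # and popping the CUDA context created in __init__.
    with SparseSMatrix_CSR(manip, block_rows=64, relative_threshold=0.3) as S:
        S.allocate()  # two streamed GPU passes: count NNZ per row, then fill the CSR arrays
        print("density:", S.compute_density())
        print("host size (GB):", S.getMatrixSize())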
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/SparseSMatrix_SELL.py
@@ -0,0 +1,322 @@
+ import pycuda.driver as drv
+ import numpy as np
+ from tqdm import trange
+ import os
+ import subprocess
+
+ # ==============================================================================
+ # ATTENTION: MANUAL CONFIGURATION REQUIRED
+ # ==============================================================================
+ # The 'sm_XX' value must correspond to the Compute Capability of your NVIDIA GPU.
+ # Examples: Kepler (sm_35), Maxwell (sm_50), Pascal (sm_61), Turing (sm_75), Ampere (sm_86).
+ # THIS LINE MUST BE MODIFIED BY THE USER to target their specific architecture.
+ GPU_COMPUTE_CAPABILITY = "sm_86"
+ # ==============================================================================
+
+ class SparseSMatrix_SELL:
+     def __init__(self, manip, block_rows=64, relative_threshold=0.3, device=0,
+                  module_path="AOT_biomaps_kernels.cubin", slice_height=32):
+         drv.init()
+         self.device = drv.Device(device)
+         self.ctx = self.device.make_context()
+         self.manip = manip
+         self.N = len(manip.AcousticFields)
+         self.T = manip.AcousticFields[0].field.shape[0]
+         self.Z = manip.AcousticFields[0].field.shape[1]
+         self.X = manip.AcousticFields[0].field.shape[2]
+         self.block_rows = block_rows
+         self.relative_threshold = relative_threshold
+         # The module_path is relative to this file's directory
+         self.module_path = os.path.join(os.path.dirname(__file__), module_path)
+         self.slice_height = slice_height
+
+         # SELL arrays (device)
+         self.sell_values_gpu = None
+         self.sell_colinds_gpu = None
+         self.slice_ptr = None
+         self.slice_len = None
+         self.slice_ptr_gpu = None
+         self.slice_len_gpu = None
+         self.total_storage = 0
+
+         self.norm_factor_inv = None
+         self.norm_factor_inv_gpu = None
+
+         self.sparse_mod = None
+         self.load_module()
+
+     def _compile_cubin(self, source_file="AOT_biomaps_kernels.cu"):
+         """
+         Tries to compile the .cu file into a .cubin using nvcc.
+         """
+         print("=" * 60)
+         print("🛠️ CUDA COMPILATION REQUIRED")
+         print(f"Attempting to compile {source_file} to {os.path.basename(self.module_path)}...")
+
+         # The source file is assumed to be in the same directory as this Python file.
+         source_path = os.path.join(os.path.dirname(__file__), source_file)
+         cubin_path = self.module_path
+
+         if not os.path.exists(source_path):
+             print(f"❌ CRITICAL ERROR: CUDA source file {source_file} not found at {source_path}.")
+             raise FileNotFoundError(f"Could not find source file {source_file} for compilation. AOT_biomaps installation might be incomplete.")
+
+         # Build the nvcc command
+         command = [
+             'nvcc',
+             '-cubin',
+             f'-arch={GPU_COMPUTE_CAPABILITY}',  # USES THE VARIABLE DEFINED ABOVE
+             source_path,
+             '-o',
+             cubin_path
+         ]
+
+         print(f"Executing command: {' '.join(command)}")
+
+         try:
+             # Run the command and wait for completion
+             result = subprocess.run(
+                 command,
+                 check=True,
+                 capture_output=True,
+                 text=True
+             )
+             print(f"🎉 Compilation successful! File created: {os.path.basename(cubin_path)}")
+             # print("nvcc output:\n", result.stdout)  # Uncomment to see detailed output
+             print("=" * 60)
+             return True
+
+         except subprocess.CalledProcessError as e:
+             print("❌ NVCC COMPILATION ERROR:")
+             print(f"Check GPU architecture: {GPU_COMPUTE_CAPABILITY}")
+             print(f"Standard error:\n{e.stderr}")
+             print("=" * 60)
+             return False
+
+         except FileNotFoundError:
+             print("❌ ERROR: The 'nvcc' command was not found.")
+             print("Ensure that the CUDA Toolkit is installed and 'nvcc' is in your PATH (or your Conda environment).")
+             print("=" * 60)
+             return False
+
+     def load_module(self):
+         """Tries to load the CUDA module. If the file is missing, attempts to compile it."""
+
+         if not os.path.exists(self.module_path):
+             print(f"CUDA module {os.path.basename(self.module_path)} missing. Attempting compilation...")
+
+             if not self._compile_cubin():
+                 # If compilation fails, raise an explicit error.
+                 raise FileNotFoundError(f"{os.path.basename(self.module_path)} not found and compilation failed. Check nvcc and GPU architecture ({GPU_COMPUTE_CAPABILITY}).")
+
+         # Try to load after compilation (or if the file already existed)
+         try:
+             self.sparse_mod = drv.module_from_file(self.module_path)
+             print(f"Loaded CUDA module {os.path.basename(self.module_path)}")
+         except Exception as e:
+             print(f"❌ Error loading CUDA module {os.path.basename(self.module_path)}: {e}")
+             raise RuntimeError(f"File {os.path.basename(self.module_path)} was found, but PyCUDA could not load it.") from e
+
+     def free(self):
+         try:
+             attrs = ["sell_values_gpu", "sell_colinds_gpu", "slice_ptr_gpu", "slice_len_gpu", "norm_factor_inv_gpu"]
+             for a in attrs:
+                 if hasattr(self, a) and getattr(self, a) is not None:
+                     getattr(self, a).free()
+                     setattr(self, a, None)
+             if hasattr(self, 'ctx') and self.ctx:
+                 try:
+                     self.ctx.pop()
+                 except Exception:
+                     pass
+                 self.ctx = None
+         except Exception as e:
+             print("Error freeing GPU memory:", e)
+
+     def allocate(self):
+         """
+         Build SELL-C-σ directly from manip AcousticFields in streaming blocks.
+         """
+         # Ensure the module is loaded before retrieving functions
+         if self.sparse_mod is None:
+             raise RuntimeError("CUDA module not loaded. Check compilation.")
+
+         count_kernel = self.sparse_mod.get_function("count_nnz_rows_kernel")
+         fill_kernel = self.sparse_mod.get_function("fill_kernel__SELL")
+
+         num_rows = int(self.N * self.T)
+         num_cols = int(self.Z * self.X)
+         C = int(self.slice_height)
+
+         # host temporary block
+         br = int(self.block_rows)
+         dense_host = np.empty((br, num_cols), dtype=np.float32)
+         dense_gpu = drv.mem_alloc(dense_host.nbytes)
+
+         # 1) count nnz per row (streamed in small blocks through the GPU kernel)
+         row_nnz = np.zeros(num_rows, dtype=np.int32)
+         row_nnz_gpu_block = drv.mem_alloc(br * np.dtype(np.int32).itemsize)
+
+         block = 256
+         for b in trange(0, num_rows, br, desc="Count NNZ per row"):
+             R = min(br, num_rows - b)
+             # fill dense_host
+             for i in range(R):
+                 rg = b + i
+                 n_idx = rg // self.T
+                 t_idx = rg % self.T
+                 dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+             # copy the block (only the first R rows are read by the kernel)
+             drv.memcpy_htod(dense_gpu, dense_host)
+             grid = ((R + block - 1) // block, 1, 1)
+             count_kernel(dense_gpu, row_nnz_gpu_block, np.int32(R), np.int32(num_cols), np.float32(self.relative_threshold),
+                          block=(block, 1, 1), grid=grid)
+             tmp = np.empty(R, dtype=np.int32)
+             drv.memcpy_dtoh(tmp, row_nnz_gpu_block)
+             row_nnz[b:b+R] = tmp
+
+         row_nnz_gpu_block.free()
+         dense_gpu.free()
+
+         # 2) compute per-slice max length and slice_ptr
+         num_slices = (num_rows + C - 1) // C
+         slice_len = np.zeros(num_slices, dtype=np.int32)
+         for s in range(num_slices):
+             r0 = s * C
+             r1 = min(num_rows, r0 + C)
+             slice_len[s] = int(np.max(row_nnz[r0:r1])) if (r1 > r0) else 0
+         # slice_ptr (int64)
+         slice_ptr = np.zeros(num_slices + 1, dtype=np.int64)
+         for s in range(num_slices):
+             slice_ptr[s+1] = slice_ptr[s] + (slice_len[s] * C)
+         total_storage = int(slice_ptr[-1])
+         self.total_storage = total_storage
+         print(f"SELL: num_rows={num_rows}, num_slices={num_slices}, total_storage(padded)={total_storage}")
+
+         # allocate device SELL arrays (values float32, colinds uint32)
+         self.sell_values_gpu = drv.mem_alloc(total_storage * np.dtype(np.float32).itemsize)
+         self.sell_colinds_gpu = drv.mem_alloc(total_storage * np.dtype(np.uint32).itemsize)
+         # allocate slice metadata on device
+         self.slice_ptr = slice_ptr
+         self.slice_len = slice_len
+         self.slice_ptr_gpu = drv.mem_alloc(self.slice_ptr.nbytes)
+         self.slice_len_gpu = drv.mem_alloc(self.slice_len.nbytes)
+         drv.memcpy_htod(self.slice_ptr_gpu, self.slice_ptr)
+         drv.memcpy_htod(self.slice_len_gpu, self.slice_len)
+
+         # 3) fill SELL arrays by streaming blocks again (use the GPU fill kernel)
+         # reuse dense_host and dense_gpu
+         dense_host = np.empty((br, num_cols), dtype=np.float32)
+         dense_gpu = drv.mem_alloc(dense_host.nbytes)
+
+         for b in trange(0, num_rows, br, desc="Fill SELL"):
+             R = min(br, num_rows - b)
+             for i in range(R):
+                 rg = b + i
+                 n_idx = rg // self.T
+                 t_idx = rg % self.T
+                 dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
+             drv.memcpy_htod(dense_gpu, dense_host)
+             # A dummy row_nnz pointer is passed (not used by this kernel; kept for the API).
+             # The kernel expects rows_in_block and rows_global_offset to know where to write.
+             grid = ((R + block - 1) // block, 1, 1)
+             fill_kernel(dense_gpu,
+                         np.intp(0),  # placeholder for the row_nnz pointer (unused)
+                         self.slice_ptr_gpu,
+                         self.slice_len_gpu,
+                         self.sell_colinds_gpu,
+                         self.sell_values_gpu,
+                         np.int32(R),
+                         np.int32(num_cols),
+                         np.int32(b),  # rows_global_offset
+                         np.int32(C),
+                         np.float32(self.relative_threshold),
+                         block=(block, 1, 1), grid=grid)
+         drv.Context.synchronize()
+         dense_gpu.free()
+
+         # At this point sell_values_gpu and sell_colinds_gpu are filled.
+
+         # 4) compute norm_factor_inv via GPU accumulation (column sums)
+         self.compute_norm_factor()
+
+     def compute_norm_factor(self):
+         """
+         Accumulate column sums on the GPU using accumulate_columns_atomic, then compute the inverse.
+         """
+         if self.total_storage == 0:
+             raise RuntimeError("SELL matrix not built")
+         ZX = int(self.Z * self.X)
+
+         # allocate column-sum vector on the device
+         col_sum_gpu = drv.mem_alloc(ZX * np.dtype(np.float32).itemsize)
+         drv.memset_d32(col_sum_gpu, 0, ZX)
+
+         acc_kernel = self.sparse_mod.get_function("accumulate_columns_atomic__SELL")
+         threads = 256
+         blocks = (self.total_storage + threads - 1) // threads
+         acc_kernel(self.sell_values_gpu, self.sell_colinds_gpu, np.int64(self.total_storage), col_sum_gpu,
+                    block=(threads, 1, 1), grid=(blocks, 1, 1))
+         drv.Context.synchronize()
+
+         # copy back
+         norm_host = np.empty(ZX, dtype=np.float32)
+         drv.memcpy_dtoh(norm_host, col_sum_gpu)
+         col_sum_gpu.free()
+
+         norm = np.maximum(norm_host.astype(np.float64), 1e-6)
+         self.norm_factor_inv = (1.0 / norm).astype(np.float32)
+         if self.norm_factor_inv_gpu is not None:
+             self.norm_factor_inv_gpu.free()
+         self.norm_factor_inv_gpu = drv.mem_alloc(self.norm_factor_inv.nbytes)
+         drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)
+
+     def compute_density(self):
+         """
+         Returns the estimated density of the SELL-C-σ matrix.
+         """
+         if not hasattr(self, 'slice_ptr') or self.slice_ptr is None:
+             raise RuntimeError("The SELL-C-σ matrix is not allocated.")
+
+         num_rows = self.N * self.T
+         num_cols = self.Z * self.X
+         total_elements = num_rows * num_cols
+
+         # Conservative estimate of non-zeros (excluding padding)
+         nnz_ell_estimated = int(0.9 * self.total_storage)
+
+         return nnz_ell_estimated / total_elements
+
+     def getMatrixSize(self):
+         """
+         Returns the total size of the SELL-C-σ matrix in gigabytes (GB).
+         """
+         if self.sell_values_gpu is None:
+             return {"error": "The SELL-C-σ matrix is not yet allocated."}
+
+         total_bytes = 0
+
+         # Host-side arrays
+         if self.slice_ptr is not None:
+             total_bytes += self.slice_ptr.nbytes
+         if self.slice_len is not None:
+             total_bytes += self.slice_len.nbytes
+         if self.norm_factor_inv is not None:
+             total_bytes += self.norm_factor_inv.nbytes
+
+         # GPU-side arrays: DeviceAllocation does not expose its byte size,
+         # so sizes are recomputed from the shapes used at allocation time.
+         if self.sell_values_gpu is not None:
+             total_bytes += self.total_storage * np.dtype(np.float32).itemsize
+         if self.sell_colinds_gpu is not None:
+             total_bytes += self.total_storage * np.dtype(np.uint32).itemsize
+         if self.slice_ptr_gpu is not None and self.slice_ptr is not None:
+             total_bytes += self.slice_ptr.nbytes
+         if self.slice_len_gpu is not None and self.slice_len is not None:
+             total_bytes += self.slice_len.nbytes
+         if self.norm_factor_inv_gpu is not None and self.norm_factor_inv is not None:
+             total_bytes += self.norm_factor_inv.nbytes
+
+         return total_bytes / (1024 ** 3)
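The SELL-C-σ layout above pads every slice of `slice_height` rows to that slice's maximum row length, which is why `allocate()` reports a padded `total_storage` and why `compute_density()` discounts padding. Below is a toy, CPU-only recomputation of the slice arithmetic from step 2 of `allocate()`, with hypothetical per-row NNZ counts:

    import numpy as np

    row_nnz = np.array([3, 1, 4, 1, 5, 9, 2, 6], dtype=np.int32)  # hypothetical per-row counts
    C = 4  # slice_height
    num_rows = len(row_nnz)
    num_slices = (num_rows + C - 1) // C  # -> 2 slices of C rows

    # Each slice is padded to its widest row, mirroring step 2 of allocate().
    slice_len = np.array([row_nnz[s*C:(s+1)*C].max() for s in range(num_slices)], dtype=np.int32)
    slice_ptr = np.concatenate(([0], np.cumsum(slice_len.astype(np.int64) * C)))

    print(slice_len)           # [4 9]       -> per-slice max NNZ
    print(slice_ptr)           # [ 0 16 52]  -> offsets into the padded storage
    print(int(slice_ptr[-1]))  # 52 padded slots for 31 true non-zeros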
AOT_biomaps/AOT_Recon/AOT_SparseSMatrix/__init__.py
@@ -0,0 +1,2 @@
+ from .SparseSMatrix_CSR import *
+ from .SparseSMatrix_SELL import *
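With these wildcard imports, both builders are re-exported from the subpackage, so a reconstruction script can pick the storage format at the call site. A sketch under the same `manip` assumption as above:

    from AOT_biomaps.AOT_Recon.AOT_SparseSMatrix import SparseSMatrix_CSR, SparseSMatrix_SELL

    def build_system_matrix(manip, fmt="csr"):
        # Both classes share the (manip, block_rows, relative_threshold, device)
        # constructor arguments and an allocate()/free() lifecycle.
        cls = SparseSMatrix_CSR if fmt == "csr" else SparseSMatrix_SELL
        S = cls(manip)
        S.allocate()
        print(f"{fmt.upper()} density: {S.compute_density():.4f}, size: {S.getMatrixSize():.2f} GB")
        return S  # caller is responsible for S.free() when done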