AOT-biomaps 2.9.279__py3-none-any.whl → 2.9.300__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of AOT-biomaps might be problematic.

@@ -23,8 +23,7 @@ def LS(
     denominator_threshold=1e-6,
     max_saves=5000,
     show_logs=True,
-    smatrixType=SMatrixType.SELL,
-    Z=350,
+    smatrixType=SMatrixType.SELL
 ):
     """
     Least Squares reconstruction using Projected Gradient Descent (PGD) with non-negativity constraint.
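For context on the docstring above: least squares by projected gradient descent alternates a gradient step on 0.5 * ||A @ theta - y||^2 with clipping at zero. Below is a minimal NumPy sketch of that update on synthetic data; nothing in it comes from the package, and the GPU paths in this diff actually run conjugate-gradient kernels instead.

    import numpy as np

    rng = np.random.default_rng(0)
    A = rng.random((64, 32)).astype(np.float32)       # stand-in for the system matrix
    y = rng.random(64).astype(np.float32)             # stand-in for the measured data
    theta = np.zeros(32, dtype=np.float32)

    step = 1.0 / np.linalg.norm(A, 2) ** 2            # step <= 1 / ||A||^2 keeps PGD stable
    for _ in range(500):
        grad = A.T @ (A @ theta - y)                  # gradient of 0.5 * ||A @ theta - y||^2
        theta = np.maximum(theta - step * grad, 0.0)  # project onto the non-negative orthant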
@@ -44,7 +43,7 @@ def LS(
     # Dispatch to the appropriate implementation
     if use_gpu:
         if smatrixType == SMatrixType.CSR:
-            return _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, Z, show_logs)
+            return _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.SELL:
             return _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.DENSE:
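The caller-visible effect of this hunk and the previous one: `Z` is removed from the `LS` signature and is no longer forwarded to the CSR path; the solvers read the grid size from `SMatrix.Z` instead. A hypothetical call, with the leading parameter names inferred from the dispatch lines above and values purely illustrative:

    theta, saved = LS(
        SMatrix, y,                    # SMatrix now carries Z and X itself
        numIterations=200,
        isSavingEachIteration=False,
        smatrixType=SMatrixType.SELL,  # Z=350 is no longer accepted here
    )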
@@ -181,13 +180,12 @@ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tu
         print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")
 
         stream = drv.Stream()
-        mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
 
         # Retrieve the kernels
-        projection_kernel = mod.get_function('projection_kernel__CSR')
-        backprojection_kernel = mod.get_function('backprojection_kernel__CSR')
-        axpby_kernel = mod.get_function("vector_axpby_kernel")
-        minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
+        projection_kernel = SMatrix.sparse_mod.get_function('projection_kernel__CSR')
+        backprojection_kernel = SMatrix.sparse_mod.get_function('backprojection_kernel__CSR')
+        axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
+        minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
 
         # --- Buffer allocation (raw pointers) ---
         y = y.T.flatten().astype(dtype)
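The switch from `drv.module_from_file(...)` to `SMatrix.sparse_mod` means the compiled kernel module is loaded once and reused across solver calls, inside the matrix's own CUDA context, instead of being re-read from `AOT_biomaps_kernels.cubin` on every reconstruction. A sketch of the assumed pattern — the attribute name `sparse_mod` comes from the diff, but the lazy-loading details here are illustrative, not the package's actual code:

    import pycuda.driver as drv

    class SparseSMatrixSketch:
        def __init__(self, ctx):
            self.ctx = ctx           # PyCUDA context the buffers live in
            self._mod = None

        @property
        def sparse_mod(self):
            # Load the cubin once, inside the owning context, then cache it.
            if self._mod is None:
                self.ctx.push()
                try:
                    self._mod = drv.module_from_file("AOT_biomaps_kernels.cubin")
                finally:
                    self.ctx.pop()
            return self._mod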
@@ -231,7 +229,7 @@ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tu
         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
 
         # 6. rho_prev = ||r_0||^2
-        rho_prev = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+        rho_prev = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
 
         # --- Iterative loop ---
         saved_theta, saved_indices = [], []
@@ -258,7 +256,7 @@ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tu
                 block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
 
             # c. alpha = rho_prev / <p, z>
-            pAp = _dot_product_gpu(mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+            pAp = _dot_product_gpu(SMatrix.sparse_mod, p_flat_gpu, z_flat_gpu, ZX, stream)
 
             if abs(pAp) < 1e-15: break
             alpha = rho_prev / pAp
@@ -273,7 +271,7 @@ def _LS_CG_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tu
                 block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
 
             # f. rho_curr = ||r||^2
-            rho_curr = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+            rho_curr = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
 
             if rho_curr < tolerance: break
 
@@ -364,11 +362,10 @@ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, t
         tolerance = 1e-12
 
         # Access the SELL parameters
-        mod = SMatrix.sparse_mod
-        projection_kernel = mod.get_function("projection_kernel__SELL")
-        backprojection_kernel = mod.get_function("backprojection_kernel__SELL")
-        axpby_kernel = mod.get_function("vector_axpby_kernel")
-        minus_axpy_kernel = mod.get_function("vector_minus_axpy_kernel")
+        projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+        backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+        axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
+        minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
         slice_height = np.int32(SMatrix.slice_height)
         grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
 
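For readers unfamiliar with the storage format named here: SELL-C-σ packs rows into slices of `slice_height` rows, pads each slice to its longest row, and stores values column-major within a slice so that consecutive GPU threads read contiguous memory. A NumPy reference of the matrix-vector product, under the assumption that `slice_ptr` holds each slice's element offset and `slice_len` its padded width, with the σ row-sorting step omitted:

    import numpy as np

    def sell_spmv(values, col_inds, slice_ptr, slice_len, x, n_rows, C):
        """out = A @ x for a SELL-C matrix; padding entries must hold value 0."""
        out = np.zeros(n_rows, dtype=values.dtype)
        for s in range((n_rows + C - 1) // C):
            base, width = slice_ptr[s], slice_len[s]
            for r in range(min(C, n_rows - s * C)):       # row inside the slice
                acc = 0.0
                for j in range(width):                    # padded row width of this slice
                    idx = base + j * C + r                # column-major within the slice
                    acc += values[idx] * x[col_inds[idx]]
                out[s * C + r] = acc
        return out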
@@ -416,7 +413,7 @@ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, t
         drv.memcpy_dtod(p_flat_gpu, r_flat_gpu, ZX * np.dtype(dtype).itemsize)
 
         # 6. rho_prev = ||r_0||^2
-        rho_prev = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+        rho_prev = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
 
         # --- Iterative loop ---
         saved_theta, saved_indices = [], []
@@ -443,7 +440,7 @@ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, t
                 block=(block_size, 1, 1), grid=grid_rows, stream=stream)
 
             # c. alpha = rho_prev / <p, z>
-            pAp = _dot_product_gpu(mod, p_flat_gpu, z_flat_gpu, ZX, stream)
+            pAp = _dot_product_gpu(SMatrix.sparse_mod, p_flat_gpu, z_flat_gpu, ZX, stream)
 
             if abs(pAp) < 1e-15: break
             alpha = rho_prev / pAp
@@ -458,7 +455,7 @@ def _LS_CG_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, t
                 block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
 
             # f. rho_curr = ||r||^2
-            rho_curr = _dot_product_gpu(mod, r_flat_gpu, r_flat_gpu, ZX, stream)
+            rho_curr = _dot_product_gpu(SMatrix.sparse_mod, r_flat_gpu, r_flat_gpu, ZX, stream)
 
             if rho_curr < tolerance: break
 
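The steps labelled a-f in these hunks are one iteration of conjugate gradient on the normal equations (A^T A) theta = A^T y, which is what makes this a least-squares solver: z = A^T A p is computed as a projection followed by a backprojection, and rho/alpha/beta follow the standard recurrence. A compact NumPy reference of the same loop, with the 1e-15 and 1e-12 thresholds as in the diff (the dense A here is only a stand-in):

    import numpy as np

    def cg_normal_equations(A, y, num_iters=100, tol=1e-12):
        theta = np.zeros(A.shape[1], dtype=A.dtype)
        r = A.T @ y - A.T @ (A @ theta)   # residual of the normal equations
        p = r.copy()
        rho_prev = float(r @ r)
        for _ in range(num_iters):
            z = A.T @ (A @ p)             # projection then backprojection: z = A^T A p
            pAp = float(p @ z)
            if abs(pAp) < 1e-15:
                break
            alpha = rho_prev / pAp        # c. alpha = rho_prev / <p, z>
            theta += alpha * p
            r -= alpha * z
            rho_curr = float(r @ r)       # f. rho_curr = ||r||^2
            if rho_curr < tol:
                break
            p = r + (rho_curr / rho_prev) * p
            rho_prev = rho_curr
        return theta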
@@ -26,7 +26,6 @@ def MLEM(
     max_saves=5000,
     show_logs=True,
     smatrixType=SMatrixType.SELL,
-    Z=350,
 ):
     """
     Unified MLEM algorithm for Acousto-Optic Tomography.
@@ -59,11 +58,11 @@ def MLEM(
     # Dispatch to the appropriate implementation
     if use_gpu:
         if smatrixType == SMatrixType.CSR:
-            return MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, Z, show_logs)
+            return MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.SELL:
-            return MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+            return MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
         elif smatrixType == SMatrixType.DENSE:
-            return _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs)
+            return _MLEM_single_GPU(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, max_saves, denominator_threshold, show_logs)
         else:
             raise ValueError("Unsupported SMatrixType for GPU MLEM.")
     else:
@@ -229,49 +228,60 @@ def _MLEM_CPU_opti(SMatrix, y, numIterations, isSavingEachIteration, tumor_str,
         print(f"Error in optimized CPU MLEM: {type(e).__name__}: {e}")
         return None, None
 
-def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+def MLEM_sparseCSR_pycuda(
+    SMatrix,
+    y,
+    numIterations,
+    isSavingEachIteration,
+    tumor_str,
+    max_saves,
+    denominator_threshold,
+    show_logs=True,
+):
     """
-    SMatrix: instance of SparseMatrixGPU (already allocated)
-    y: measured data (1D np.float32 of length TN)
-
-    Assumptions:
-    - SMatrix.values_gpu and SMatrix.col_ind_gpu and SMatrix.row_ptr_gpu are device pointers
-    - SMatrix.norm_factor_inv_gpu exists
-    - SMatrix.ctx is the PyCUDA context for the target GPU.
+    Robust MLEM implementation for CSR SMatrix using PyCUDA kernels.
+    Expects SMatrix to be SparseSMatrix_CSR with attributes:
+    - values_gpu, col_ind_gpu, row_ptr_gpu (device pointers)
+    - norm_factor_inv_gpu (device pointer)
+    - sparse_mod (loaded module with kernels)
+    - ctx (PyCUDA context)
+    Returns (saved_theta_list, saved_indices) if isSavingEachIteration else (final_theta, None)
     """
-
-    # We use a final_result placeholder to ensure it's defined outside the try block
     final_result = None
-
+
+    # Local holders to free in finally
+    y_gpu = q_flat_gpu = e_flat_gpu = c_flat_gpu = theta_flat_gpu = None
+
     try:
         if not isinstance(SMatrix, SparseSMatrix_CSR):
             raise TypeError("SMatrix must be a SparseSMatrix_CSR object")
 
-        # --- CONTEXT FIX: Push the context associated with SMatrix ---
-        # This ensures all subsequent PyCUDA operations use the correct GPU/context.
-        if SMatrix.ctx:
+        # push context (if provided)
+        popped_ctx = False
+        if getattr(SMatrix, "ctx", None):
             SMatrix.ctx.push()
-        # -----------------------------------------------------------
+            popped_ctx = True
 
         dtype = np.float32
-        TN = SMatrix.N * SMatrix.T
-        ZX = SMatrix.Z * SMatrix.X
-        # Ensure Z and X are correctly defined for reshaping
-        Z = SMatrix.Z
-        X = SMatrix.X
-
-        if show_logs:
-            # We assume SMatrix was initialized using the correct device index.
-            print(f"Executing on GPU device index: {SMatrix.device.primary_context.device.name()}")
-            print(f"Dim X: {X}, Dim Z: {Z}, TN: {TN}, ZX: {ZX}")
-
-        # streams
+        TN = int(SMatrix.N * SMatrix.T)
+        ZX = int(SMatrix.Z * SMatrix.X)
+        Z = int(SMatrix.Z)
+        X = int(SMatrix.X)
+
+        # Make sure required GPU pointers exist
+        if getattr(SMatrix, "values_gpu", None) is None or getattr(SMatrix, "col_ind_gpu", None) is None or getattr(SMatrix, "row_ptr_gpu", None) is None:
+            raise RuntimeError("SMatrix is missing GPU buffers (values_gpu / col_ind_gpu / row_ptr_gpu)")
+
+        if getattr(SMatrix, "norm_factor_inv_gpu", None) is None:
+            raise RuntimeError("SMatrix.norm_factor_inv_gpu not available on GPU")
+
+        # stream for async operations
         stream = drv.Stream()
 
-        # allocate device buffers
-        y = y.T.flatten().astype(np.float32)
-        y_gpu = drv.mem_alloc(y.nbytes)
-        drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
+        # prepare device buffers
+        y_arr = np.ascontiguousarray(y.T.flatten().astype(np.float32))
+        y_gpu = drv.mem_alloc(y_arr.nbytes)
+        drv.memcpy_htod_async(y_gpu, y_arr, stream)
 
         theta_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
         initial_theta = np.full(ZX, 0.1, dtype=dtype)
@@ -283,62 +293,111 @@ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumo
         e_flat_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize)
         c_flat_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize)
 
-        # Assuming the cubin file is found globally or managed by the caller
-        mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
-        projection_kernel = mod.get_function('projection_kernel__CSR')
-        backprojection_kernel = mod.get_function('backprojection_kernel__CSR')
-        ratio_kernel = mod.get_function('ratio_kernel')
-        update_kernel = mod.get_function('update_theta_kernel')
+        # Ensure kernels exist
+        projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__CSR")
+        backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__CSR")
+        ratio_kernel = SMatrix.sparse_mod.get_function("ratio_kernel")
+        update_kernel = SMatrix.sparse_mod.get_function("update_theta_kernel")
         block_size = 256
 
-        saved_theta, saved_indices = [], []
+        # prepare save indices once
        if numIterations <= max_saves:
             save_indices = list(range(numIterations))
         else:
-            save_indices = list(range(0, numIterations, max(1, numIterations // max_saves)))
+            step = max(1, numIterations // max_saves)
+            save_indices = list(range(0, numIterations, step))
             if save_indices[-1] != numIterations - 1:
                 save_indices.append(numIterations - 1)
 
+        saved_theta = []
+        saved_indices = []
+
         description = f"AOT-BioMaps -- ML-EM (CSR-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+        # grid sizes
+        grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
+        grid_cols = ((ZX + block_size - 1) // block_size, 1, 1)
+
         for it in iterator:
             # projection: q = A * theta
-            projection_kernel(q_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
-                              theta_flat_gpu, np.int32(TN),
-                              block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1),
-                              stream=stream)
+            projection_kernel(
+                q_flat_gpu,
+                SMatrix.values_gpu,
+                SMatrix.row_ptr_gpu,
+                SMatrix.col_ind_gpu,
+                theta_flat_gpu,
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )
 
             # ratio: e = y / max(q, threshold)
-            ratio_kernel(e_flat_gpu, y_gpu, q_flat_gpu, np.float32(denominator_threshold), np.int32(TN),
-                         block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
-
-            # backprojection: c = A^T * e
+            ratio_kernel(
+                e_flat_gpu,
+                y_gpu,
+                q_flat_gpu,
+                np.float32(denominator_threshold),
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )
+
+            # backprojection: c = A^T * e (zero c first)
             drv.memset_d32_async(c_flat_gpu, 0, ZX, stream)
-            backprojection_kernel(c_flat_gpu, SMatrix.values_gpu, SMatrix.row_ptr_gpu, SMatrix.col_ind_gpu,
-                                  e_flat_gpu, np.int32(TN),
-                                  block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+            backprojection_kernel(
+                c_flat_gpu,
+                SMatrix.values_gpu,
+                SMatrix.row_ptr_gpu,
+                SMatrix.col_ind_gpu,
+                e_flat_gpu,
+                np.int32(TN),
+                block=(block_size, 1, 1),
+                grid=grid_rows,
+                stream=stream,
+            )
 
             # update: theta *= norm_factor_inv * c
-            update_kernel(theta_flat_gpu, c_flat_gpu, norm_factor_inv_gpu, np.int32(ZX),
-                          block=(block_size, 1, 1), grid=((ZX + block_size - 1) // block_size, 1, 1), stream=stream)
-
+            update_kernel(
+                theta_flat_gpu,
+                c_flat_gpu,
+                norm_factor_inv_gpu,
+                np.int32(ZX),
+                block=(block_size, 1, 1),
+                grid=grid_cols,
+                stream=stream,
+            )
+
+            # periodic synchronization for stability / logging
             if show_logs and (it % 10 == 0 or it == numIterations - 1):
-                drv.Context.synchronize()
+                stream.synchronize()
 
+            # save snapshot if required
             if isSavingEachIteration and it in save_indices:
+                # ensure kernels finished
+                stream.synchronize()
                 theta_host = np.empty(ZX, dtype=dtype)
                 drv.memcpy_dtoh(theta_host, theta_flat_gpu)
                 saved_theta.append(theta_host.reshape(Z, X))
-                saved_indices.append(it)
-
-        drv.Context.synchronize()
-
-        final_result = np.empty(ZX, dtype=dtype)
-        drv.memcpy_dtoh(final_result, theta_flat_gpu)
-        final_result = final_result.reshape(Z, X)
-
-        # free local allocations
-        y_gpu.free(); q_flat_gpu.free(); e_flat_gpu.free(); c_flat_gpu.free(); theta_flat_gpu.free()
+                saved_indices.append(int(it))
+
+        # make sure everything finished
+        stream.synchronize()
+        final_theta_host = np.empty(ZX, dtype=dtype)
+        drv.memcpy_dtoh(final_theta_host, theta_flat_gpu)
+        final_result = final_theta_host.reshape(Z, X)
+
+        # free local allocations (will also be freed in finally if exception)
+        try:
+            y_gpu.free()
+            q_flat_gpu.free()
+            e_flat_gpu.free()
+            c_flat_gpu.free()
+            theta_flat_gpu.free()
+        except Exception:
+            pass
 
         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
 
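The loop body above is the standard MLEM multiplicative update, theta <- theta * (A^T (y / max(A theta, threshold))) / (A^T 1), with the sensitivity term 1 / (A^T 1) precomputed as norm_factor_inv. A dense NumPy reference of the same iteration on a synthetic matrix — only the variable names mirror the diff:

    import numpy as np

    rng = np.random.default_rng(0)
    A = rng.random((64, 32)).astype(np.float32)                # stand-in system matrix (TN x ZX)
    y = rng.random(64).astype(np.float32)                      # measured data
    theta = np.full(32, 0.1, dtype=np.float32)                 # same 0.1 init as the GPU code
    denominator_threshold = 1e-6
    norm_factor_inv = 1.0 / np.maximum(A.sum(axis=0), 1e-12)   # 1 / (A^T 1)

    for _ in range(100):
        q = A @ theta                                          # projection
        e = y / np.maximum(q, denominator_threshold)           # guarded ratio
        c = A.T @ e                                            # backprojection
        theta *= norm_factor_inv * c                           # multiplicative update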
@@ -346,47 +405,64 @@ def MLEM_sparseCSR_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumo
         print(f"Error in MLEM_sparseCSR_pycuda: {type(e).__name__}: {e}")
         gc.collect()
         return None, None
-
-    finally:
-        # --- CONTEXT FIX: Pop the context ---
-        if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
-            SMatrix.ctx.pop()
-        # ------------------------------------
 
-def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tumor_str, device, max_saves, denominator_threshold, show_logs=True):
+    finally:
+        # free buffers if still allocated
+        for buf in ("y_gpu", "q_flat_gpu", "e_flat_gpu", "c_flat_gpu", "theta_flat_gpu"):
+            try:
+                val = locals().get(buf, None)
+                if val is not None:
+                    val.free()
+            except Exception:
+                pass
+        # pop context safely
+        try:
+            if SMatrix and hasattr(SMatrix, "ctx") and SMatrix.ctx and popped_ctx:
+                SMatrix.ctx.pop()
+        except Exception:
+            pass
+
+def MLEM_sparseSELL_pycuda(
+    SMatrix,
+    y,
+    numIterations,
+    isSavingEachIteration,
+    tumor_str,
+    max_saves,
+    denominator_threshold,
+    show_logs=True,
+):
     """
     MLEM using SELL-C-σ kernels already present on device.
     y must be float32 length TN.
+
+    Clean version: diagnostics removed.
     """
     final_result = None
 
     try:
-        # check if SMatrix is SparseSMatrix_SELL object
         if not isinstance(SMatrix, SparseSMatrix_SELL):
             raise TypeError("SMatrix must be a SparseSMatrix_SELL object")
         if SMatrix.sell_values_gpu is None:
             raise RuntimeError("SELL not built. Call allocate_sell_c_sigma_direct() first.")
-
-        # --- CONTEXT FIX: Push the context associated with SMatrix ---
-        # This ensures all subsequent PyCUDA operations use the correct GPU/context.
+
+        # Context
         if SMatrix.ctx:
             SMatrix.ctx.push()
-        # -----------------------------------------------------------
 
         TN = int(SMatrix.N * SMatrix.T)
         ZX = int(SMatrix.Z * SMatrix.X)
         dtype = np.float32
         block_size = 256
 
-        mod = drv.module_from_file('AOT_biomaps_kernels.cubin')
-        proj = mod.get_function("projection_kernel__SELL")
-        backproj = mod.get_function("backprojection_kernel__SELL")
-        ratio = mod.get_function("ratio_kernel")
-        update = mod.get_function("update_theta_kernel")
+        proj = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+        backproj = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+        ratio = SMatrix.sparse_mod.get_function("ratio_kernel")
+        update = SMatrix.sparse_mod.get_function("update_theta_kernel")
 
         stream = drv.Stream()
 
-        # device buffers
+        # Device buffers
         y = y.T.flatten().astype(np.float32)
         y_gpu = drv.mem_alloc(y.nbytes)
         drv.memcpy_htod_async(y_gpu, y.astype(dtype), stream)
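The `popped_ctx` flag introduced in the CSR path, together with the guarded pop in both `finally` blocks, implements the usual PyCUDA rule: pop only a context you actually pushed, even if an exception fires in between. A minimal sketch of the same discipline as a hypothetical helper, not part of the package:

    from contextlib import contextmanager

    @contextmanager
    def pushed(ctx):
        pushed_here = False
        try:
            if ctx is not None:
                ctx.push()          # enter the GPU context that owns the buffers
                pushed_here = True
            yield
        finally:
            if pushed_here:
                ctx.pop()           # pop only what this block pushed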
@@ -405,6 +481,7 @@ def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tum
         grid_rows = ((TN + block_size - 1) // block_size, 1, 1)
         grid_cols = ((ZX + block_size - 1) // block_size, 1, 1)
 
+        # Prepare save indices
         saved_theta, saved_indices = [], []
         if numIterations <= max_saves:
             save_indices = list(range(numIterations))
@@ -415,52 +492,59 @@ def MLEM_sparseSELL_pycuda(SMatrix, y, numIterations, isSavingEachIteration, tum
 
         description = f"AOT-BioMaps -- ML-EM (SELL-c-σ-sparse SMatrix) ---- {tumor_str} TUMOR ---- GPU {torch.cuda.current_device()}"
         iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+
+        # --- MLEM Loop ---
         for it in iterator:
-            # projection
-            proj(q_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, slice_ptr_gpu, slice_len_gpu,
-                 theta_gpu, np.int32(TN), slice_height,
-                 block=(block_size,1,1), grid=grid_rows, stream=stream)
 
-            # ratio
+            proj(q_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu,
+                 slice_ptr_gpu, slice_len_gpu,
+                 theta_gpu, np.int32(TN), slice_height,
+                 block=(block_size,1,1), grid=grid_rows, stream=stream)
+
             ratio(e_gpu, y_gpu, q_gpu, np.float32(denominator_threshold), np.int32(TN),
-                    block=(block_size,1,1), grid=grid_rows, stream=stream)
+                  block=(block_size,1,1), grid=grid_rows, stream=stream)
 
-            # zero c
             drv.memset_d32_async(c_gpu, 0, ZX, stream)
 
-            # backprojection accumulate
-            backproj(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, slice_ptr_gpu, slice_len_gpu,
-                     e_gpu, c_gpu, np.int32(TN), slice_height,
-                     block=(block_size,1,1), grid=grid_rows, stream=stream)
+            backproj(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu,
+                     slice_ptr_gpu, slice_len_gpu,
+                     e_gpu, c_gpu, np.int32(TN), slice_height,
+                     block=(block_size,1,1), grid=grid_rows, stream=stream)
 
-            # update
             update(theta_gpu, c_gpu, SMatrix.norm_factor_inv_gpu, np.int32(ZX),
-                     block=(block_size,1,1), grid=grid_cols, stream=stream)
+                   block=(block_size,1,1), grid=grid_cols, stream=stream)
 
-            stream.synchronize()
             if isSavingEachIteration and it in save_indices:
                 out = np.empty(ZX, dtype=np.float32)
                 drv.memcpy_dtoh(out, theta_gpu)
                 saved_theta.append(out.reshape((SMatrix.Z, SMatrix.X)))
                 saved_indices.append(it)
 
-        # final copy
+        stream.synchronize()
         res = np.empty(ZX, dtype=np.float32)
         drv.memcpy_dtoh(res, theta_gpu)
 
-        # free temporaries
-        y_gpu.free(); q_gpu.free(); e_gpu.free(); c_gpu.free(); theta_gpu.free()
-
+        # free
+        try:
+            y_gpu.free()
+            q_gpu.free()
+            e_gpu.free()
+            c_gpu.free()
+            theta_gpu.free()
+        except Exception:
+            pass
+
         final_result = res.reshape((SMatrix.Z, SMatrix.X))
         return (saved_theta, saved_indices) if isSavingEachIteration else (final_result, None)
-
+
     except Exception as e:
         print(f"Error in MLEM_sparseSELL_pycuda: {type(e).__name__}: {e}")
         gc.collect()
         return None, None
-
+
     finally:
-        # --- CONTEXT FIX: Pop the context ---
         if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
-            SMatrix.ctx.pop()
-        # ------------------------------------
+            try:
+                SMatrix.ctx.pop()
+            except Exception:
+                pass