AOT-biomaps 2.9.279__py3-none-any.whl → 2.9.300__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of AOT-biomaps might be problematic.

@@ -1,8 +1,10 @@
1
- from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star
1
+ from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star, _call_axpby, _call_minus_axpy, compute_TV_cpu, power_method_estimate_L__SELL, calculate_memory_requirement, check_gpu_memory
2
2
  from AOT_biomaps.Config import config
3
- from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType
3
+ from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType, SMatrixType
4
4
  import torch
5
5
  from tqdm import trange
6
+ import numpy as np
7
+ import pycuda.driver as drv
6
8
 
7
9
  '''
8
10
  This module implements Primal-Dual Hybrid Gradient (PDHG) methods for solving inverse problems in Acousto-Optic Tomography.
@@ -11,6 +13,103 @@ The methods can run on both CPU and GPU, with configurations set in the AOT_biom
11
13
  '''
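For orientation, every reconstruction routine below follows the standard Chambolle-Pock primal-dual scheme for a saddle-point problem min_x max_p <Kx, p> - F*(p) + G(x); in textbook form (paraphrased here, not quoted from the package) the updates read

    p^{k+1}       = prox_{\sigma F^*}(p^k + \sigma K \bar{x}^k)
    x^{k+1}       = prox_{\tau G}(x^k - \tau K^{\top} p^{k+1})
    \bar{x}^{k+1} = x^{k+1} + \theta\,(x^{k+1} - x^k)

with step sizes satisfying \tau \sigma \|K\|^2 \le 1. The parameters theta, tau and sigma used throughout this module play exactly these roles.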
12
14
 
13
15
  def CP_TV(
16
+ SMatrix,
17
+ y,
18
+ alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
19
+ beta=1e-4, # Tikhonov regularization parameter
20
+ theta=1.0,
21
+ numIterations=5000,
22
+ isSavingEachIteration=True,
23
+ L=None,
24
+ withTumor=True,
25
+ device=None,
26
+ max_saves=5000,
27
+ show_logs=True,
28
+ smatrixType=SMatrixType.SELL,
29
+ k_security=0.8,
30
+ use_power_method=True,
31
+ auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
32
+ apply_positivity_clamp=True,
33
+ tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
34
+ use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
35
+ laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
36
+ ):
37
+ # try:
38
+ tumor_str = "WITH" if withTumor else "WITHOUT"
39
+ # Auto-select device and method
40
+ if device is None:
41
+ if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
42
+ device = torch.device(f"cuda:{config.select_best_gpu()}")
43
+ use_gpu = True
44
+ else:
45
+ device = torch.device("cpu")
46
+ use_gpu = False
47
+ else:
48
+ use_gpu = device.type == "cuda"
49
+ # Dispatch to the appropriate implementation
50
+ if use_gpu:
51
+ if smatrixType == SMatrixType.CSR:
52
+ raise NotImplementedError("GPU Chambolle Pock (LS-TV) with CSR not implemented.")
53
+ elif smatrixType == SMatrixType.SELL:
54
+ return CP_TV_Tikhonov_sparseSELL_pycuda(SMatrix, y, alpha,beta, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs, k_security, use_power_method, auto_alpha_gamma, apply_positivity_clamp, tikhonov_as_gradient, use_laplacian, laplacian_beta_scale)
55
+ elif smatrixType == SMatrixType.DENSE:
56
+ return CP_TV_dense(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
57
+ else:
58
+ raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-TV).")
59
+ else:
60
+ raise NotImplementedError("CPU Chambolle Pock (LS-TV) not implemented.")
61
+
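As a usage illustration of the dispatcher above, a call could look like the sketch below; SMatrix and y are placeholders for a previously built system matrix and the measured acousto-optic signals, and the keyword values simply echo the defaults of the signature:

# illustrative sketch only -- SMatrix and y are assumed to have been built elsewhere
recon_frames, saved_iters = CP_TV(
    SMatrix, y,
    alpha=None,                    # auto-scaled as gamma * data_term / tv_term
    beta=1e-4,                     # Tikhonov weight
    numIterations=2000,
    smatrixType=SMatrixType.SELL,  # dispatches to the SELL/pycuda path above
    show_logs=True,
)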
62
+ def CP_KL(
63
+ SMatrix,
64
+ y,
65
+ alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
66
+ beta=1e-4, # Tikhonov regularization parameter
67
+ theta=1.0,
68
+ numIterations=5000,
69
+ isSavingEachIteration=True,
70
+ L=None,
71
+ withTumor=True,
72
+ device=None,
73
+ max_saves=5000,
74
+ show_logs=True,
75
+ smatrixType=SMatrixType.SELL,
76
+ k_security=0.8,
77
+ use_power_method=True,
78
+ auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
79
+ apply_positivity_clamp=True,
80
+ tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
81
+ use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
82
+ laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
83
+ ):
84
+ # try:
85
+ tumor_str = "WITH" if withTumor else "WITHOUT"
86
+ # Auto-select device and method
87
+ if device is None:
88
+ if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
89
+ device = torch.device(f"cuda:{config.select_best_gpu()}")
90
+ use_gpu = True
91
+ else:
92
+ device = torch.device("cpu")
93
+ use_gpu = False
94
+ else:
95
+ use_gpu = device.type == "cuda"
96
+ # Dispatch to the appropriate implementation
97
+ if use_gpu:
98
+ if smatrixType == SMatrixType.CSR:
99
+ raise NotImplementedError("GPU Chambolle Pock (LS-KL) with CSR not implemented.")
100
+ elif smatrixType == SMatrixType.SELL:
101
+ raise NotImplementedError("GPU Chambolle Pock (LS-KL) with SELL not implemented.")
102
+ elif smatrixType == SMatrixType.DENSE:
103
+ return CP_KL(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
104
+ else:
105
+ raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-KL).")
106
+ else:
107
+ raise NotImplementedError("CPU Chambolle Pock (LS-KL) not implemented.")
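For reference, the KL data-fidelity targeted by this entry point is the usual Poisson-noise divergence (standard definition, not quoted from the package):

    D_{KL}(y \,\|\, Ax) = \sum_i \big[ (Ax)_i - y_i + y_i \log\tfrac{y_i}{(Ax)_i} \big],

which replaces the least-squares data term used by CP_TV, with the same alpha/beta regularization parameters.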
108
+
109
+
110
+
111
+
112
+ def CP_TV_dense(
14
113
  SMatrix,
15
114
  y,
16
115
  alpha=1e-1,
@@ -21,6 +120,7 @@ def CP_TV(
21
120
  withTumor=True,
22
121
  device=None,
23
122
  max_saves=5000,
123
+ show_logs=True,
24
124
  ):
25
125
  """
26
126
  Chambolle-Pock algorithm for Total Variation (TV) regularization.
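(Here TV denotes the discrete total variation; given the per-pixel proj_l2 dual projection, this is presumably the isotropic form TV(x) = \sum_{i,j} \|(\nabla x)_{i,j}\|_2.)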
@@ -92,10 +192,10 @@ def CP_TV(
92
192
  # Description for progress bar
93
193
  tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
94
194
  device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
95
- description = f"AOT-BioMaps -- Primal/Dual Reconstruction (TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
195
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
96
196
 
97
- # Main loop
98
- for iteration in trange(numIterations, desc=description):
197
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
198
+ for it in iterator:
99
199
  # Update p (TV proximal step)
100
200
  grad_x = gradient(x_tilde.reshape(Z, X))
101
201
  p = proj_l2(p + sigma * grad_x, alpha)
@@ -113,9 +213,9 @@ def CP_TV(
113
213
  x_tilde = x + theta * (x - x_old)
114
214
 
115
215
  # Save intermediate result if needed
116
- if isSavingEachIteration and iteration in save_indices:
216
+ if isSavingEachIteration and it in save_indices:
117
217
  I_reconMatrix.append(x.reshape(Z, X).clone() * (norm_y / norm_A))
118
- saved_indices.append(iteration)
218
+ saved_indices.append(it)
119
219
 
120
220
  # Return results
121
221
  if isSavingEachIteration:
@@ -123,6 +223,337 @@ def CP_TV(
123
223
  else:
124
224
  return (x.reshape(Z, X) * (norm_y / norm_A)).cpu().numpy(), None
125
225
 
226
+ def CP_TV_Tikhonov_sparseSELL_pycuda(
227
+ SMatrix,
228
+ y,
229
+ alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
230
+ beta=1e-4, # Tikhonov regularization parameter
231
+ theta=1.0,
232
+ numIterations=2000,
233
+ isSavingEachIteration=True,
234
+ L=None,
235
+ tumor_str="",
236
+ device=None,
237
+ max_saves=2000,
238
+ show_logs=True,
239
+ k_security=0.8,
240
+ use_power_method=True,
241
+ auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
242
+ apply_positivity_clamp=True,
243
+ tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
244
+ use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
245
+ laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
246
+ ):
247
+
248
+ """
249
+ CP-TV + Tikhonov + Laplacian (Hessian scalar) penalty integrated.
250
+ Returns (I_reconMatrix, saved_indices) if isSavingEachIteration else (x_final, None).
251
+ """
252
+ # ----- begin main -----
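Loosely speaking (the exact weighting of the Laplacian term depends on laplacian_beta_scale and on which kernels are available), the iterations implemented here target a composite objective of the form

    \min_x \; \tfrac{1}{2}\|Ax - y\|_2^2 + \alpha\,\mathrm{TV}(x) + \beta\,\|x\|_2^2 + \beta_{\mathrm{lap}}\,\mathcal{R}(\Delta x),

where \mathcal{R} is a quadratic penalty on the discrete Laplacian \Delta x. The step sizes are set to \tau = \sigma = k_{security}/L with L = \sqrt{\|A\|^2 + 8}; the constant 8 is the usual upper bound on the squared operator norm of the 2-D forward-difference gradient.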
253
+ if SMatrix.ctx:
254
+ SMatrix.ctx.push()
255
+
256
+ # prepare variables
257
+ dtype = np.float32
258
+ TN = int(SMatrix.N * SMatrix.T)
259
+ ZX = int(SMatrix.Z * SMatrix.X)
260
+ Z, X = SMatrix.Z, SMatrix.X
261
+ block_size = 256
262
+
263
+ # existing kernels
264
+ projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
265
+ backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
266
+ axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
267
+ minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
268
+ gradient_kernel = SMatrix.sparse_mod.get_function("gradient_kernel")
269
+ divergence_kernel = SMatrix.sparse_mod.get_function("divergence_kernel")
270
+ proj_tv_kernel = SMatrix.sparse_mod.get_function("proj_tv_kernel")
271
+
272
+ # optional kernels (laplacian & clamp)
273
+ has_laplacian = False
274
+ has_clamp_kernel = False
275
+ try:
276
+ laplacian_kernel = SMatrix.sparse_mod.get_function("laplacian_kernel")
277
+ laplacian_adj_kernel = SMatrix.sparse_mod.get_function("laplacian_adj_kernel")
278
+ has_laplacian = True
279
+ except Exception:
280
+ has_laplacian = False
281
+
282
+ try:
283
+ clamp_positive_kernel = SMatrix.sparse_mod.get_function("clamp_positive_kernel")
284
+ has_clamp_kernel = True
285
+ except Exception:
286
+ has_clamp_kernel = False
287
+
288
+ stream = drv.Stream()
289
+
290
+ # estimate L operator norm if needed
291
+ if use_power_method or L is None:
292
+ L_LS_sq = power_method_estimate_L__SELL(SMatrix, stream, n_it=20, block_size=block_size)
293
+ L_nabla_sq = 8.0
294
+ L_op_norm = np.sqrt(L_LS_sq + L_nabla_sq)
295
+ if L_op_norm < 1e-6:
296
+ L_op_norm = 1.0
297
+ else:
298
+ L_op_norm = L
299
+
300
+ tau = np.float32(k_security / L_op_norm)
301
+ sigma = np.float32(k_security / L_op_norm)
302
+
303
+ # prepare y and normalization
304
+ y = y.T.astype(dtype).reshape(-1)
305
+ maxy = float(np.max(np.abs(y))) if y.size > 0 else 0.0
306
+ if maxy > 0:
307
+ y_normed = (y / maxy).copy()
308
+ else:
309
+ y_normed = y.copy()
310
+
311
+ # GPU allocations
312
+ bufs = []
313
+ y_gpu = drv.mem_alloc(y_normed.nbytes); bufs.append(y_gpu)
314
+ drv.memcpy_htod_async(y_gpu, y_normed.T.flatten(), stream)
315
+
316
+ x_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_gpu)
317
+ drv.memset_d32_async(x_gpu, 0, ZX, stream)
318
+ x_old_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_old_gpu)
319
+ x_tilde_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_tilde_gpu)
320
+ drv.memcpy_dtod_async(x_tilde_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
321
+
322
+ p_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(p_gpu)
323
+ q_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(q_gpu)
324
+ drv.memset_d32_async(p_gpu, 0, 2 * ZX, stream)
325
+ drv.memset_d32_async(q_gpu, 0, TN, stream)
326
+
327
+ grad_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(grad_gpu)
328
+ div_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(div_gpu)
329
+ Ax_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(Ax_gpu)
330
+ ATq_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(ATq_gpu)
331
+ zero_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(zero_gpu)
332
+ drv.memset_d32_async(zero_gpu, 0, ZX, stream)
333
+
334
+ # Laplacian buffers (if enabled and kernel available)
335
+ use_lap = use_laplacian and has_laplacian and (beta > 0)
336
+ if use_lap:
337
+ lap_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(lap_gpu)
338
+ r_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(r_gpu)
339
+ drv.memset_d32_async(r_gpu, 0, ZX, stream)
340
+ # scalar beta for laplacian (allow separate scale)
341
+ beta_lap = float(beta) * float(laplacian_beta_scale)
342
+ inv_1_plus_sigma_beta = np.float32(1.0 / (1.0 + float(sigma) * beta_lap))
343
+
344
+ # host buffers for logs
345
+ x_host = np.empty(ZX, dtype=dtype)
346
+ Ax_host = np.empty(TN, dtype=dtype)
347
+ q_host = np.empty(TN, dtype=dtype)
348
+ p_host = np.empty(2 * ZX, dtype=dtype)
349
+ ATq_host = np.empty(ZX, dtype=dtype)
350
+
351
+ # compute initial backprojection for auto-alpha
352
+ drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
353
+ backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
354
+ y_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
355
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
356
+ stream.synchronize()
357
+ drv.memcpy_dtoh(x_host, ATq_gpu)
358
+
359
+ # auto alpha if requested
360
+ if alpha is None:
361
+ drv.memcpy_htod_async(x_gpu, x_host, stream)
362
+ projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
363
+ x_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
364
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
365
+ stream.synchronize()
366
+ drv.memcpy_dtoh(Ax_host, Ax_gpu)
367
+ resid = Ax_host - y_normed[:TN]
368
+ data_term = 0.5 * float(np.dot(resid, resid))
369
+ tv_term = float(compute_TV_cpu(x_host, Z, X)) + 1e-12
370
+ alpha = float(auto_alpha_gamma * data_term / tv_term)
371
+ if show_logs:
372
+ print(f"[auto-alpha] data_term={data_term:.6e}, tv_term={tv_term:.6e}, alpha_set={alpha:.6e}")
373
+
374
+ # tikhonov prox multiplicative scale
375
+ if tikhonov_as_gradient:
376
+ tikh_scale = None
377
+ else:
378
+ tikh_scale = np.float32(1.0 / (1.0 + 2.0 * tau * beta)) if beta > 0 else np.float32(1.0)
379
+
380
+ # saving policy
381
+ if numIterations <= max_saves:
382
+ save_indices_all = list(range(0, numIterations + 1))
383
+ else:
384
+ step = max(1, numIterations // max_saves)
385
+ save_indices_all = list(range(0, numIterations + 1, step))
386
+
387
+ device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
388
+ if show_logs:
389
+ if (alpha is None or alpha == 0) and (beta is None or beta == 0):
390
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} lap_enabled={use_lap}")
391
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS) -- {tumor_str} -- {device_str}"
392
+ if alpha is None or alpha == 0:
393
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} beta={beta:.4e} lap_enabled={use_lap}")
394
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-Tikhonov) -- {tumor_str} -- {device_str}"
395
+ elif beta is None or beta == 0:
396
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
397
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) -- {tumor_str} -- {device_str}"
398
+ else:
399
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
400
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV-Tikhonov) -- {tumor_str} -- {device_str}"
401
+
402
+ I_reconMatrix = []
403
+ saved_indices = []
404
+ if isSavingEachIteration and 0 in save_indices_all:
405
+ drv.memcpy_dtoh(x_host, x_gpu)
406
+ x0 = x_host.reshape((Z, X)).copy()
407
+ if maxy > 0:
408
+ x0 *= maxy
409
+ I_reconMatrix.append(x0)
410
+ saved_indices.append(0)
411
+
412
+ # main loop
413
+ try:
414
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
415
+ for it in iterator:
416
+ # 1) dual p update (TV)
417
+ gradient_kernel(grad_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
418
+ block=(block_size, 1, 1),
419
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
420
+ stream=stream)
421
+ _call_axpby(axpby_kernel, p_gpu, p_gpu, grad_gpu, 1.0, sigma, 2 * ZX, stream, block_size)
422
+ proj_tv_kernel(p_gpu, np.float32(alpha), np.int32(ZX),
423
+ block=(block_size, 1, 1),
424
+ grid=((ZX + block_size - 1) // block_size, 1, 1),
425
+ stream=stream)
426
+
427
+ # 2) dual q update (data fidelity)
428
+ projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
429
+ x_tilde_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
430
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
431
+ _call_axpby(axpby_kernel, Ax_gpu, Ax_gpu, y_gpu, 1.0, -1.0, TN, stream, block_size)
432
+ _call_axpby(axpby_kernel, q_gpu, q_gpu, Ax_gpu, 1.0 / (1.0 + sigma), sigma / (1.0 + sigma), TN, stream, block_size)
433
+
434
+ # optional Laplacian dual update
435
+ if use_lap:
436
+ # compute Laplacian of x_tilde -> lap_gpu
437
+ laplacian_kernel(lap_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
438
+ block=(block_size, 1, 1),
439
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
440
+ stream=stream)
441
+ # r = r + sigma * lap
442
+ _call_axpby(axpby_kernel, r_gpu, r_gpu, lap_gpu, 1.0, sigma, ZX, stream, block_size)
443
+ # r = r / (1 + sigma * beta_lap)
444
+ _call_axpby(axpby_kernel, r_gpu, r_gpu, zero_gpu, inv_1_plus_sigma_beta, 0.0, ZX, stream, block_size)
445
+
446
+ # 3) primal x update
447
+ drv.memcpy_dtod_async(x_old_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
448
+ divergence_kernel(div_gpu, p_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
449
+ block=(block_size, 1, 1),
450
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
451
+ stream=stream)
452
+ drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
453
+ backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
454
+ q_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
455
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
456
+ # ATq - div
457
+ _call_minus_axpy(minus_axpy_kernel, ATq_gpu, div_gpu, 1.0, ZX, stream, block_size)
458
+
459
+ # if laplacian is used, add H^T r into ATq
460
+ if use_lap:
461
+ # compute laplacian_adj_kernel(temp, r)
462
+ # reuse grad_gpu as a temporary buffer (its TV-step gradient is not needed again until the next iteration)
463
+ laplacian_adj_kernel(grad_gpu, r_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
464
+ block=(block_size, 1, 1),
465
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
466
+ stream=stream)
467
+ # ATq_gpu += temp (grad_gpu)
468
+ _call_axpby(axpby_kernel, ATq_gpu, ATq_gpu, grad_gpu, 1.0, 1.0, ZX, stream, block_size)
469
+
470
+ # x = x_old - tau * ATq_buffer
471
+ _call_minus_axpy(minus_axpy_kernel, x_gpu, ATq_gpu, tau, ZX, stream, block_size)
472
+
473
+ # Tikhonov
474
+ if beta > 0:
475
+ if tikhonov_as_gradient:
476
+ mul = 1.0 - 2.0 * float(tau) * float(beta)
477
+ if mul <= 0.0:
478
+ # fall back to the stable multiplicative prox scaling
479
+ fallback_scale = np.float32(1.0 / (1.0 + 2.0 * float(tau) * float(beta)))
480
+ _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, fallback_scale, 0.0, ZX, stream, block_size)
481
+ else:
482
+ # x *= mul => implemented as axpby: out = 1* x + (mul-1)*x
483
+ _call_axpby(axpby_kernel, x_gpu, x_gpu, x_gpu, 1.0, np.float32(mul - 1.0), ZX, stream, block_size)
484
+ else:
485
+ _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, tikh_scale, np.float32(0.0), ZX, stream, block_size)
486
+
487
+ # positivity clamp (prefer GPU kernel if available)
488
+ if apply_positivity_clamp:
489
+ if has_clamp_kernel:
490
+ # in-place clamp on GPU
491
+ clamp_positive_kernel(x_gpu, np.int32(ZX),
492
+ block=(block_size, 1, 1),
493
+ grid=((ZX + block_size - 1) // block_size, 1, 1),
494
+ stream=stream)
495
+ else:
496
+ # fallback CPU roundtrip (slower)
497
+ stream.synchronize()
498
+ drv.memcpy_dtoh(x_host, x_gpu)
499
+ np.maximum(x_host, 0.0, out=x_host)
500
+ drv.memcpy_htod_async(x_gpu, x_host, stream)
501
+
502
+ # extrapolation
503
+ _call_axpby(axpby_kernel, x_tilde_gpu, x_gpu, x_old_gpu, np.float32(1.0 + theta), np.float32(-theta), ZX, stream, block_size)
504
+
505
+ # saves
506
+ if isSavingEachIteration and (it + 1) in save_indices_all:
507
+ stream.synchronize()
508
+ drv.memcpy_dtoh(x_host, x_gpu)
509
+ x_saved = x_host.reshape((Z, X)).copy()
510
+ if maxy > 0:
511
+ x_saved *= maxy
512
+ I_reconMatrix.append(x_saved)
513
+ saved_indices.append(it + 1)
514
+
515
+ stream.synchronize()
516
+ drv.memcpy_dtoh(x_host, x_gpu)
517
+ x_final = x_host.reshape((Z, X)).copy()
518
+ if maxy > 0:
519
+ x_final *= maxy
520
+ if isSavingEachIteration and len(I_reconMatrix):
521
+ for i in range(len(I_reconMatrix)):
522
+ I_reconMatrix[i] *= maxy
523
+
524
+ # free buffers
525
+ for buff in bufs:
526
+ try:
527
+ buff.free()
528
+ except:
529
+ pass
530
+
531
+ if SMatrix.ctx:
532
+ SMatrix.ctx.pop()
533
+
534
+ if isSavingEachIteration:
535
+ return I_reconMatrix, saved_indices
536
+ else:
537
+ return x_final, None
538
+
539
+ except Exception as e:
540
+ # cleanup robustly
541
+ print("Error in CP_TV_Tikhonov+Lap (robust):", e)
542
+ try:
543
+ for buff in bufs:
544
+ try:
545
+ buff.free()
546
+ except:
547
+ pass
548
+ except:
549
+ pass
550
+ try:
551
+ if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
552
+ SMatrix.ctx.pop()
553
+ except:
554
+ pass
555
+ raise
556
+
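The two Tikhonov modes above reduce to simple rescalings of the primal iterate:

    x \leftarrow (1 - 2\tau\beta)\,x        (tikhonov_as_gradient=True, explicit gradient step)
    x \leftarrow x / (1 + 2\tau\beta)        (default proximal step, unconditionally stable)

which is why the gradient mode falls back to the proximal scaling whenever 1 - 2\tau\beta \le 0.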
126
557
 
127
558
  def CP_KL(
128
559
  SMatrix,
@@ -132,9 +563,10 @@ def CP_KL(
132
563
  numIterations=5000,
133
564
  isSavingEachIteration=True,
134
565
  L=None,
135
- withTumor=True,
566
+ tumor_str="",
136
567
  device=None,
137
568
  max_saves=5000,
569
+ show_logs=True,
138
570
  ):
139
571
  """
140
572
  Chambolle-Pock algorithm for Kullback-Leibler (KL) divergence regularization.
@@ -193,12 +625,11 @@ def CP_KL(
193
625
  saved_indices = [0]
194
626
 
195
627
  # Description for progress bar
196
- tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
197
628
  device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
198
629
  description = f"AOT-BioMaps -- Primal/Dual Reconstruction (KL) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
199
630
 
200
- # Main loop
201
- for iteration in trange(numIterations, desc=description):
631
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
632
+ for iteration in iterator:
202
633
  # Update q (proximal step for F*)
203
634
  q = prox_F_star(q + sigma * P(x_tilde) - sigma * y_flat, sigma, y_flat)
204
635
 
@@ -224,27 +224,20 @@ class SparseSMatrix_CSR:
224
224
  def getMatrixSize(self):
225
225
  """
226
226
  Returns the total size of the CSR matrix in GB (summing the GPU memory).
227
+ Uses the stored size attributes to work around the AttributeError raised by DeviceAllocation.
227
228
  """
229
+ # Note: the caller must ensure that self.row_ptr exists before this call.
228
230
  if self.row_ptr is None:
229
231
  return {"error": "The sparse matrix has not been allocated yet."}
230
232
 
231
233
  total_bytes = 0
232
-
233
- # GPU memory (row_ptr_gpu, col_ind_gpu, values_gpu, norm_factor_inv_gpu)
234
- if hasattr(self, 'row_ptr_gpu') and self.row_ptr_gpu:
235
- total_bytes += self.row_ptr_gpu.size
236
- if hasattr(self, 'col_ind_gpu') and self.col_ind_gpu:
237
- total_bytes += self.col_ind_gpu.size
238
- if hasattr(self, 'values_gpu') and self.values_gpu:
239
- total_bytes += self.values_gpu.size
240
- if hasattr(self, 'norm_factor_inv_gpu') and self.norm_factor_inv_gpu:
241
- total_bytes += self.norm_factor_inv_gpu.size
242
-
243
- # NOTE: Previous versions used the .size attribute of the DeviceAllocation object,
244
- # which was problematic. If the error reappears here, the size in bytes will have to
245
- # be stored, as was done for SELL.
246
- # For now, the original CSR getMatrixSize method is kept.
247
234
 
235
+ # Sum of the stored sizes (computed and assigned in allocate and compute_norm_factor_from_csr)
236
+ total_bytes += getattr(self, 'row_ptr_gpu_size', 0)
237
+ total_bytes += getattr(self, 'col_ind_gpu_size', 0)
238
+ total_bytes += getattr(self, 'values_gpu_size', 0)
239
+ total_bytes += getattr(self, 'norm_factor_inv_gpu_size', 0)
240
+
248
241
  return total_bytes / (1024**3)
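The fix above works by recording the byte count of every buffer next to its allocation, since pycuda's DeviceAllocation objects do not expose the size of the underlying memory. A minimal sketch of that pattern (the helper below is hypothetical, not part of the package; only the *_gpu / *_gpu_size attribute naming mirrors the code above):

import numpy as np
import pycuda.driver as drv  # assumes a CUDA context is already active (e.g. via pycuda.autoinit)

def _alloc_and_track(obj, name, host_array):
    # hypothetical helper: allocate device memory, upload the data, remember its size in bytes
    buf = drv.mem_alloc(host_array.nbytes)
    drv.memcpy_htod(buf, host_array)
    setattr(obj, name + "_gpu", buf)
    setattr(obj, name + "_gpu_size", host_array.nbytes)  # later summed by getMatrixSize
    return buf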
249
242
 
250
243
  def free(self):
@@ -92,13 +92,11 @@ class SparseSMatrix_SELL:
92
92
  def allocate(self):
93
93
  """
94
94
  Build SELL-C-σ directly from manip AcousticFields in streaming blocks.
95
- NOTE: This is the logic of allocate_sell_c_sigma_direct from the working class.
95
+ Corrected: per-block row_nnz copy, zeroing of host block, proper sync.
96
96
  """
97
97
  if self.sparse_mod is None:
98
98
  raise RuntimeError("CUDA module not loaded. Check compilation.")
99
99
 
100
- # NOTE: Les noms de kernel (count_nnz_rows_kernel, fill_kernel__SELL) sont utilisés
101
- # car ils sont présents dans la classe fonctionnelle.
102
100
  count_kernel = self.sparse_mod.get_function("count_nnz_rows_kernel")
103
101
  fill_kernel = self.sparse_mod.get_function("fill_kernel__SELL")
104
102
 
@@ -106,16 +104,14 @@ class SparseSMatrix_SELL:
106
104
  num_cols = int(self.Z * self.X)
107
105
  C = int(self.slice_height)
108
106
 
109
- # host temporary block
110
107
  br = int(self.block_rows)
111
- bytes_per_elem = np.dtype(np.float32).itemsize
112
108
  dense_host = np.empty((br, num_cols), dtype=np.float32)
113
109
 
114
- # Allocation 1: Dense block GPU memory
110
+ # Allocate the dense block buffer on the device (br * num_cols float32 values)
115
111
  dense_gpu_size = dense_host.nbytes
116
112
  dense_gpu = drv.mem_alloc(dense_gpu_size)
117
113
 
118
- # 1) count nnz per row (on host via small blocks with GPU kernel)
114
+ # 1) count nnz per row (per block)
119
115
  row_nnz = np.zeros(num_rows, dtype=np.int32)
120
116
  row_nnz_gpu_block_size = br * np.dtype(np.int32).itemsize
121
117
  row_nnz_gpu_block = drv.mem_alloc(row_nnz_gpu_block_size)
@@ -123,17 +119,19 @@ class SparseSMatrix_SELL:
123
119
  block = 256
124
120
  for b in trange(0, num_rows, br, desc="Count NNZ per row"):
125
121
  R = min(br, num_rows - b)
126
- # fill dense_host
122
+ # zero the host block to avoid garbage in tail when R < br
123
+ dense_host.fill(0.0)
127
124
  for i in range(R):
128
125
  rg = b + i
129
126
  n_idx = rg // self.T
130
127
  t_idx = rg % self.T
131
128
  dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
132
- # copy only R rows
129
+ # copy whole buffer (safe because we zeroed tail)
133
130
  drv.memcpy_htod(dense_gpu, dense_host)
134
131
  grid = ((R + block - 1) // block, 1, 1)
135
132
  count_kernel(dense_gpu, row_nnz_gpu_block, np.int32(R), np.int32(num_cols), np.float32(self.relative_threshold),
136
- block=(block,1,1), grid=grid)
133
+ block=(block,1,1), grid=grid)
134
+ drv.Context.synchronize()
137
135
  tmp = np.empty(R, dtype=np.int32)
138
136
  drv.memcpy_dtoh(tmp, row_nnz_gpu_block)
139
137
  row_nnz[b:b+R] = tmp
@@ -148,7 +146,6 @@ class SparseSMatrix_SELL:
148
146
  r0 = s * C
149
147
  r1 = min(num_rows, r0 + C)
150
148
  slice_len[s] = int(np.max(row_nnz[r0:r1])) if (r1>r0) else 0
151
- # slice_ptr (int64)
152
149
  slice_ptr = np.zeros(num_slices + 1, dtype=np.int64)
153
150
  for s in range(num_slices):
154
151
  slice_ptr[s+1] = slice_ptr[s] + (slice_len[s] * C)
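The slice bookkeeping just above is easy to reproduce in isolation. A toy example with invented row counts (C rows per slice, each slice padded to its longest row; slice_ptr[-1] is the total padded storage from which the values/colinds buffers are sized further down):

import numpy as np

C = 4                                                          # slice height
row_nnz = np.array([3, 1, 2, 0, 5, 4, 4, 1], dtype=np.int32)   # toy per-row nnz counts
num_slices = (len(row_nnz) + C - 1) // C

slice_len = np.array([int(row_nnz[s*C:(s+1)*C].max()) for s in range(num_slices)], dtype=np.int32)
slice_ptr = np.zeros(num_slices + 1, dtype=np.int64)
for s in range(num_slices):
    slice_ptr[s+1] = slice_ptr[s] + slice_len[s] * C           # padded storage per slice

print(slice_len, slice_ptr)   # [3 5] [ 0 12 32] -> 32 padded entries in values/colinds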
@@ -160,9 +157,14 @@ class SparseSMatrix_SELL:
160
157
  self.sell_values_gpu_size = total_storage * np.dtype(np.float32).itemsize
161
158
  self.sell_colinds_gpu_size = total_storage * np.dtype(np.uint32).itemsize
162
159
 
160
+ # allocate and optionally zero them
163
161
  self.sell_values_gpu = drv.mem_alloc(self.sell_values_gpu_size)
162
+ # Zero the values buffer so padded entries never contain leftover device data
163
+ drv.memset_d32(self.sell_values_gpu, 0, total_storage)
164
+
164
165
  self.sell_colinds_gpu = drv.mem_alloc(self.sell_colinds_gpu_size)
165
-
166
+ drv.memset_d32(self.sell_colinds_gpu, 0, total_storage)
167
+
166
168
  # allocate slice metadata on device
167
169
  self.slice_ptr = slice_ptr
168
170
  self.slice_len = slice_len
@@ -177,29 +179,28 @@ class SparseSMatrix_SELL:
177
179
  drv.memcpy_htod(self.slice_len_gpu, self.slice_len)
178
180
 
179
181
  # 3) fill SELL arrays by streaming blocks again (use GPU fill kernel)
180
- # reuse dense_host and allocate new dense_gpu
181
182
  dense_host = np.empty((br, num_cols), dtype=np.float32)
183
+ dense_gpu = drv.mem_alloc(dense_host.nbytes)
182
184
 
183
- dense_gpu_2_size = dense_host.nbytes
184
- dense_gpu = drv.mem_alloc(dense_gpu_2_size)
185
-
186
- # we also need row_nnz on device per-block; supply global row_nnz on host but the kernel recomputes threshold
187
- row_nnz_host_gpu_size = br * np.dtype(np.int32).itemsize
188
- row_nnz_host_gpu = drv.mem_alloc(row_nnz_host_gpu_size)
185
+ # For per-block row_nnz pointer we allocate a buffer of max block size once, then reuse
186
+ row_nnz_host_gpu = drv.mem_alloc(br * np.dtype(np.int32).itemsize)
189
187
 
190
188
  for b in trange(0, num_rows, br, desc="Fill SELL"):
191
189
  R = min(br, num_rows - b)
190
+ dense_host.fill(0.0)
192
191
  for i in range(R):
193
192
  rg = b + i
194
193
  n_idx = rg // self.T
195
194
  t_idx = rg % self.T
196
195
  dense_host[i, :] = self.manip.AcousticFields[n_idx].field[t_idx].flatten()
196
+ # copy host block
197
197
  drv.memcpy_htod(dense_gpu, dense_host)
198
- # We pass a dummy row_nnz pointer (not used in this kernel; left for API)
199
- # Kernel expects rows_in_block, rows_global_offset to know where to write.
198
+ # copy corresponding row_nnz slice (only R entries)
199
+ drv.memcpy_htod(row_nnz_host_gpu, row_nnz[b:b+R])
200
+
200
201
  grid = ((R + block - 1) // block, 1, 1)
201
202
  fill_kernel(dense_gpu,
202
- np.intp(0), # placeholder for row_nnz pointer (not used)
203
+ row_nnz_host_gpu,
203
204
  self.slice_ptr_gpu,
204
205
  self.slice_len_gpu,
205
206
  self.sell_colinds_gpu,
@@ -210,12 +211,14 @@ class SparseSMatrix_SELL:
210
211
  np.int32(C),
211
212
  np.float32(self.relative_threshold),
212
213
  block=(block,1,1), grid=grid)
214
+ drv.Context.synchronize()
215
+
213
216
  dense_gpu.free()
214
217
  row_nnz_host_gpu.free()
215
218
 
216
219
  # 4) compute norm_factor_inv via GPU accumulate (col sums)
217
220
  self.compute_norm_factor()
218
-
221
+
219
222
  def apply_apodization_gpu(self, window_vector_gpu):
220
223
  """
221
224
  Apply the apodization window directly to self.sell_values_gpu