AOT-biomaps 2.9.261-py3-none-any.whl → 2.9.318-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -1,8 +1,10 @@
- from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star
+ from AOT_biomaps.AOT_Recon.ReconTools import power_method, gradient, div, proj_l2, prox_G, prox_F_star, _call_axpby, _call_minus_axpy, compute_TV_cpu, power_method_estimate_L__SELL, calculate_memory_requirement, check_gpu_memory
  from AOT_biomaps.Config import config
- from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType
+ from AOT_biomaps.AOT_Recon.ReconEnums import NoiseType, SMatrixType
  import torch
  from tqdm import trange
+ import numpy as np
+ import pycuda.driver as drv

  '''
  This module implements Primal-Dual Hybrid Gradient (PDHG) methods for solving inverse problems in Acousto-Optic Tomography.
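A note for orientation before the function-level changes below: every routine touched in this diff follows the same Chambolle-Pock (PDHG) template of a dual ascent step, a primal descent step, and an extrapolation step. The sketch below is a generic textbook formulation, not code from the package; K, K_adjoint, prox_F_star, prox_G and the step sizes are placeholders for whatever forward operator and proximal maps a given variant plugs in.

    import numpy as np

    def chambolle_pock(K, K_adjoint, prox_F_star, prox_G, x0, tau, sigma, theta=1.0, n_iter=100):
        """Generic PDHG iteration for min_x F(Kx) + G(x); illustrative only."""
        x = x0.copy()
        x_bar = x0.copy()
        dual = np.zeros_like(K(x0))
        for _ in range(n_iter):
            dual = prox_F_star(dual + sigma * K(x_bar), sigma)   # dual ascent step
            x_old = x
            x = prox_G(x - tau * K_adjoint(dual), tau)           # primal descent step
            x_bar = x + theta * (x - x_old)                      # extrapolation
        return x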
@@ -11,6 +13,103 @@ The methods can run on both CPU and GPU, with configurations set in the AOT_biom
  '''

  def CP_TV(
+ SMatrix,
+ y,
+ alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
+ beta=1e-4, # Tikhonov regularization parameter
+ theta=1.0,
+ numIterations=5000,
+ isSavingEachIteration=True,
+ L=None,
+ withTumor=True,
+ device=None,
+ max_saves=5000,
+ show_logs=True,
+ smatrixType=SMatrixType.SELL,
+ k_security=0.8,
+ use_power_method=True,
+ auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
+ apply_positivity_clamp=True,
+ tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
+ use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
+ laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
+ ):
+ # try:
+ tumor_str = "WITH" if withTumor else "WITHOUT"
+ # Auto-select device and method
+ if device is None:
+ if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+ device = torch.device(f"cuda:{config.select_best_gpu()}")
+ use_gpu = True
+ else:
+ device = torch.device("cpu")
+ use_gpu = False
+ else:
+ use_gpu = device.type == "cuda"
+ # Dispatch to the appropriate implementation
+ if use_gpu:
+ if smatrixType == SMatrixType.CSR:
+ raise NotImplementedError("GPU Chambolle Pock (LS-TV) with CSR not implemented.")
+ elif smatrixType == SMatrixType.SELL:
+ return CP_TV_Tikhonov_sparseSELL_pycuda(SMatrix, y, alpha,beta, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs, k_security, use_power_method, auto_alpha_gamma, apply_positivity_clamp, tikhonov_as_gradient, use_laplacian, laplacian_beta_scale)
+ elif smatrixType == SMatrixType.DENSE:
+ return CP_TV_dense(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
+ else:
+ raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-TV).")
+ else:
+ raise NotImplementedError("CPU Chambolle Pock (LS-TV) not implemented.")
+
+ def CP_KL(
+ SMatrix,
+ y,
+ alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
+ beta=1e-4, # Tikhonov regularization parameter
+ theta=1.0,
+ numIterations=5000,
+ isSavingEachIteration=True,
+ L=None,
+ withTumor=True,
+ device=None,
+ max_saves=5000,
+ show_logs=True,
+ smatrixType=SMatrixType.SELL,
+ k_security=0.8,
+ use_power_method=True,
+ auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
+ apply_positivity_clamp=True,
+ tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
+ use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
+ laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
+ ):
+ # try:
+ tumor_str = "WITH" if withTumor else "WITHOUT"
+ # Auto-select device and method
+ if device is None:
+ if torch.cuda.is_available() and check_gpu_memory(config.select_best_gpu(), calculate_memory_requirement(SMatrix, y), show_logs=show_logs):
+ device = torch.device(f"cuda:{config.select_best_gpu()}")
+ use_gpu = True
+ else:
+ device = torch.device("cpu")
+ use_gpu = False
+ else:
+ use_gpu = device.type == "cuda"
+ # Dispatch to the appropriate implementation
+ if use_gpu:
+ if smatrixType == SMatrixType.CSR:
+ raise NotImplementedError("GPU Chambolle Pock (LS-KL) with CSR not implemented.")
+ elif smatrixType == SMatrixType.SELL:
+ raise NotImplementedError("GPU Chambolle Pock (LS-KL) with SELL not implemented.")
+ elif smatrixType == SMatrixType.DENSE:
+ return CP_KL(SMatrix, y, alpha, theta, numIterations, isSavingEachIteration, L, tumor_str, device, max_saves, show_logs)
+ else:
+ raise ValueError("Unsupported SMatrixType for GPU Chambolle Pock (LS-KL).")
+ else:
+ raise NotImplementedError("CPU Chambolle Pock (LS-KL) not implemented.")
+
+
+
+
+ def CP_TV_dense(
  SMatrix,
  y,
  alpha=1e-1,
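Among the new parameters above, auto_alpha_gamma drives the auto-scaling rule alpha = gamma * data_term / tv_term, where data_term is the least-squares residual 0.5*||Ax - y||^2 at the back-projected starting image and tv_term is that image's total variation (this is how the SELL implementation later in this diff computes it). Below is a self-contained toy illustration of the rule; the TV function is a stand-in, since compute_TV_cpu is not shown in the diff.

    import numpy as np

    def tv_isotropic(img):
        # forward-difference isotropic TV (stand-in for compute_TV_cpu)
        dz = np.diff(img, axis=0, append=img[-1:, :])
        dx = np.diff(img, axis=1, append=img[:, -1:])
        return float(np.sum(np.sqrt(dz**2 + dx**2)))

    rng = np.random.default_rng(0)
    A = rng.standard_normal((200, 64)).astype(np.float32)   # toy forward operator
    x0 = rng.random((8, 8)).astype(np.float32)               # toy back-projected image (Z=8, X=8)
    y = A @ x0.ravel() + 0.01 * rng.standard_normal(200).astype(np.float32)

    gamma = 0.05                                              # auto_alpha_gamma default
    resid = A @ x0.ravel() - y
    data_term = 0.5 * float(resid @ resid)
    tv_term = tv_isotropic(x0) + 1e-12
    alpha = gamma * data_term / tv_term                       # TV weight that would be used
    print(f"auto alpha = {alpha:.3e}")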
@@ -21,6 +120,7 @@ def CP_TV
  withTumor=True,
  device=None,
  max_saves=5000,
+ show_logs=True,
  ):
  """
  Chambolle-Pock algorithm for Total Variation (TV) regularization.
@@ -92,10 +192,10 @@ def CP_TV
  # Description for progress bar
  tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
  device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
- description = f"AOT-BioMaps -- Primal/Dual Reconstruction (TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"

- # Main loop
- for iteration in trange(numIterations, desc=description):
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+ for it in iterator:
  # Update p (TV proximal step)
  grad_x = gradient(x_tilde.reshape(Z, X))
  p = proj_l2(p + sigma * grad_x, alpha)
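proj_l2 is imported from ReconTools and its body is not part of this diff. In TV-regularized Chambolle-Pock this step is normally a pointwise projection of the dual field onto l2 balls of radius alpha; the torch sketch below shows a common form of that projection, assuming the dual field is stacked as (2, Z, X).

    import torch

    def proj_l2_ball(p, alpha):
        # p: dual variable of shape (2, Z, X); project each pixel's 2-vector
        # onto the l2 ball of radius alpha (standard TV dual projection)
        norm = torch.sqrt(p[0] ** 2 + p[1] ** 2).clamp(min=1e-12)
        scale = torch.clamp(norm / alpha, min=1.0)
        return p / scale.unsqueeze(0)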
@@ -113,9 +213,9 @@
  x_tilde = x + theta * (x - x_old)

  # Save intermediate result if needed
- if isSavingEachIteration and iteration in save_indices:
+ if isSavingEachIteration and it in save_indices:
  I_reconMatrix.append(x.reshape(Z, X).clone() * (norm_y / norm_A))
- saved_indices.append(iteration)
+ saved_indices.append(it)

  # Return results
  if isSavingEachIteration:
@@ -123,6 +223,337 @@ def CP_TV
  else:
  return (x.reshape(Z, X) * (norm_y / norm_A)).cpu().numpy(), None

+ def CP_TV_Tikhonov_sparseSELL_pycuda(
+ SMatrix,
+ y,
+ alpha=None, # TV regularization parameter (if None, alpha is auto-scaled)
+ beta=1e-4, # Tikhonov regularization parameter
+ theta=1.0,
+ numIterations=2000,
+ isSavingEachIteration=True,
+ L=None,
+ tumor_str="",
+ device=None,
+ max_saves=2000,
+ show_logs=True,
+ k_security=0.8,
+ use_power_method=True,
+ auto_alpha_gamma=0.05, # gamma for auto alpha: alpha = gamma * data_term / tv_term
+ apply_positivity_clamp=True,
+ tikhonov_as_gradient=False, # if True, apply -tau*2*beta*x instead of prox multiplicative
+ use_laplacian=True, # enable Laplacian (Hessian scalar) penalty
+ laplacian_beta_scale=1.0 # multiply beta for laplacian term if you want separate scaling
+ ):
+
+ """
+ CP-TV + Tikhonov + Laplacian (Hessian scalar) penalty integrated.
+ Returns (I_reconMatrix, saved_indices) if isSavingEachIteration else (x_final, None).
+ """
+ # ----- begin main -----
+ if SMatrix.ctx:
+ SMatrix.ctx.push()
+
+ # prepare variables
+ dtype = np.float32
+ TN = int(SMatrix.N * SMatrix.T)
+ ZX = int(SMatrix.Z * SMatrix.X)
+ Z, X = SMatrix.Z, SMatrix.X
+ block_size = 256
+
+ # existing kernels
+ projection_kernel = SMatrix.sparse_mod.get_function("projection_kernel__SELL")
+ backprojection_kernel = SMatrix.sparse_mod.get_function("backprojection_kernel__SELL")
+ axpby_kernel = SMatrix.sparse_mod.get_function("vector_axpby_kernel")
+ minus_axpy_kernel = SMatrix.sparse_mod.get_function("vector_minus_axpy_kernel")
+ gradient_kernel = SMatrix.sparse_mod.get_function("gradient_kernel")
+ divergence_kernel = SMatrix.sparse_mod.get_function("divergence_kernel")
+ proj_tv_kernel = SMatrix.sparse_mod.get_function("proj_tv_kernel")
+
+ # optional kernels (laplacian & clamp)
+ has_laplacian = False
+ has_clamp_kernel = False
+ try:
+ laplacian_kernel = SMatrix.sparse_mod.get_function("laplacian_kernel")
+ laplacian_adj_kernel = SMatrix.sparse_mod.get_function("laplacian_adj_kernel")
+ has_laplacian = True
+ except Exception:
+ has_laplacian = False
+
+ try:
+ clamp_positive_kernel = SMatrix.sparse_mod.get_function("clamp_positive_kernel")
+ has_clamp_kernel = True
+ except Exception:
+ has_clamp_kernel = False
+
+ stream = drv.Stream()
+
+ # estimate L operator norm if needed
+ if use_power_method or L is None:
+ L_LS_sq = power_method_estimate_L__SELL(SMatrix, stream, n_it=20, block_size=block_size)
+ L_nabla_sq = 8.0
+ L_op_norm = np.sqrt(L_LS_sq + L_nabla_sq)
+ if L_op_norm < 1e-6:
+ L_op_norm = 1.0
+ else:
+ L_op_norm = L
+
+ tau = np.float32(k_security / L_op_norm)
+ sigma = np.float32(k_security / L_op_norm)
+
+ # prepare y and normalization
+ y = y.T.astype(dtype).reshape(-1)
+ maxy = float(np.max(np.abs(y))) if y.size > 0 else 0.0
+ if maxy > 0:
+ y_normed = (y / maxy).copy()
+ else:
+ y_normed = y.copy()
+
+ # GPU allocations
+ bufs = []
+ y_gpu = drv.mem_alloc(y_normed.nbytes); bufs.append(y_gpu)
+ drv.memcpy_htod_async(y_gpu, y_normed.T.flatten(), stream)
+
+ x_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_gpu)
+ drv.memset_d32_async(x_gpu, 0, ZX, stream)
+ x_old_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_old_gpu)
+ x_tilde_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(x_tilde_gpu)
+ drv.memcpy_dtod_async(x_tilde_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
+
+ p_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(p_gpu)
+ q_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(q_gpu)
+ drv.memset_d32_async(p_gpu, 0, 2 * ZX, stream)
+ drv.memset_d32_async(q_gpu, 0, TN, stream)
+
+ grad_gpu = drv.mem_alloc(2 * ZX * np.dtype(dtype).itemsize); bufs.append(grad_gpu)
+ div_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(div_gpu)
+ Ax_gpu = drv.mem_alloc(TN * np.dtype(dtype).itemsize); bufs.append(Ax_gpu)
+ ATq_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(ATq_gpu)
+ zero_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(zero_gpu)
+ drv.memset_d32_async(zero_gpu, 0, ZX, stream)
+
+ # Laplacian buffers (if enabled and kernel available)
+ use_lap = use_laplacian and has_laplacian and (beta > 0)
+ if use_lap:
+ lap_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(lap_gpu)
+ r_gpu = drv.mem_alloc(ZX * np.dtype(dtype).itemsize); bufs.append(r_gpu)
+ drv.memset_d32_async(r_gpu, 0, ZX, stream)
+ # scalar beta for laplacian (allow separate scale)
+ beta_lap = float(beta) * float(laplacian_beta_scale)
+ inv_1_plus_sigma_beta = np.float32(1.0 / (1.0 + float(sigma) * beta_lap))
+
+ # host buffers for logs
+ x_host = np.empty(ZX, dtype=dtype)
+ Ax_host = np.empty(TN, dtype=dtype)
+ q_host = np.empty(TN, dtype=dtype)
+ p_host = np.empty(2 * ZX, dtype=dtype)
+ ATq_host = np.empty(ZX, dtype=dtype)
+
+ # compute initial backprojection for auto-alpha
+ drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
+ backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+ y_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+ stream.synchronize()
+ drv.memcpy_dtoh(x_host, ATq_gpu)
+
+ # auto alpha if requested
+ if alpha is None:
+ drv.memcpy_htod_async(x_gpu, x_host, stream)
+ projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+ x_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+ stream.synchronize()
+ drv.memcpy_dtoh(Ax_host, Ax_gpu)
+ resid = Ax_host - y_normed[:TN]
+ data_term = 0.5 * float(np.dot(resid, resid))
+ tv_term = float(compute_TV_cpu(x_host, Z, X)) + 1e-12
+ alpha = float(auto_alpha_gamma * data_term / tv_term)
+ if show_logs:
+ print(f"[auto-alpha] data_term={data_term:.6e}, tv_term={tv_term:.6e}, alpha_set={alpha:.6e}")
+
+ # tikhonov prox multiplicative scale
+ if tikhonov_as_gradient:
+ tikh_scale = None
+ else:
+ tikh_scale = np.float32(1.0 / (1.0 + 2.0 * tau * beta)) if beta > 0 else np.float32(1.0)
+
+ # saving policy
+ if numIterations <= max_saves:
+ save_indices_all = list(range(0, numIterations + 1))
+ else:
+ step = max(1, numIterations // max_saves)
+ save_indices_all = list(range(0, numIterations + 1, step))
+
+ device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
+ if show_logs:
+ if (alpha is None or alpha == 0) and (beta is None or beta == 0):
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} lap_enabled={use_lap}")
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS) -- {tumor_str} -- {device_str}"
+ if alpha is None or alpha == 0:
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} beta={beta:.4e} lap_enabled={use_lap}")
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-Tikhonov) -- {tumor_str} -- {device_str}"
+ elif beta is None or beta == 0:
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV) -- {tumor_str} -- {device_str}"
+ else:
+ print(f"Parameters: L={L_op_norm:.6e} tau={tau:.3e} sigma={sigma:.3e} alpha={alpha:.4e} beta={beta:.4e} lap_enabled={use_lap}")
+ description = f"AOT-BioMaps -- Primal/Dual Reconstruction (LS-TV-Tikhonov) -- {tumor_str} -- {device_str}"
+
+ I_reconMatrix = []
+ saved_indices = []
+ if isSavingEachIteration and 0 in save_indices_all:
+ drv.memcpy_dtoh(x_host, x_gpu)
+ x0 = x_host.reshape((Z, X)).copy()
+ if maxy > 0:
+ x0 *= maxy
+ I_reconMatrix.append(x0)
+ saved_indices.append(0)
+
+ # main loop
+ try:
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+ for it in iterator:
+ # 1) dual p update (TV)
+ gradient_kernel(grad_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+ block=(block_size, 1, 1),
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+ stream=stream)
+ _call_axpby(axpby_kernel, p_gpu, p_gpu, grad_gpu, 1.0, sigma, 2 * ZX, stream, block_size)
+ proj_tv_kernel(p_gpu, np.float32(alpha), np.int32(ZX),
+ block=(block_size, 1, 1),
+ grid=((ZX + block_size - 1) // block_size, 1, 1),
+ stream=stream)
+
+ # 2) dual q update (data fidelity)
+ projection_kernel(Ax_gpu, SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+ x_tilde_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+ _call_axpby(axpby_kernel, Ax_gpu, Ax_gpu, y_gpu, 1.0, -1.0, TN, stream, block_size)
+ _call_axpby(axpby_kernel, q_gpu, q_gpu, Ax_gpu, 1.0 / (1.0 + sigma), sigma / (1.0 + sigma), TN, stream, block_size)
+
+ # optional Laplacian dual update
+ if use_lap:
+ # compute Laplacian of x_tilde -> lap_gpu
+ laplacian_kernel(lap_gpu, x_tilde_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+ block=(block_size, 1, 1),
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+ stream=stream)
+ # r = r + sigma * lap
+ _call_axpby(axpby_kernel, r_gpu, r_gpu, lap_gpu, 1.0, sigma, ZX, stream, block_size)
+ # r = r / (1 + sigma * beta_lap)
+ _call_axpby(axpby_kernel, r_gpu, r_gpu, zero_gpu, inv_1_plus_sigma_beta, 0.0, ZX, stream, block_size)
+
+ # 3) primal x update
+ drv.memcpy_dtod_async(x_old_gpu, x_gpu, ZX * np.dtype(dtype).itemsize, stream)
+ divergence_kernel(div_gpu, p_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+ block=(block_size, 1, 1),
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+ stream=stream)
+ drv.memset_d32_async(ATq_gpu, 0, ZX, stream)
+ backprojection_kernel(SMatrix.sell_values_gpu, SMatrix.sell_colinds_gpu, SMatrix.slice_ptr_gpu, SMatrix.slice_len_gpu,
+ q_gpu, ATq_gpu, np.int32(TN), np.int32(SMatrix.slice_height),
+ block=(block_size, 1, 1), grid=((TN + block_size - 1) // block_size, 1, 1), stream=stream)
+ # ATq - div
+ _call_minus_axpy(minus_axpy_kernel, ATq_gpu, div_gpu, 1.0, ZX, stream, block_size)
+
+ # if laplacian is used, add H^T r into ATq
+ if use_lap:
+ # compute laplacian_adj_kernel(temp, r)
+ # reuse grad_gpu as temporary if safe (its content used earlier, but not reused until later)
+ laplacian_adj_kernel(grad_gpu, r_gpu, np.int32(Z), np.int32(X), np.int32(ZX),
+ block=(block_size, 1, 1),
+ grid=((X + block_size - 1) // block_size, (Z + block_size - 1) // block_size, 1),
+ stream=stream)
+ # ATq_gpu += temp (grad_gpu)
+ _call_axpby(axpby_kernel, ATq_gpu, ATq_gpu, grad_gpu, 1.0, 1.0, ZX, stream, block_size)
+
+ # x = x_old - tau * ATq_buffer
+ _call_minus_axpy(minus_axpy_kernel, x_gpu, ATq_gpu, tau, ZX, stream, block_size)
+
+ # Tikhonov
+ if beta > 0:
+ if tikhonov_as_gradient:
+ mul = 1.0 - 2.0 * float(tau) * float(beta)
+ if mul <= 0.0:
+ # fallback to prox multiplicative stable
+ fallback_scale = np.float32(1.0 / (1.0 + 2.0 * float(tau) * float(beta)))
+ _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, fallback_scale, 0.0, ZX, stream, block_size)
+ else:
+ # x *= mul => implemented as axpby: out = 1* x + (mul-1)*x
+ _call_axpby(axpby_kernel, x_gpu, x_gpu, x_gpu, 1.0, np.float32(mul - 1.0), ZX, stream, block_size)
+ else:
+ _call_axpby(axpby_kernel, x_gpu, x_gpu, zero_gpu, tikh_scale, np.float32(0.0), ZX, stream, block_size)
+
+ # positivity clamp (prefer GPU kernel if available)
+ if apply_positivity_clamp:
+ if has_clamp_kernel:
+ # in-place clamp on GPU
+ clamp_positive_kernel(x_gpu, np.int32(ZX),
+ block=(block_size, 1, 1),
+ grid=((ZX + block_size - 1) // block_size, 1, 1),
+ stream=stream)
+ else:
+ # fallback CPU roundtrip (slower)
+ stream.synchronize()
+ drv.memcpy_dtoh(x_host, x_gpu)
+ np.maximum(x_host, 0.0, out=x_host)
+ drv.memcpy_htod_async(x_gpu, x_host, stream)
+
+ # extrapolation
+ _call_axpby(axpby_kernel, x_tilde_gpu, x_gpu, x_old_gpu, np.float32(1.0 + theta), np.float32(-theta), ZX, stream, block_size)
+
+ # saves
+ if isSavingEachIteration and (it + 1) in save_indices_all:
+ stream.synchronize()
+ drv.memcpy_dtoh(x_host, x_gpu)
+ x_saved = x_host.reshape((Z, X)).copy()
+ if maxy > 0:
+ x_saved *= maxy
+ I_reconMatrix.append(x_saved)
+ saved_indices.append(it + 1)
+
+ stream.synchronize()
+ drv.memcpy_dtoh(x_host, x_gpu)
+ x_final = x_host.reshape((Z, X)).copy()
+ if maxy > 0:
+ x_final *= maxy
+ if isSavingEachIteration and len(I_reconMatrix):
+ for i in range(len(I_reconMatrix)):
+ I_reconMatrix[i] *= maxy
+
+ # free buffers
+ for buff in bufs:
+ try:
+ buff.free()
+ except:
+ pass
+
+ if SMatrix.ctx:
+ SMatrix.ctx.pop()
+
+ if isSavingEachIteration:
+ return I_reconMatrix, saved_indices
+ else:
+ return x_final, None
+
+ except Exception as e:
+ # cleanup robustly
+ print("Error in CP_TV_Tikhonov+Lap (robust):", e)
+ try:
+ for buff in bufs:
+ try:
+ buff.free()
+ except:
+ pass
+ except:
+ pass
+ try:
+ if SMatrix and hasattr(SMatrix, 'ctx') and SMatrix.ctx:
+ SMatrix.ctx.pop()
+ except:
+ pass
+ raise
+

  def CP_KL(
  SMatrix,
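A detail worth noting in the SELL routine above: the step sizes are set to tau = sigma = k_security / L_op_norm with L_op_norm = sqrt(L_LS_sq + 8), where L_LS_sq is a power-method estimate of ||A||^2 and 8 is the usual bound on the squared norm of the 2D forward-difference gradient, so tau * sigma * ||K||^2 <= k_security^2 < 1, the standard Chambolle-Pock convergence condition. The sketch below reproduces that calculation with a plain NumPy power method; power_method_estimate_L__SELL itself operates on the GPU matrix and is not shown in this diff.

    import numpy as np

    def power_method_sq_norm(A, n_it=20):
        # estimate ||A||^2 = largest eigenvalue of A^T A by power iteration
        v = np.random.default_rng(0).standard_normal(A.shape[1])
        for _ in range(n_it):
            v = A.T @ (A @ v)
            v /= np.linalg.norm(v)
        return float(v @ (A.T @ (A @ v)))     # Rayleigh quotient of A^T A

    A = np.random.default_rng(1).standard_normal((500, 256)).astype(np.float32)
    L_LS_sq = power_method_sq_norm(A)
    L_nabla_sq = 8.0                           # bound on the 2D gradient operator
    L_op_norm = np.sqrt(L_LS_sq + L_nabla_sq)

    k_security = 0.8
    tau = sigma = k_security / L_op_norm
    assert tau * sigma * L_op_norm**2 < 1.0    # CP convergence condition holds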
@@ -132,9 +563,10 @@ def CP_KL
  numIterations=5000,
  isSavingEachIteration=True,
  L=None,
- withTumor=True,
+ tumor_str="",
  device=None,
  max_saves=5000,
+ show_logs=True,
  ):
  """
  Chambolle-Pock algorithm for Kullback-Leibler (KL) divergence regularization.
@@ -193,12 +625,11 @@ def CP_KL
  saved_indices = [0]

  # Description for progress bar
- tumor_str = "WITH TUMOR" if withTumor else "WITHOUT TUMOR"
  device_str = f"GPU no.{torch.cuda.current_device()}" if device.type == "cuda" else "CPU"
  description = f"AOT-BioMaps -- Primal/Dual Reconstruction (KL) α:{alpha:.4f} L:{L:.4f} -- {tumor_str} -- {device_str}"

- # Main loop
- for iteration in trange(numIterations, desc=description):
+ iterator = trange(numIterations, desc=description) if show_logs else range(numIterations)
+ for iteration in iterator:
  # Update q (proximal step for F*)
  q = prox_F_star(q + sigma * P(x_tilde) - sigma * y_flat, sigma, y_flat)

@@ -1,4 +1,3 @@
- # sparse_matrix_gpu.py
  import pycuda.driver as drv
  import numpy as np
  from pycuda.compiler import SourceModule
@@ -26,6 +25,14 @@ class SparseSMatrix_CSR:
  self.X = manip.AcousticFields[0].field.shape[2]
  self.block_rows = block_rows
  self.relative_threshold = relative_threshold
+
+ # --- FIX: resolve the .cubin path (located in AOT_Recon/) ---
+ # SparseSMatrix_CSR.py lives in AOT_Recon/AOT_SparseSMatrix/,
+ # so go up one directory to reach AOT_Recon/
+ cubin_parent_dir = os.path.dirname(os.path.dirname(__file__))
+ self.module_path = os.path.join(cubin_parent_dir, "AOT_biomaps_kernels.cubin")
+ # --- END FIX ---
+
  self.h_dense = None
  self.row_ptr = None
  self.row_ptr_gpu = None
@@ -41,22 +48,24 @@ class SparseSMatrix_CSR:
  def __exit__(self, exc_type, exc, tb):
  self.free()

- def load_precompiled_module(self, so_path="AOT_biomaps_kernels.cubin"):
+ def load_precompiled_module(self):
+ """
+ Load the precompiled CUDA module (.cubin) using the resolved path.
+ Removes the JIT compilation logic.
+ """
+ so_path = self.module_path # use the path resolved in __init__
+
+ if not os.path.exists(so_path):
+ raise FileNotFoundError(
+ f"The CUDA module {os.path.basename(so_path)} was not found at: {so_path}. "
+ "Make sure it is compiled and correctly placed."
+ )
+
  try:
- # If a PTX or cubin is provided via path
  self.sparse_mod = drv.module_from_file(so_path)
  print(f"✅ CUDA module loaded from {so_path}")
- except Exception:
- # Fallback: try to compile from bundled source (if available)
- src_path = os.path.join(os.path.dirname(__file__), 'AOT_biomaps_kernels.cu')
- if os.path.exists(src_path):
- print("JIT-compiling the CUDA kernel from source...")
- with open(src_path, 'r') as f:
- src = f.read()
- self.sparse_mod = SourceModule(src, no_extern_c=True)
- print("✅ Module JIT-compiled")
- else:
- raise
+ except Exception as e:
+ raise RuntimeError(f"The file {os.path.basename(so_path)} was found, but PyCUDA could not load it. Check compatibility.") from e

  def estimate_nnz_cpu(self):
  """Quick (non-exact) estimate — useful if you only need a rough estimate.
@@ -80,12 +89,10 @@ class SparseSMatrix_CSR:
  bytes_float = np.dtype(np.float32).itemsize

  # Load module
- if kernel_module_path:
- self.load_precompiled_module(kernel_module_path)
- else:
- self.load_precompiled_module('AOT_biomaps_kernels.cubin')
+ # FIX: always load from self.module_path (resolved)
+ self.load_precompiled_module()

- count_nnz_kernel = self.sparse_mod.get_function('count_nnz_per_row_kernel')
+ count_nnz_kernel = self.sparse_mod.get_function('count_nnz_rows_kernel')
  fill_csr_kernel = self.sparse_mod.get_function('fill_kernel__CSR')

  # allocate host row_ptr
@@ -110,6 +117,7 @@ class SparseSMatrix_CSR:
  drv.memcpy_htod(dense_block_gpu, dense_block_host)

  grid = ((current_rows + block_size - 1) // block_size, 1, 1)
+ # Note: Assuming 'count_nnz_per_row_kernel' is the correct name (verified by user in prior steps)
  count_nnz_kernel(dense_block_gpu, row_nnz_gpu,
  np.int32(current_rows), np.int32(num_cols),
  np.float32(self.relative_threshold),
@@ -182,7 +190,11 @@ class SparseSMatrix_CSR:
  drv.memset_d32(col_sum_gpu, 0, ZX)

  # 2) Get the kernel
- acc_kernel = self.sparse_mod.get_function("accumulate_columns_atomic__CSR")
+ # FIX: Use the generic name 'accumulate_columns_atomic' as in SELL (if the binary is shared)
+ # If the developer uses the __CSR convention, keep it.
+ # Based on our SELL history, the name is probably the generic 'accumulate_columns_atomic'.
+ # I will assume the name is generic to avoid a LogicError here as well.
+ acc_kernel = self.sparse_mod.get_function("accumulate_columns_atomic")

  # 3) Launch the kernel
  threads = 256
@@ -210,12 +222,23 @@ class SparseSMatrix_CSR:
  drv.memcpy_htod(self.norm_factor_inv_gpu, self.norm_factor_inv)

  def getMatrixSize(self):
+ """
+ Returns the total size of the CSR matrix in GB (by summing GPU memory).
+ Uses the stored size attributes to work around the DeviceAllocation AttributeError.
+ """
+ # Note: the caller must make sure self.row_ptr exists before this call.
  if self.row_ptr is None:
  return {"error": "The sparse matrix has not been allocated yet."}
- total = (self.row_ptr.nbytes if self.row_ptr is not None else 0) + \
- (self.h_col_ind.nbytes if self.h_col_ind is not None else 0) + \
- (self.h_values.nbytes if self.h_values is not None else 0)
- return total / (1024**3)
+
+ total_bytes = 0
+
+ # Sum the stored sizes (computed and assigned in allocate and compute_norm_factor_from_csr)
+ total_bytes += getattr(self, 'row_ptr_gpu_size', 0)
+ total_bytes += getattr(self, 'col_ind_gpu_size', 0)
+ total_bytes += getattr(self, 'values_gpu_size', 0)
+ total_bytes += getattr(self, 'norm_factor_inv_gpu_size', 0)
+
+ return total_bytes / (1024**3)

  def free(self):
  try:
@@ -248,5 +271,4 @@ class SparseSMatrix_CSR:
  num_cols = int(self.Z * self.X)
  total_nnz = int(self.row_ptr[-1])
  density = total_nnz / (num_rows * num_cols)
- return density
-
+ return density
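The rewritten getMatrixSize sums attributes such as row_ptr_gpu_size because a PyCUDA DeviceAllocation does not expose a byte count of its own; the sizes have to be recorded when the buffers are allocated (the diff's own comment says this happens in allocate and compute_norm_factor_from_csr, which are not shown here). Below is a minimal sketch of that bookkeeping pattern with placeholder sizes; the attribute names mirror those read by getattr above.

    import numpy as np
    import pycuda.autoinit  # creates a CUDA context (assumes a GPU is present)
    import pycuda.driver as drv

    row_ptr = np.zeros(1024 + 1, dtype=np.int32)     # host CSR row pointer (example size)
    row_ptr_gpu = drv.mem_alloc(row_ptr.nbytes)      # DeviceAllocation has no .nbytes ...
    row_ptr_gpu_size = row_ptr.nbytes                # ... so the byte count is stored alongside it
    drv.memcpy_htod(row_ptr_gpu, row_ptr)

    total_gb = row_ptr_gpu_size / (1024 ** 3)        # same accounting as getMatrixSize()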