diffct 1.2.6__tar.gz → 1.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {diffct-1.2.6 → diffct-1.2.7}/PKG-INFO +27 -1
  2. {diffct-1.2.6 → diffct-1.2.7}/README.md +26 -0
  3. {diffct-1.2.6 → diffct-1.2.7}/diffct/differentiable.py +219 -175
  4. {diffct-1.2.6 → diffct-1.2.7}/pyproject.toml +2 -3
  5. {diffct-1.2.6 → diffct-1.2.7}/.github/workflows/docs.yml +0 -0
  6. {diffct-1.2.6 → diffct-1.2.7}/.github/workflows/release.yml +0 -0
  7. {diffct-1.2.6 → diffct-1.2.7}/.gitignore +0 -0
  8. {diffct-1.2.6 → diffct-1.2.7}/LICENSE +0 -0
  9. {diffct-1.2.6 → diffct-1.2.7}/diffct/__init__.py +0 -0
  10. {diffct-1.2.6 → diffct-1.2.7}/docs/Makefile +0 -0
  11. {diffct-1.2.6 → diffct-1.2.7}/docs/source/_static/.gitkeep +0 -0
  12. {diffct-1.2.6 → diffct-1.2.7}/docs/source/api.rst +0 -0
  13. {diffct-1.2.6 → diffct-1.2.7}/docs/source/conf.py +0 -0
  14. {diffct-1.2.6 → diffct-1.2.7}/docs/source/examples.rst +0 -0
  15. {diffct-1.2.6 → diffct-1.2.7}/docs/source/fbp_fan_example.rst +0 -0
  16. {diffct-1.2.6 → diffct-1.2.7}/docs/source/fbp_parallel_example.rst +0 -0
  17. {diffct-1.2.6 → diffct-1.2.7}/docs/source/fdk_cone_example.rst +0 -0
  18. {diffct-1.2.6 → diffct-1.2.7}/docs/source/getting_started.rst +0 -0
  19. {diffct-1.2.6 → diffct-1.2.7}/docs/source/index.rst +0 -0
  20. {diffct-1.2.6 → diffct-1.2.7}/docs/source/iterative_reco_cone_example.rst +0 -0
  21. {diffct-1.2.6 → diffct-1.2.7}/docs/source/iterative_reco_fan_example.rst +0 -0
  22. {diffct-1.2.6 → diffct-1.2.7}/docs/source/iterative_reco_parallel_example.rst +0 -0
  23. {diffct-1.2.6 → diffct-1.2.7}/examples/fbp_fan.py +0 -0
  24. {diffct-1.2.6 → diffct-1.2.7}/examples/fbp_parallel.py +0 -0
  25. {diffct-1.2.6 → diffct-1.2.7}/examples/fdk_cone.py +0 -0
  26. {diffct-1.2.6 → diffct-1.2.7}/examples/iterative_reco_cone.py +0 -0
  27. {diffct-1.2.6 → diffct-1.2.7}/examples/iterative_reco_fan.py +0 -0
  28. {diffct-1.2.6 → diffct-1.2.7}/examples/iterative_reco_parallel.py +0 -0
  29. {diffct-1.2.6 → diffct-1.2.7}/requirements.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffct
3
- Version: 1.2.6
3
+ Version: 1.2.7
4
4
  Summary: A CUDA-based library for computed tomography (CT) projection and reconstruction with differentiable operators
5
5
  Project-URL: Homepage, https://github.com/sypsyp97/diffct
6
6
  Author-email: Yipeng Sun <yipeng.sun@fau.de>
@@ -69,7 +69,33 @@ diffct/
69
69
 
70
70
  ### Installation
71
71
 
72
+ **CUDA 12:**
72
73
  ```bash
74
+ # Create and activate conda environment
75
+ conda create -n diffct python=3.12
76
+ conda activate diffct
77
+
78
+ # Install CUDA Toolkit, PyTorch, and Numba
79
+ conda install nvidia/label/cuda-12.8.1::cuda-toolkit
80
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
81
+ pip install numba-cuda[cu12]
82
+
83
+ # Install diffct
84
+ pip install diffct
85
+ ```
86
+
87
+ **CUDA 11:**
88
+ ```bash
89
+ # Create and activate conda environment
90
+ conda create -n diffct python=3.12
91
+ conda activate diffct
92
+
93
+ # Install CUDA Toolkit, PyTorch, and Numba
94
+ conda install nvidia/label/cuda-11.8.0::cuda-toolkit
95
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
96
+ pip install numba-cuda[cu11]
97
+
98
+ # Install diffct
73
99
  pip install diffct
74
100
  ```
75
101
 
@@ -52,7 +52,33 @@ diffct/
52
52
 
53
53
  ### Installation
54
54
 
55
+ **CUDA 12:**
55
56
  ```bash
57
+ # Create and activate conda environment
58
+ conda create -n diffct python=3.12
59
+ conda activate diffct
60
+
61
+ # Install CUDA Toolkit, PyTorch, and Numba
62
+ conda install nvidia/label/cuda-12.8.1::cuda-toolkit
63
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
64
+ pip install numba-cuda[cu12]
65
+
66
+ # Install diffct
67
+ pip install diffct
68
+ ```
69
+
70
+ **CUDA 11:**
71
+ ```bash
72
+ # Create and activate conda environment
73
+ conda create -n diffct python=3.12
74
+ conda activate diffct
75
+
76
+ # Install CUDA Toolkit, PyTorch, and Numba
77
+ conda install nvidia/label/cuda-11.8.0::cuda-toolkit
78
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
79
+ pip install numba-cuda[cu11]
80
+
81
+ # Install diffct
56
82
  pip install diffct
57
83
  ```
58
84
 
@@ -2,6 +2,7 @@ import math
2
2
  import numpy as np
3
3
  import torch
4
4
  from numba import cuda
5
+ from functools import lru_cache
5
6
 
6
7
  # ---------------------------------------------------------------------------
7
8
  # Global settings & helpers
@@ -19,8 +20,7 @@ _TPB_3D = (8, 8, 8)
19
20
  # Trades numerical precision for performance in ray-tracing calculations
20
21
  # Safe for CT reconstruction where slight precision loss is acceptable for speed gains
21
22
  _FASTMATH_DECORATOR = cuda.jit(cache=True, fastmath=True)
22
- # Disable fastmath for backward kernels to ensure gradient correctness
23
- _NON_FASTMATH_DECORATOR = cuda.jit(cache=True, fastmath=False)
23
+
24
24
  _INF = _DTYPE(np.inf)
25
25
  _EPSILON = _DTYPE(1e-6)
26
26
  # === Device Management Utilities ===
@@ -70,8 +70,8 @@ class DeviceManager:
70
70
  ... )
71
71
  tensor([1, 2, 3], device='cuda:0')
72
72
  """
73
- if hasattr(tensor, "to"):
74
- return tensor if tensor.device == device else tensor.to(device)
73
+ if hasattr(tensor, "to") and tensor.device != device:
74
+ return tensor.to(device)
75
75
  return tensor
76
76
 
77
77
  # === PyTorch-CUDA Bridge ===
@@ -110,6 +110,7 @@ class TorchCUDABridge:
110
110
 
111
111
 
112
112
  # === GPU-aware Trigonometric Table Generation ===
113
+ @lru_cache(maxsize=2048)
113
114
  def _trig_tables(angles, dtype=_DTYPE, device=None):
114
115
  """Compute cosine and sine tables for input angles.
115
116
 
@@ -139,9 +140,11 @@ def _trig_tables(angles, dtype=_DTYPE, device=None):
139
140
  """
140
141
  if isinstance(angles, torch.Tensor):
141
142
  device = angles.device if device is None else device
142
- cos = torch.cos(angles).to(dtype=dtype)
143
- sin = torch.sin(angles).to(dtype=dtype)
144
- return cos.to(device), sin.to(device)
143
+ # Cast/move angles to the target dtype and device once, then compute cos and sin from that tensor
144
+ angles_device = angles.to(dtype=dtype, device=device)
145
+ cos = torch.cos(angles_device)
146
+ sin = torch.sin(angles_device)
147
+ return cos, sin
145
148
  else:
146
149
  # fallback for non-tensor inputs: compute via PyTorch on CPU for consistency
147
150
  # Determine desired torch dtype
@@ -153,7 +156,7 @@ def _trig_tables(angles, dtype=_DTYPE, device=None):
153
156
  np.float64: torch.float64,
154
157
  }
155
158
  torch_dtype = _NP_TO_TORCH.get(dtype, torch.float32)
156
- # Convert input angles to a CPU torch tensor
159
+ # Convert input angles to a CPU torch tensor, then compute cos and sin from it
157
160
  angles_cpu = torch.tensor(angles, dtype=torch_dtype)
158
161
  cos_cpu = torch.cos(angles_cpu)
159
162
  sin_cpu = torch.sin(angles_cpu)
@@ -182,18 +185,23 @@ def _validate_3d_memory_layout(tensor, expected_order='DHW'):
182
185
  ValueError
183
186
  If tensor has unexpected memory layout or is non-contiguous
184
187
  """
185
- if len(tensor.shape) != 3:
186
- raise ValueError(f"Expected 3D tensor, got {len(tensor.shape)}D")
187
-
188
- # Check if tensor is contiguous to avoid memory duplication
189
- if not tensor.is_contiguous():
190
- raise ValueError(
191
- "Input tensor must be contiguous. Call .contiguous() before passing to "
192
- "cone beam functions to avoid memory duplication and ensure correct results."
193
- )
188
+ shape = tensor.shape
189
+ if len(shape) != 3:
190
+ raise ValueError(f"Expected 3D tensor, got {len(shape)}D")
194
191
 
192
+ # Early return for common case - contiguous tensor with expected ordering
193
+ if tensor.is_contiguous() and expected_order in ('DHW', 'VHW'):
194
+ # For DHW and VHW, the expected order matches memory layout when contiguous
195
+ return
196
+
195
197
  # Only check memory order for DHW and VHW, not for internal WHD layout
196
198
  if expected_order in ('DHW', 'VHW'):
199
+ if not tensor.is_contiguous():
200
+ raise ValueError(
201
+ "Input tensor must be contiguous. Call .contiguous() before passing to "
202
+ "cone beam functions to avoid memory duplication and ensure correct results."
203
+ )
204
+
197
205
  strides = tensor.stride()
198
206
  order_mapping = {
199
207
  'DHW': (0, 1, 2), # Depth, Height, Width
@@ -210,15 +218,15 @@ def _validate_3d_memory_layout(tensor, expected_order='DHW'):
210
218
  if actual_order != expected_stride_order:
211
219
  # Create appropriate error message based on context
212
220
  if expected_order == 'VHW':
213
- actual_str = f"({tensor.shape[0]}, {tensor.shape[1]}, {tensor.shape[2]})"
221
+ actual_str = f"({shape[0]}, {shape[1]}, {shape[2]})"
214
222
  expected_str = "(Views, Height, Width)"
215
223
  fix_str = "ensure your sinogram has shape (num_views, det_v, det_u)"
216
224
  elif expected_order == 'DHW':
217
- actual_str = f"({tensor.shape[0]}, {tensor.shape[1]}, {tensor.shape[2]})"
225
+ actual_str = f"({shape[0]}, {shape[1]}, {shape[2]})"
218
226
  expected_str = "(Depth, Height, Width)"
219
227
  fix_str = "ensure your volume has shape (D, H, W)"
220
228
  else:
221
- actual_str = str(tuple(tensor.shape))
229
+ actual_str = str(tuple(shape))
222
230
  expected_str = expected_order
223
231
  fix_str = "check tensor dimensions"
224
232
 
@@ -316,7 +324,7 @@ def _parallel_2d_forward_kernel(
316
324
  ):
317
325
  """Compute the 2D parallel beam forward projection.
318
326
 
319
- This CUDA kernel implements the Siddon-Joseph ray-tracing algorithm for
327
+ This CUDA kernel implements the Siddon ray-tracing method with interpolation for
320
328
  2D parallel beam forward projection.
321
329
 
322
330
  Parameters
@@ -348,7 +356,7 @@ def _parallel_2d_forward_kernel(
348
356
 
349
357
  Notes
350
358
  -----
351
- The Siddon-Joseph algorithm provides accurate ray-volume intersection by:
359
+ The Siddon method with interpolation provides accurate ray-volume intersection by:
352
360
  - Calculating ray-volume boundary intersections to define traversal limits.
353
361
  - Iterating through voxels along the ray path via parametric equations.
354
362
  - Determining bilinear interpolation weights for sub-voxel sampling.
@@ -407,7 +415,7 @@ def _parallel_2d_forward_kernel(
407
415
  if t_min >= t_max:
408
416
  d_sino[iang, idet] = 0.0; return
409
417
 
410
- # === SIDDON-JOSEPH VOXEL TRAVERSAL INITIALIZATION ===
418
+ # === SIDDON METHOD VOXEL TRAVERSAL INITIALIZATION ===
411
419
  accum = 0.0 # Accumulated projection value along ray
412
420
  t = t_min # Current ray parameter (distance from ray start)
413
421
 
@@ -417,12 +425,15 @@ def _parallel_2d_forward_kernel(
417
425
 
418
426
  # Determine traversal direction and step sizes for each axis
419
427
  step_x, step_y = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1) # Voxel stepping direction
420
- dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF # Parameter increment to cross one voxel in x
421
- dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF # Parameter increment to cross one voxel in y
422
-
423
- # Calculate parameter values for next voxel boundary crossings
424
- tx = ((ix + (step_x > 0)) - cx - pnt_x) / dir_x if abs(dir_x) > _EPSILON else _INF # Next x-boundary crossing
425
- ty = ((iy + (step_y > 0)) - cy - pnt_y) / dir_y if abs(dir_y) > _EPSILON else _INF # Next y-boundary crossing
428
+ # Hoist inverse directions to reduce divisions and branches
429
+ inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
430
+ inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
431
+ dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF
432
+ dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF
433
+
434
+ # Calculate parameter values for next voxel boundary crossings using inv_dir_*
435
+ tx = ((ix + (step_x > 0)) - cx - pnt_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
436
+ ty = ((iy + (step_y > 0)) - cy - pnt_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
426
437
 
427
438
  # === MAIN RAY TRAVERSAL LOOP ===
428
439
  # Step through voxels along ray path, accumulating weighted contributions
@@ -437,9 +448,10 @@ def _parallel_2d_forward_kernel(
437
448
  # === BILINEAR INTERPOLATION SAMPLING ===
438
449
  # Sample volume at ray segment midpoint for accurate integration
439
450
  # Mathematical basis: Midpoint rule for numerical integration along ray segments
440
- mid_x = pnt_x + (t + seg_len * 0.5) * dir_x + cx # Midpoint x-coordinate in image space
441
- mid_y = pnt_y + (t + seg_len * 0.5) * dir_y + cy # Midpoint y-coordinate in image space
442
-
451
+ t_mid = t + seg_len * 0.5
452
+ mid_x = pnt_x + t_mid * dir_x + cx # Midpoint x-coordinate in image space
453
+ mid_y = pnt_y + t_mid * dir_y + cy # Midpoint y-coordinate in image space
454
+
443
455
  # Convert continuous coordinates to discrete voxel indices and fractional weights
444
456
  # Floor operation gives base voxel index, fractional part gives interpolation weights
445
457
  ix0, iy0 = int(math.floor(mid_x)), int(math.floor(mid_y)) # Base voxel indices (bottom-left corner)
@@ -476,7 +488,7 @@ def _parallel_2d_forward_kernel(
476
488
 
477
489
  d_sino[iang, idet] = accum
478
490
 
479
- @_NON_FASTMATH_DECORATOR
491
+ @_FASTMATH_DECORATOR
480
492
  def _parallel_2d_backward_kernel(
481
493
  d_sino, n_ang, n_det,
482
494
  d_image, Nx, Ny,
@@ -484,8 +496,8 @@ def _parallel_2d_backward_kernel(
484
496
  ):
485
497
  """Compute the 2D parallel beam backprojection.
486
498
 
487
- This CUDA kernel implements the Siddon-Joseph algorithm for 2D parallel
488
- beam backprojection.
499
+ This CUDA kernel implements the Siddon ray-tracing method with interpolation for
500
+ 2D parallel beam backprojection.
489
501
 
490
502
  Parameters
491
503
  ----------
@@ -549,16 +561,18 @@ def _parallel_2d_backward_kernel(
549
561
 
550
562
  if t_min >= t_max: return
551
563
 
552
- # === SIDDON-JOSEPH TRAVERSAL INITIALIZATION ===
564
+ # === SIDDON METHOD TRAVERSAL INITIALIZATION ===
553
565
  t = t_min
554
566
  ix = int(math.floor(pnt_x + t * dir_x + cx))
555
567
  iy = int(math.floor(pnt_y + t * dir_y + cy))
556
568
 
557
569
  step_x, step_y = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1)
558
- dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF
559
- dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF
560
- tx = ((ix + (step_x > 0)) - cx - pnt_x) / dir_x if abs(dir_x) > _EPSILON else _INF
561
- ty = ((iy + (step_y > 0)) - cy - pnt_y) / dir_y if abs(dir_y) > _EPSILON else _INF
570
+ inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
571
+ inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
572
+ dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF
573
+ dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF
574
+ tx = ((ix + (step_x > 0)) - cx - pnt_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
575
+ ty = ((iy + (step_y > 0)) - cy - pnt_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
562
576
 
563
577
  # === BACKPROJECTION TRAVERSAL LOOP ===
564
578
  # Distribute sinogram value along ray path using bilinear interpolation
@@ -568,8 +582,9 @@ def _parallel_2d_backward_kernel(
568
582
  seg_len = t_next - t
569
583
  if seg_len > _EPSILON:
570
584
  # Sample at ray segment midpoint (same as forward projection)
571
- mid_x = pnt_x + (t + seg_len * 0.5) * dir_x + cx
572
- mid_y = pnt_y + (t + seg_len * 0.5) * dir_y + cy
585
+ t_mid = t + seg_len * 0.5
586
+ mid_x = pnt_x + t_mid * dir_x + cx
587
+ mid_y = pnt_y + t_mid * dir_y + cy
573
588
  ix0, iy0 = int(math.floor(mid_x)), int(math.floor(mid_y))
574
589
  dx, dy = mid_x - ix0, mid_y - iy0
575
590
 
@@ -585,10 +600,12 @@ def _parallel_2d_backward_kernel(
585
600
  # Performance impact: Atomic operations are slower than regular writes but necessary for correctness
586
601
  # Memory access pattern: Global memory atomics with potential write contention on shared pixels, but unavoidable
587
602
  cval = val * seg_len # Contribution value for this ray segment
588
- cuda.atomic.add(d_image, (iy0, ix0), cval * (1 - dx) * (1 - dy))
589
- cuda.atomic.add(d_image, (iy0, ix0 + 1), cval * dx * (1 - dy))
590
- cuda.atomic.add(d_image, (iy0 + 1, ix0), cval * (1 - dx) * dy)
591
- cuda.atomic.add(d_image, (iy0 + 1, ix0 + 1), cval * dx * dy)
603
+ one_minus_dx = 1.0 - dx
604
+ one_minus_dy = 1.0 - dy
605
+ cuda.atomic.add(d_image, (iy0, ix0), cval * one_minus_dx * one_minus_dy)
606
+ cuda.atomic.add(d_image, (iy0, ix0 + 1), cval * dx * one_minus_dy)
607
+ cuda.atomic.add(d_image, (iy0 + 1, ix0), cval * one_minus_dx * dy)
608
+ cuda.atomic.add(d_image, (iy0 + 1, ix0 + 1), cval * dx * dy)
592
609
 
593
610
  # Advance to next voxel (identical logic to forward projection)
594
611
  if tx <= ty:
@@ -613,8 +630,8 @@ def _fan_2d_forward_kernel(
613
630
  ):
614
631
  """Compute the 2D fan beam forward projection.
615
632
 
616
- This CUDA kernel implements the Siddon-Joseph algorithm for 2D fan beam
617
- forward projection.
633
+ This CUDA kernel implements the Siddon ray-tracing method with interpolation for
634
+ 2D fan beam forward projection.
618
635
 
619
636
  Parameters
620
637
  ----------
@@ -704,7 +721,7 @@ def _fan_2d_forward_kernel(
704
721
  if t_min >= t_max: # No valid intersection
705
722
  d_sino[iang, idet] = 0.0; return
706
723
 
707
- # === SIDDON-JOSEPH TRAVERSAL (same algorithm as parallel beam) ===
724
+ # === SIDDON METHOD TRAVERSAL (same algorithm as parallel beam) ===
708
725
  accum = 0.0 # Accumulated projection value
709
726
  t = t_min # Current ray parameter
710
727
 
@@ -714,10 +731,12 @@ def _fan_2d_forward_kernel(
714
731
 
715
732
  # Traversal parameters (identical to parallel beam implementation)
716
733
  step_x, step_y = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1)
717
- dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF
718
- dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF
719
- tx = ((ix + (step_x > 0)) - cx - src_x) / dir_x if abs(dir_x) > _EPSILON else _INF
720
- ty = ((iy + (step_y > 0)) - cy - src_y) / dir_y if abs(dir_y) > _EPSILON else _INF
734
+ inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
735
+ inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
736
+ dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF
737
+ dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF
738
+ tx = ((ix + (step_x > 0)) - cx - src_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
739
+ ty = ((iy + (step_y > 0)) - cy - src_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
721
740
 
722
741
  # Main traversal loop with bilinear interpolation (identical to parallel beam)
723
742
  while t < t_max:
@@ -726,8 +745,9 @@ def _fan_2d_forward_kernel(
726
745
  seg_len = t_next - t
727
746
  if seg_len > _EPSILON:
728
747
  # Sample at midpoint using source as ray origin
729
- mid_x = src_x + (t + seg_len * 0.5) * dir_x + cx
730
- mid_y = src_y + (t + seg_len * 0.5) * dir_y + cy
748
+ t_mid = t + seg_len * 0.5
749
+ mid_x = src_x + t_mid * dir_x + cx
750
+ mid_y = src_y + t_mid * dir_y + cy
731
751
  ix0, iy0 = int(math.floor(mid_x)), int(math.floor(mid_y))
732
752
  dx, dy = mid_x - ix0, mid_y - iy0
733
753
 
@@ -756,7 +776,7 @@ def _fan_2d_forward_kernel(
756
776
 
757
777
  d_sino[iang, idet] = accum
758
778
 
759
- @_NON_FASTMATH_DECORATOR
779
+ @_FASTMATH_DECORATOR
760
780
  def _fan_2d_backward_kernel(
761
781
  d_sino, n_ang, n_det,
762
782
  d_image, Nx, Ny,
@@ -765,8 +785,8 @@ def _fan_2d_backward_kernel(
765
785
  ):
766
786
  """Compute the 2D fan beam backprojection.
767
787
 
768
- This CUDA kernel implements the Siddon-Joseph algorithm for 2D fan beam
769
- backprojection.
788
+ This CUDA kernel implements the Siddon ray-tracing method with interpolation for
789
+ 2D fan beam backprojection.
770
790
 
771
791
  Parameters
772
792
  ----------
@@ -851,16 +871,18 @@ def _fan_2d_backward_kernel(
851
871
 
852
872
  if t_min >= t_max: return
853
873
 
854
- # === SIDDON-JOSEPH TRAVERSAL INITIALIZATION ===
874
+ # === SIDDON METHOD TRAVERSAL INITIALIZATION ===
855
875
  t = t_min
856
876
  ix = int(math.floor(src_x + t * dir_x + cx))
857
877
  iy = int(math.floor(src_y + t * dir_y + cy))
858
878
 
859
879
  step_x, step_y = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1)
860
- dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF
861
- dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF
862
- tx = ((ix + (step_x > 0)) - cx - src_x) / dir_x if abs(dir_x) > _EPSILON else _INF
863
- ty = ((iy + (step_y > 0)) - cy - src_y) / dir_y if abs(dir_y) > _EPSILON else _INF
880
+ inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
881
+ inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
882
+ dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF
883
+ dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF
884
+ tx = ((ix + (step_x > 0)) - cx - src_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
885
+ ty = ((iy + (step_y > 0)) - cy - src_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
864
886
 
865
887
  # === FAN BEAM BACKPROJECTION TRAVERSAL LOOP ===
866
888
  # Distribute sinogram value along divergent ray path using bilinear interpolation
@@ -870,8 +892,9 @@ def _fan_2d_backward_kernel(
870
892
  seg_len = t_next - t
871
893
  if seg_len > _EPSILON:
872
894
  # Sample at ray segment midpoint using source as ray origin
873
- mid_x = src_x + (t + seg_len * 0.5) * dir_x + cx
874
- mid_y = src_y + (t + seg_len * 0.5) * dir_y + cy
895
+ t_mid = t + seg_len * 0.5
896
+ mid_x = src_x + t_mid * dir_x + cx
897
+ mid_y = src_y + t_mid * dir_y + cy
875
898
  ix0, iy0 = int(math.floor(mid_x)), int(math.floor(mid_y))
876
899
  dx, dy = mid_x - ix0, mid_y - iy0
877
900
 
@@ -886,10 +909,12 @@ def _fan_2d_backward_kernel(
886
909
  # Atomic operations prevent race conditions when multiple divergent rays write to same voxel
887
910
  # Performance consideration: Fan beam geometry may have more atomic contention than parallel beam
888
911
  cval = val * seg_len # Contribution value for this ray segment
889
- cuda.atomic.add(d_image, (iy0, ix0), cval * (1 - dx) * (1 - dy))
890
- cuda.atomic.add(d_image, (iy0, ix0 + 1), cval * dx * (1 - dy))
891
- cuda.atomic.add(d_image, (iy0 + 1, ix0), cval * (1 - dx) * dy)
892
- cuda.atomic.add(d_image, (iy0 + 1, ix0 + 1), cval * dx * dy)
912
+ one_minus_dx = 1.0 - dx
913
+ one_minus_dy = 1.0 - dy
914
+ cuda.atomic.add(d_image, (iy0, ix0), cval * one_minus_dx * one_minus_dy)
915
+ cuda.atomic.add(d_image, (iy0, ix0 + 1), cval * dx * one_minus_dy)
916
+ cuda.atomic.add(d_image, (iy0 + 1, ix0), cval * one_minus_dx * dy)
917
+ cuda.atomic.add(d_image, (iy0 + 1, ix0 + 1), cval * dx * dy)
893
918
 
894
919
  # === VOXEL BOUNDARY CROSSING LOGIC ===
895
920
  # Advance to next voxel based on which boundary is crossed first
@@ -915,8 +940,8 @@ def _cone_3d_forward_kernel(
915
940
  ):
916
941
  """Compute the 3D cone-beam forward projection.
917
942
 
918
- This CUDA kernel implements the Siddon-Joseph algorithm for 3D cone-beam
919
- forward projection.
943
+ This CUDA kernel implements the Siddon ray-tracing method with interpolation for
944
+ 3D cone-beam forward projection.
920
945
 
921
946
  Parameters
922
947
  ----------
@@ -1025,7 +1050,7 @@ def _cone_3d_forward_kernel(
1025
1050
  if t_min >= t_max: # No valid 3D intersection
1026
1051
  d_sino[iview, iu, iv] = 0.0; return
1027
1052
 
1028
- # === 3D SIDDON-JOSEPH TRAVERSAL INITIALIZATION ===
1053
+ # === 3D SIDDON METHOD TRAVERSAL INITIALIZATION ===
1029
1054
  accum = 0.0 # Accumulated projection value
1030
1055
  t = t_min # Current ray parameter
1031
1056
 
@@ -1036,14 +1061,17 @@ def _cone_3d_forward_kernel(
1036
1061
 
1037
1062
  # 3D traversal parameters (extends 2D algorithm)
1038
1063
  step_x, step_y, step_z = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1), (1 if dir_z >= 0 else -1)
1039
- dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF # Parameter increment per x-voxel
1040
- dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF # Parameter increment per y-voxel
1041
- dt_z = abs(1.0 / dir_z) if abs(dir_z) > _EPSILON else _INF # Parameter increment per z-voxel
1042
-
1064
+ inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
1065
+ inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
1066
+ inv_dir_z = (1.0 / dir_z) if abs(dir_z) > _EPSILON else 0.0
1067
+ dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF # Parameter increment per x-voxel
1068
+ dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF # Parameter increment per y-voxel
1069
+ dt_z = abs(inv_dir_z) if abs(dir_z) > _EPSILON else _INF # Parameter increment per z-voxel
1070
+
1043
1071
  # Calculate parameter values for next 3D voxel boundary crossings
1044
- tx = ((ix + (step_x > 0)) - cx - src_x) / dir_x if abs(dir_x) > _EPSILON else _INF
1045
- ty = ((iy + (step_y > 0)) - cy - src_y) / dir_y if abs(dir_y) > _EPSILON else _INF
1046
- tz = ((iz + (step_z > 0)) - cz - src_z) / dir_z if abs(dir_z) > _EPSILON else _INF
1072
+ tx = ((ix + (step_x > 0)) - cx - src_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
1073
+ ty = ((iy + (step_y > 0)) - cy - src_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
1074
+ tz = ((iz + (step_z > 0)) - cz - src_z) * inv_dir_z if abs(dir_z) > _EPSILON else _INF
1047
1075
 
1048
1076
  # === 3D TRAVERSAL LOOP WITH TRILINEAR INTERPOLATION ===
1049
1077
  while t < t_max:
@@ -1056,34 +1084,35 @@ def _cone_3d_forward_kernel(
1056
1084
  # === TRILINEAR INTERPOLATION SAMPLING ===
1057
1085
  # Sample 3D volume at ray segment midpoint for accurate integration
1058
1086
  # Mathematical basis: Midpoint rule for numerical integration along 3D ray segments
1059
- mid_x = src_x + (t + seg_len * 0.5) * dir_x + cx # Midpoint x-coordinate in volume space
1060
- mid_y = src_y + (t + seg_len * 0.5) * dir_y + cy # Midpoint y-coordinate in volume space
1061
- mid_z = src_z + (t + seg_len * 0.5) * dir_z + cz # Midpoint z-coordinate in volume space
1062
-
1087
+ t_mid = t + seg_len * 0.5
1088
+ mid_x = src_x + t_mid * dir_x + cx # Midpoint x-coordinate in volume space
1089
+ mid_y = src_y + t_mid * dir_y + cy # Midpoint y-coordinate in volume space
1090
+ mid_z = src_z + t_mid * dir_z + cz # Midpoint z-coordinate in volume space
1091
+
1063
1092
  # Convert continuous 3D coordinates to discrete voxel indices and fractional weights
1064
- # Floor operation gives base voxel index, fractional part gives interpolation weights
1065
- ix0, iy0, iz0 = int(math.floor(mid_x)), int(math.floor(mid_y)), int(math.floor(mid_z)) # Base voxel indices (corner 0,0,0)
1066
- dx, dy, dz = mid_x - ix0, mid_y - iy0, mid_z - iz0 # Fractional parts: distance from base voxel center [0,1]
1067
-
1093
+ ix0, iy0, iz0 = int(math.floor(mid_x)), int(math.floor(mid_y)), int(math.floor(mid_z))
1094
+ dx, dy, dz = mid_x - ix0, mid_y - iy0, mid_z - iz0
1095
+
1068
1096
  # Clamp indices to stay in-bounds during interpolation
1069
1097
  ix0 = max(0, min(ix0, Nx - 2))
1070
1098
  iy0 = max(0, min(iy0, Ny - 2))
1071
1099
  iz0 = max(0, min(iz0, Nz - 2))
1072
-
1100
+
1101
+ # Precompute complements
1102
+ omdx = 1.0 - dx
1103
+ omdy = 1.0 - dy
1104
+ omdz = 1.0 - dz
1105
+
1073
1106
  # === TRILINEAR INTERPOLATION WEIGHT CALCULATION ===
1074
- # Mathematical basis: Trilinear interpolation formula f(x,y,z) = Σ f(xi,yi,zi) * wi(x,y,z)
1075
- # where wi(x,y,z) are the trilinear basis functions for each corner voxel of the 3D cube
1076
- # Weights are products of 1D linear interpolation weights: (1-dx) or dx, (1-dy) or dy, (1-dz) or dz
1077
- # Each of the 8 cube corners gets a weight proportional to its distance from the sample point
1078
1107
  val = (
1079
- d_vol[ix0, iy0, iz0] * (1-dx)*(1-dy)*(1-dz) + # Corner (0,0,0): weight = product of distances from opposite faces
1080
- d_vol[ix0 + 1, iy0, iz0] * dx*(1-dy)*(1-dz) + # Corner (1,0,0): weight = dx * (1-dy) * (1-dz)
1081
- d_vol[ix0, iy0 + 1, iz0] * (1-dx)*dy*(1-dz) + # Corner (0,1,0): weight = (1-dx) * dy * (1-dz)
1082
- d_vol[ix0, iy0, iz0 + 1] * (1-dx)*(1-dy)*dz + # Corner (0,0,1): weight = (1-dx) * (1-dy) * dz
1083
- d_vol[ix0 + 1, iy0 + 1, iz0] * dx*dy*(1-dz) + # Corner (1,1,0): weight = dx * dy * (1-dz)
1084
- d_vol[ix0 + 1, iy0, iz0 + 1] * dx*(1-dy)*dz + # Corner (1,0,1): weight = dx * (1-dy) * dz
1085
- d_vol[ix0, iy0 + 1, iz0 + 1] * (1-dx)*dy*dz + # Corner (0,1,1): weight = (1-dx) * dy * dz
1086
- d_vol[ix0 + 1, iy0 + 1, iz0 + 1] * dx*dy*dz # Corner (1,1,1): weight = dx * dy * dz
1108
+ d_vol[ix0, iy0, iz0] * omdx*omdy*omdz +
1109
+ d_vol[ix0 + 1, iy0, iz0] * dx *omdy*omdz +
1110
+ d_vol[ix0, iy0 + 1, iz0] * omdx*dy *omdz +
1111
+ d_vol[ix0, iy0, iz0 + 1] * omdx*omdy*dz +
1112
+ d_vol[ix0 + 1, iy0 + 1, iz0] * dx *dy *omdz +
1113
+ d_vol[ix0 + 1, iy0, iz0 + 1] * dx *omdy*dz +
1114
+ d_vol[ix0, iy0 + 1, iz0 + 1] * omdx*dy *dz +
1115
+ d_vol[ix0 + 1, iy0 + 1, iz0 + 1] * dx *dy *dz
1087
1116
  )
1088
1117
  # Accumulate contribution weighted by 3D ray segment length (discrete line integral approximation)
1089
1118
  # This implements the 3D X-ray (cone-beam) transform: line integral of f(x,y,z) along the ray path
@@ -1106,7 +1135,7 @@ def _cone_3d_forward_kernel(
1106
1135
 
1107
1136
  d_sino[iview, iu, iv] = accum
1108
1137
 
1109
- @_NON_FASTMATH_DECORATOR
1138
+ @_FASTMATH_DECORATOR
1110
1139
  def _cone_3d_backward_kernel(
1111
1140
  d_sino, n_views, n_u, n_v,
1112
1141
  d_vol, Nx, Ny, Nz,
@@ -1115,8 +1144,8 @@ def _cone_3d_backward_kernel(
1115
1144
  ):
1116
1145
  """Compute the 3D cone-beam backprojection.
1117
1146
 
1118
- This CUDA kernel implements the Siddon-Joseph algorithm for 3D cone-beam
1119
- backprojection.
1147
+ This CUDA kernel implements the Siddon ray-tracing method with interpolation for
1148
+ 3D cone-beam backprojection.
1120
1149
 
1121
1150
  Parameters
1122
1151
  ----------
@@ -1219,7 +1248,7 @@ def _cone_3d_backward_kernel(
1219
1248
 
1220
1249
  if t_min >= t_max: return
1221
1250
 
1222
- # === 3D SIDDON-JOSEPH TRAVERSAL INITIALIZATION ===
1251
+ # === 3D SIDDON METHOD TRAVERSAL INITIALIZATION ===
1223
1252
  t = t_min
1224
1253
  ix = int(math.floor(src_x + t * dir_x + cx)) # Current voxel x-index
1225
1254
  iy = int(math.floor(src_y + t * dir_y + cy)) # Current voxel y-index
@@ -1227,14 +1256,17 @@ def _cone_3d_backward_kernel(
1227
1256
 
1228
1257
  # 3D traversal parameters (extends 2D algorithm)
1229
1258
  step_x, step_y, step_z = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1), (1 if dir_z >= 0 else -1)
1230
- dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF # Parameter increment per x-voxel
1231
- dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF # Parameter increment per y-voxel
1232
- dt_z = abs(1.0 / dir_z) if abs(dir_z) > _EPSILON else _INF # Parameter increment per z-voxel
1233
-
1259
+ inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
1260
+ inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
1261
+ inv_dir_z = (1.0 / dir_z) if abs(dir_z) > _EPSILON else 0.0
1262
+ dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF # Parameter increment per x-voxel
1263
+ dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF # Parameter increment per y-voxel
1264
+ dt_z = abs(inv_dir_z) if abs(dir_z) > _EPSILON else _INF # Parameter increment per z-voxel
1265
+
1234
1266
  # Calculate parameter values for next 3D voxel boundary crossings
1235
- tx = ((ix + (step_x > 0)) - cx - src_x) / dir_x if abs(dir_x) > _EPSILON else _INF
1236
- ty = ((iy + (step_y > 0)) - cy - src_y) / dir_y if abs(dir_y) > _EPSILON else _INF
1237
- tz = ((iz + (step_z > 0)) - cz - src_z) / dir_z if abs(dir_z) > _EPSILON else _INF
1267
+ tx = ((ix + (step_x > 0)) - cx - src_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
1268
+ ty = ((iy + (step_y > 0)) - cy - src_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
1269
+ tz = ((iz + (step_z > 0)) - cz - src_z) * inv_dir_z if abs(dir_z) > _EPSILON else _INF
1238
1270
 
1239
1271
  # === 3D CONE BEAM BACKPROJECTION TRAVERSAL LOOP ===
1240
1272
  # Distribute sinogram value along divergent 3D ray path using trilinear interpolation
@@ -1247,35 +1279,35 @@ def _cone_3d_backward_kernel(
1247
1279
  if seg_len > _EPSILON:
1248
1280
  # === TRILINEAR INTERPOLATION SAMPLING ===
1249
1281
  # Sample 3D volume at ray segment midpoint using source as ray origin
1250
- mid_x = src_x + (t + seg_len * 0.5) * dir_x + cx # Midpoint x-coordinate
1251
- mid_y = src_y + (t + seg_len * 0.5) * dir_y + cy # Midpoint y-coordinate
1252
- mid_z = src_z + (t + seg_len * 0.5) * dir_z + cz # Midpoint z-coordinate
1253
-
1282
+ t_mid = t + seg_len * 0.5
1283
+ mid_x = src_x + t_mid * dir_x + cx
1284
+ mid_y = src_y + t_mid * dir_y + cy
1285
+ mid_z = src_z + t_mid * dir_z + cz
1286
+
1254
1287
  # Convert continuous 3D coordinates to voxel indices and interpolation weights
1255
1288
  ix0, iy0, iz0 = int(math.floor(mid_x)), int(math.floor(mid_y)), int(math.floor(mid_z))
1256
- dx, dy, dz = mid_x - ix0, mid_y - iy0, mid_z - iz0 # Fractional parts for 3D weights
1257
-
1289
+ dx, dy, dz = mid_x - ix0, mid_y - iy0, mid_z - iz0
1290
+
1258
1291
  # Clamp indices to stay in-bounds during interpolation
1259
1292
  ix0 = max(0, min(ix0, Nx - 2))
1260
1293
  iy0 = max(0, min(iy0, Ny - 2))
1261
1294
  iz0 = max(0, min(iz0, Nz - 2))
1262
-
1295
+
1296
+ # Precompute complements and contribution
1297
+ omdx = 1.0 - dx
1298
+ omdy = 1.0 - dy
1299
+ omdz = 1.0 - dz
1300
+ cval = g * seg_len
1301
+
1263
1302
  # === ATOMIC BACKPROJECTION WITH TRILINEAR WEIGHTS ===
1264
- # Distribute contribution weighted by segment length and interpolation weights
1265
- # CUDA 3D ATOMIC OPERATIONS: Most complex atomic pattern in cone beam backprojection
1266
- # 8 atomic writes per ray segment (one per cube corner) increases memory contention significantly
1267
- # Cone beam geometry creates maximum ray convergence, highest probability of write conflicts
1268
- # Performance impact: 3D atomics are most expensive due to volume of concurrent writes
1269
- # Memory bandwidth: 8 atomic operations per interpolation point can saturate memory subsystem
1270
- cval = g * seg_len # Contribution value for this ray segment
1271
- cuda.atomic.add(d_vol, (ix0, iy0, iz0), cval * (1-dx)*(1-dy)*(1-dz)) # Corner (0,0,0) - atomic write
1272
- cuda.atomic.add(d_vol, (ix0 + 1, iy0, iz0), cval * dx*(1-dy)*(1-dz)) # Corner (1,0,0) - atomic write
1273
- cuda.atomic.add(d_vol, (ix0, iy0 + 1, iz0), cval * (1-dx)*dy*(1-dz)) # Corner (0,1,0) - atomic write
1274
- cuda.atomic.add(d_vol, (ix0, iy0, iz0 + 1), cval * (1-dx)*(1-dy)*dz) # Corner (0,0,1) - atomic write
1275
- cuda.atomic.add(d_vol, (ix0 + 1, iy0 + 1, iz0), cval * dx*dy*(1-dz)) # Corner (1,1,0) - atomic write
1276
- cuda.atomic.add(d_vol, (ix0 + 1, iy0, iz0 + 1), cval * dx*(1-dy)*dz) # Corner (1,0,1) - atomic write
1277
- cuda.atomic.add(d_vol, (ix0, iy0 + 1, iz0 + 1), cval * (1-dx)*dy*dz) # Corner (0,1,1) - atomic write
1278
- cuda.atomic.add(d_vol, (ix0 + 1, iy0 + 1, iz0 + 1), cval * dx*dy*dz) # Corner (1,1,1) - atomic write
1303
+ cuda.atomic.add(d_vol, (ix0, iy0, iz0), cval * omdx*omdy*omdz)
1304
+ cuda.atomic.add(d_vol, (ix0 + 1, iy0, iz0), cval * dx *omdy*omdz)
1305
+ cuda.atomic.add(d_vol, (ix0, iy0 + 1, iz0), cval * omdx*dy *omdz)
1306
+ cuda.atomic.add(d_vol, (ix0, iy0, iz0 + 1), cval * omdx*omdy*dz)
1307
+ cuda.atomic.add(d_vol, (ix0 + 1, iy0 + 1, iz0), cval * dx *dy *omdz)
1308
+ cuda.atomic.add(d_vol, (ix0 + 1, iy0, iz0 + 1), cval * dx *omdy*dz)
1309
+ cuda.atomic.add(d_vol, (ix0, iy0 + 1, iz0 + 1), cval * omdx*dy *dz)
1310
+ cuda.atomic.add(d_vol, (ix0 + 1, iy0 + 1, iz0 + 1), cval * dx *dy *dz)
1279
1311
 
1280
1312
  # === 3D VOXEL BOUNDARY CROSSING LOGIC ===
1281
1313
  # Advance to next voxel based on which boundary is crossed first in 3D
@@ -1305,8 +1337,8 @@ class ParallelProjectorFunction(torch.autograd.Function):
1305
1337
 
1306
1338
  Notes
1307
1339
  -----
1308
- Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph
1309
- ray-tracing algorithm for parallel beam CT geometry. The forward pass computes
1340
+ Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
1341
+ method with interpolation for parallel beam CT geometry. The forward pass computes
1310
1342
  the sinogram from a 2D image using parallel beam geometry. The backward pass
1311
1343
  computes gradients using the adjoint backprojection operation. Requires
1312
1344
  CUDA-capable hardware and a properly configured CUDA environment; all input
@@ -1358,7 +1390,7 @@ class ParallelProjectorFunction(torch.autograd.Function):
1358
1390
  -----
1359
1391
  - All input tensors must be on the same CUDA device.
1360
1392
  - The operation is fully differentiable and supports autograd.
1361
- - Uses the Siddon-Joseph algorithm for accurate ray tracing and bilinear interpolation.
1393
+ - Uses the Siddon method with interpolation for accurate ray tracing and bilinear interpolation.
1362
1394
 
1363
1395
  Examples
1364
1396
  --------
@@ -1394,11 +1426,12 @@ class ParallelProjectorFunction(torch.autograd.Function):
1394
1426
  grid, tpb = _grid_2d(n_angles, num_detectors)
1395
1427
  cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
1396
1428
 
1397
- _parallel_2d_forward_kernel[grid, tpb](
1429
+ pt_stream = torch.cuda.current_stream()
1430
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
1431
+ _parallel_2d_forward_kernel[grid, tpb, numba_stream](
1398
1432
  d_image, Nx, Ny, d_sino, n_angles, num_detectors,
1399
1433
  _DTYPE(detector_spacing), d_cos_arr, d_sin_arr, cx, cy, _DTYPE(voxel_spacing)
1400
1434
  )
1401
- torch.cuda.synchronize()
1402
1435
 
1403
1436
  ctx.save_for_backward(angles)
1404
1437
  ctx.intermediate = (num_detectors, detector_spacing, Ny, Nx, voxel_spacing)
@@ -1427,12 +1460,13 @@ class ParallelProjectorFunction(torch.autograd.Function):
1427
1460
  grid, tpb = _grid_2d(n_angles, num_detectors)
1428
1461
  cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
1429
1462
 
1430
- _parallel_2d_backward_kernel[grid, tpb](
1463
+ pt_stream = torch.cuda.current_stream()
1464
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
1465
+ _parallel_2d_backward_kernel[grid, tpb, numba_stream](
1431
1466
  d_grad_sino, n_angles, num_detectors,
1432
1467
  d_img_grad, Nx, Ny,
1433
1468
  _DTYPE(detector_spacing), d_cos_arr, d_sin_arr, cx, cy, _DTYPE(voxel_spacing)
1434
1469
  )
1435
- torch.cuda.synchronize()
1436
1470
 
1437
1471
  return grad_image, None, None, None, None
1438
1472
 
@@ -1445,8 +1479,8 @@ class ParallelBackprojectorFunction(torch.autograd.Function):
1445
1479
 
1446
1480
  Notes
1447
1481
  -----
1448
- Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph ray-tracing
1449
- algorithm for parallel beam backprojection. The forward pass computes a 2D
1482
+ Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
1483
+ method with interpolation for parallel beam backprojection. The forward pass computes a 2D
1450
1484
  reconstruction from sinogram data using parallel beam backprojection, and the
1451
1485
  backward pass computes gradients via forward projection as the adjoint operation.
1452
1486
  Requires CUDA-capable hardware and consistent device placements.
@@ -1493,7 +1527,7 @@ class ParallelBackprojectorFunction(torch.autograd.Function):
1493
1527
  -----
1494
1528
  - All input tensors must be on the same CUDA device.
1495
1529
  - The operation is fully differentiable and supports autograd.
1496
- - Uses the Siddon-Joseph algorithm for accurate ray tracing and bilinear interpolation.
1530
+ - Uses the Siddon method with interpolation for accurate ray tracing and bilinear interpolation.
1497
1531
 
1498
1532
  Examples
1499
1533
  --------
@@ -1529,11 +1563,12 @@ class ParallelBackprojectorFunction(torch.autograd.Function):
1529
1563
  grid, tpb = _grid_2d(n_ang, n_det)
1530
1564
  cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
1531
1565
 
1532
- _parallel_2d_backward_kernel[grid, tpb](
1566
+ pt_stream = torch.cuda.current_stream()
1567
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
1568
+ _parallel_2d_backward_kernel[grid, tpb, numba_stream](
1533
1569
  d_sino, n_ang, n_det, d_reco, Nx, Ny,
1534
1570
  _DTYPE(detector_spacing), d_cos_arr, d_sin_arr, cx, cy, _DTYPE(voxel_spacing)
1535
1571
  )
1536
- torch.cuda.synchronize()
1537
1572
 
1538
1573
  ctx.save_for_backward(angles)
1539
1574
  ctx.intermediate = (H, W, detector_spacing, sinogram.shape[0], sinogram.shape[1], voxel_spacing)
@@ -1567,11 +1602,12 @@ class ParallelBackprojectorFunction(torch.autograd.Function):
1567
1602
  grid, tpb = _grid_2d(n_ang, n_det)
1568
1603
  cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
1569
1604
 
1570
- _parallel_2d_forward_kernel[grid, tpb](
1605
+ pt_stream = torch.cuda.current_stream()
1606
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
1607
+ _parallel_2d_forward_kernel[grid, tpb, numba_stream](
1571
1608
  d_grad_out, Nx, Ny, d_sino_grad, n_ang, n_det,
1572
1609
  _DTYPE(detector_spacing), d_cos, d_sin, cx, cy, _DTYPE(voxel_spacing)
1573
1610
  )
1574
- torch.cuda.synchronize()
1575
1611
 
1576
1612
  return grad_sino, None, None, None, None, None
1577
1613
 
@@ -1584,8 +1620,8 @@ class FanProjectorFunction(torch.autograd.Function):
1584
1620
 
1585
1621
  Notes
1586
1622
  -----
1587
- Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph
1588
- ray-tracing algorithm for fan beam geometry, where rays diverge from a point
1623
+ Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
1624
+ method with interpolation for fan beam geometry, where rays diverge from a point
1589
1625
  X-ray source to a linear detector array. The forward pass computes sinograms
1590
1626
  using divergent beam geometry, and the backward pass computes gradients via
1591
1627
  adjoint backprojection.
@@ -1637,7 +1673,7 @@ class FanProjectorFunction(torch.autograd.Function):
1637
1673
  - All input tensors must be on the same CUDA device.
1638
1674
  - The operation is fully differentiable and supports autograd.
1639
1675
  - Fan beam geometry uses divergent rays from a point source to the detector.
1640
- - Uses the Siddon-Joseph algorithm for accurate ray tracing and bilinear interpolation.
1676
+ - Uses the Siddon method with interpolation for accurate ray tracing and bilinear interpolation.
1641
1677
 
1642
1678
  Examples
1643
1679
  --------
@@ -1668,12 +1704,13 @@ class FanProjectorFunction(torch.autograd.Function):
1668
1704
  grid, tpb = _grid_2d(n_ang, num_detectors)
1669
1705
  cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
1670
1706
 
1671
- _fan_2d_forward_kernel[grid, tpb](
1707
+ pt_stream = torch.cuda.current_stream()
1708
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
1709
+ _fan_2d_forward_kernel[grid, tpb, numba_stream](
1672
1710
  d_image, Nx, Ny, d_sino, n_ang, num_detectors,
1673
1711
  _DTYPE(detector_spacing), d_cos_arr, d_sin_arr,
1674
1712
  _DTYPE(sdd), _DTYPE(sid), cx, cy, _DTYPE(voxel_spacing)
1675
1713
  )
1676
- torch.cuda.synchronize()
1677
1714
 
1678
1715
  ctx.save_for_backward(angles)
1679
1716
  ctx.intermediate = (num_detectors, detector_spacing, Ny, Nx,
@@ -1703,12 +1740,13 @@ class FanProjectorFunction(torch.autograd.Function):
1703
1740
  grid, tpb = _grid_2d(n_ang, n_det)
1704
1741
  cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
1705
1742
 
1706
- _fan_2d_backward_kernel[grid, tpb](
1743
+ pt_stream = torch.cuda.current_stream()
1744
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
1745
+ _fan_2d_backward_kernel[grid, tpb, numba_stream](
1707
1746
  d_grad_sino, n_ang, n_det, d_img_grad, Nx, Ny,
1708
1747
  _DTYPE(det_spacing), d_cos_arr, d_sin_arr,
1709
1748
  _DTYPE(sdd), _DTYPE(sid), cx, cy, _DTYPE(voxel_spacing)
1710
1749
  )
1711
- torch.cuda.synchronize()
1712
1750
 
1713
1751
  return grad_img, None, None, None, None, None, None
1714
1752
 
@@ -1721,8 +1759,8 @@ class FanBackprojectorFunction(torch.autograd.Function):
1721
1759
 
1722
1760
  Notes
1723
1761
  -----
1724
- Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph
1725
- ray-tracing algorithm for fan beam backprojection. Implements the adjoint
1762
+ Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
1763
+ method with interpolation for fan beam backprojection. Implements the adjoint
1726
1764
  of the fan beam projection operator, distributing sinogram values back into
1727
1765
  the reconstruction volume along divergent ray paths. The forward pass
1728
1766
  computes reconstruction from sinogram data, and the backward pass computes
@@ -1777,7 +1815,7 @@ class FanBackprojectorFunction(torch.autograd.Function):
1777
1815
  - All input tensors must be on the same CUDA device.
1778
1816
  - The operation is fully differentiable and supports autograd.
1779
1817
  - Fan beam geometry uses divergent rays from a point source to the detector.
1780
- - Uses the Siddon-Joseph algorithm for accurate ray tracing and bilinear interpolation.
1818
+ - Uses the Siddon method with interpolation for accurate ray tracing and bilinear interpolation.
1781
1819
 
1782
1820
  Examples
1783
1821
  --------
@@ -1808,12 +1846,13 @@ class FanBackprojectorFunction(torch.autograd.Function):
1808
1846
  grid, tpb = _grid_2d(n_ang, n_det)
1809
1847
  cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
1810
1848
 
1811
- _fan_2d_backward_kernel[grid, tpb](
1849
+ pt_stream = torch.cuda.current_stream()
1850
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
1851
+ _fan_2d_backward_kernel[grid, tpb, numba_stream](
1812
1852
  d_sino, n_ang, n_det, d_reco, Nx, Ny,
1813
1853
  _DTYPE(detector_spacing), d_cos_arr, d_sin_arr,
1814
1854
  _DTYPE(sdd), _DTYPE(sid), cx, cy, _DTYPE(voxel_spacing)
1815
1855
  )
1816
- torch.cuda.synchronize()
1817
1856
 
1818
1857
  ctx.save_for_backward(angles)
1819
1858
  ctx.intermediate = (H, W, detector_spacing, n_ang, n_det, sdd, sid, voxel_spacing)
@@ -1843,12 +1882,13 @@ class FanBackprojectorFunction(torch.autograd.Function):
1843
1882
  grid, tpb = _grid_2d(n_ang, n_det)
1844
1883
  cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
1845
1884
 
1846
- _fan_2d_forward_kernel[grid, tpb](
1885
+ pt_stream = torch.cuda.current_stream()
1886
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
1887
+ _fan_2d_forward_kernel[grid, tpb, numba_stream](
1847
1888
  d_grad_out, Nx, Ny, d_sino_grad, n_ang, n_det,
1848
1889
  _DTYPE(det_spacing), d_cos_arr, d_sin_arr,
1849
1890
  _DTYPE(sdd), _DTYPE(sid), cx, cy, _DTYPE(voxel_spacing)
1850
1891
  )
1851
- torch.cuda.synchronize()
1852
1892
 
1853
1893
  return grad_sino, None, None, None, None, None, None, None
1854
1894
 
@@ -1861,8 +1901,8 @@ class ConeProjectorFunction(torch.autograd.Function):
1861
1901
 
1862
1902
  Notes
1863
1903
  -----
1864
- Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph
1865
- ray-tracing algorithm for 3D cone beam geometry. Rays emanate from a point
1904
+ Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
1905
+ method with interpolation for 3D cone beam geometry. Rays emanate from a point
1866
1906
  X-ray source to a 2D detector array capturing volumetric projection data.
1867
1907
  The forward pass computes 3D projections, and the backward pass computes
1868
1908
  gradients via adjoint 3D backprojection. Requires significant GPU memory.
@@ -1918,7 +1958,7 @@ class ConeProjectorFunction(torch.autograd.Function):
1918
1958
  - All input tensors must be on the same CUDA device.
1919
1959
  - The operation is fully differentiable and supports autograd.
1920
1960
  - Cone beam geometry uses a point source and a 2D detector array.
1921
- - Uses the Siddon-Joseph algorithm for accurate 3D ray tracing and trilinear interpolation.
1961
+ - Uses the Siddon method with interpolation for accurate 3D ray tracing and trilinear interpolation.
1922
1962
 
1923
1963
  Examples
1924
1964
  --------
@@ -1953,13 +1993,14 @@ class ConeProjectorFunction(torch.autograd.Function):
1953
1993
  grid, tpb = _grid_3d(n_views, det_u, det_v)
1954
1994
  cx, cy, cz = _DTYPE(W * 0.5), _DTYPE(H * 0.5), _DTYPE(D * 0.5)
1955
1995
 
1956
- _cone_3d_forward_kernel[grid, tpb](
1996
+ pt_stream = torch.cuda.current_stream()
1997
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
1998
+ _cone_3d_forward_kernel[grid, tpb, numba_stream](
1957
1999
  d_vol, W, H, D, d_sino, n_views, det_u, det_v,
1958
2000
  _DTYPE(du), _DTYPE(dv), d_cos_arr, d_sin_arr,
1959
2001
  _DTYPE(sdd), _DTYPE(sid),
1960
2002
  cx, cy, cz, _DTYPE(voxel_spacing)
1961
2003
  )
1962
- torch.cuda.synchronize()
1963
2004
 
1964
2005
  ctx.save_for_backward(angles)
1965
2006
  ctx.intermediate = (D, H, W, det_u, det_v, du, dv,
@@ -1991,12 +2032,13 @@ class ConeProjectorFunction(torch.autograd.Function):
1991
2032
  grid, tpb = _grid_3d(n_views, det_u, det_v)
1992
2033
  cx, cy, cz = _DTYPE(W * 0.5), _DTYPE(H * 0.5), _DTYPE(D * 0.5)
1993
2034
 
1994
- _cone_3d_backward_kernel[grid, tpb](
2035
+ pt_stream = torch.cuda.current_stream()
2036
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
2037
+ _cone_3d_backward_kernel[grid, tpb, numba_stream](
1995
2038
  d_grad_sino, n_views, det_u, det_v, d_vol_grad, W, H, D,
1996
2039
  _DTYPE(du), _DTYPE(dv), d_cos_arr, d_sin_arr,
1997
2040
  _DTYPE(sdd), _DTYPE(sid), cx, cy, cz, _DTYPE(voxel_spacing)
1998
2041
  )
1999
- torch.cuda.synchronize()
2000
2042
 
2001
2043
  grad_vol = grad_vol_perm.permute(2, 1, 0).contiguous()
2002
2044
  return grad_vol, None, None, None, None, None, None, None, None
@@ -2010,8 +2052,8 @@ class ConeBackprojectorFunction(torch.autograd.Function):
2010
2052
 
2011
2053
  Notes
2012
2054
  -----
2013
- Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph
2014
- ray-tracing algorithm for 3D cone beam backprojection. The forward pass
2055
+ Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
2056
+ method with interpolation for 3D cone beam backprojection. The forward pass
2015
2057
  computes a 3D reconstruction from cone beam projection data using
2016
2058
  backprojection as the adjoint operation. The backward pass computes gradients
2017
2059
  via 3D cone beam forward projection. Requires CUDA-capable hardware and
@@ -2078,7 +2120,7 @@ class ConeBackprojectorFunction(torch.autograd.Function):
2078
2120
  - All input tensors must be on the same CUDA device.
2079
2121
  - The operation is fully differentiable and supports autograd.
2080
2122
  - Cone beam geometry uses a point source and a 2D detector array.
2081
- - Uses the Siddon-Joseph algorithm for accurate 3D ray tracing and trilinear interpolation.
2123
+ - Uses the Siddon method with interpolation for accurate 3D ray tracing and trilinear interpolation.
2082
2124
 
2083
2125
  Examples
2084
2126
  --------
@@ -2111,12 +2153,13 @@ class ConeBackprojectorFunction(torch.autograd.Function):
2111
2153
  grid, tpb = _grid_3d(n_views, n_u, n_v)
2112
2154
  cx, cy, cz = _DTYPE(W * 0.5), _DTYPE(H * 0.5), _DTYPE(D * 0.5)
2113
2155
 
2114
- _cone_3d_backward_kernel[grid, tpb](
2156
+ pt_stream = torch.cuda.current_stream()
2157
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
2158
+ _cone_3d_backward_kernel[grid, tpb, numba_stream](
2115
2159
  d_sino, n_views, n_u, n_v, d_reco, W, H, D,
2116
2160
  _DTYPE(du), _DTYPE(dv), d_cos_arr, d_sin_arr,
2117
2161
  _DTYPE(sdd), _DTYPE(sid), cx, cy, cz, _DTYPE(voxel_spacing)
2118
2162
  )
2119
- torch.cuda.synchronize()
2120
2163
 
2121
2164
  ctx.save_for_backward(angles)
2122
2165
  ctx.intermediate = (D, H, W, n_u, n_v, du, dv,
@@ -2150,11 +2193,12 @@ class ConeBackprojectorFunction(torch.autograd.Function):
2150
2193
  grid, tpb = _grid_3d(n_views, n_u, n_v)
2151
2194
  cx, cy, cz = _DTYPE(W * 0.5), _DTYPE(H * 0.5), _DTYPE(D * 0.5)
2152
2195
 
2153
- _cone_3d_forward_kernel[grid, tpb](
2196
+ pt_stream = torch.cuda.current_stream()
2197
+ numba_stream = cuda.external_stream(pt_stream.cuda_stream)
2198
+ _cone_3d_forward_kernel[grid, tpb, numba_stream](
2154
2199
  d_grad_out, W, H, D, d_sino_grad, n_views, n_u, n_v,
2155
2200
  _DTYPE(du), _DTYPE(dv), d_cos_arr, d_sin_arr,
2156
2201
  _DTYPE(sdd), _DTYPE(sid), cx, cy, cz, _DTYPE(voxel_spacing)
2157
2202
  )
2158
- torch.cuda.synchronize()
2159
2203
 
2160
2204
  return grad_sino, None, None, None, None, None, None, None, None, None
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "diffct"
7
- version = "1.2.6"
7
+ version = "1.2.7"
8
8
  description = "A CUDA-based library for computed tomography (CT) projection and reconstruction with differentiable operators"
9
9
  readme = "README.md"
10
10
  authors = [
@@ -36,5 +36,4 @@ where = ["."]
36
36
  [tool.hatch.envs.default]
37
37
  python = "python"
38
38
 
39
- [tool.hatch.envs.default.env-vars]
40
- PYTHONDONTWRITEBYTECODE = "1"
39
+ [tool.hatch.envs.default.env-vars]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes