PyPI - diffct - Versions diffs - 1.2.6__tar.gz → 1.2.7__tar.gz - Mend

diffct 1.2.6.tar.gz → 1.2.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

{diffct-1.2.6 → diffct-1.2.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffct
-Version: 1.2.6
+Version: 1.2.7
 Summary: A CUDA-based library for computed tomography (CT) projection and reconstruction with differentiable operators
 Project-URL: Homepage, https://github.com/sypsyp97/diffct
 Author-email: Yipeng Sun <yipeng.sun@fau.de>
@@ -69,7 +69,33 @@ diffct/
 ### Installation
+**CUDA 12:**
 ```bash
+# Create and activate conda environment
+conda create -n diffct python=3.12
+conda activate diffct
+# Install CUDA Toolkit, PyTorch, and Numba
+conda install nvidia/label/cuda-12.8.1::cuda-toolkit
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+pip install numba-cuda[cu12]
+# Install diffct
+pip install diffct
+```
+**CUDA 11:**
+```bash
+# Create and activate conda environment
+conda create -n diffct python=3.12
+conda activate diffct
+# Install CUDA Toolkit, PyTorch, and Numba
+conda install nvidia/label/cuda-11.8.0::cuda-toolkit
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+pip install numba-cuda[cu11]
+# Install diffct
 pip install diffct
 ```

{diffct-1.2.6 → diffct-1.2.7}/README.md RENAMED Viewed

@@ -52,7 +52,33 @@ diffct/
 ### Installation
+**CUDA 12:**
 ```bash
+# Create and activate conda environment
+conda create -n diffct python=3.12
+conda activate diffct
+# Install CUDA Toolkit, PyTorch, and Numba
+conda install nvidia/label/cuda-12.8.1::cuda-toolkit
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+pip install numba-cuda[cu12]
+# Install diffct
+pip install diffct
+```
+**CUDA 11:**
+```bash
+# Create and activate conda environment
+conda create -n diffct python=3.12
+conda activate diffct
+# Install CUDA Toolkit, PyTorch, and Numba
+conda install nvidia/label/cuda-11.8.0::cuda-toolkit
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+pip install numba-cuda[cu11]
+# Install diffct
 pip install diffct
 ```

{diffct-1.2.6 → diffct-1.2.7}/diffct/differentiable.py RENAMED Viewed

@@ -2,6 +2,7 @@ import math
 import numpy as np
 import torch
 from numba import cuda
+from functools import lru_cache
 # ---------------------------------------------------------------------------
 # Global settings & helpers
@@ -19,8 +20,7 @@ _TPB_3D             = (8,  8,  8)
 # Trades numerical precision for performance in ray-tracing calculations
 # Safe for CT reconstruction where slight precision loss is acceptable for speed gains
 _FASTMATH_DECORATOR = cuda.jit(cache=True, fastmath=True)
-# Disable fastmath for backward kernels to ensure gradient correctness
-_NON_FASTMATH_DECORATOR = cuda.jit(cache=True, fastmath=False)
 _INF                = _DTYPE(np.inf)
 _EPSILON            = _DTYPE(1e-6)
 # === Device Management Utilities ===
@@ -70,8 +70,8 @@ class DeviceManager:
         ... )
         tensor([1, 2, 3], device='cuda:0')
         """
-        if hasattr(tensor, "to"):
-            return tensor if tensor.device == device else tensor.to(device)
+        if hasattr(tensor, "to") and tensor.device != device:
+            return tensor.to(device)
         return tensor
 # === PyTorch-CUDA Bridge ===
@@ -110,6 +110,7 @@ class TorchCUDABridge:
 # === GPU-aware Trigonometric Table Generation ===
+@lru_cache(maxsize=2048)
 def _trig_tables(angles, dtype=_DTYPE, device=None):
     """Compute cosine and sine tables for input angles.
@@ -139,9 +140,11 @@ def _trig_tables(angles, dtype=_DTYPE, device=None):
     """
     if isinstance(angles, torch.Tensor):
         device = angles.device if device is None else device
-        cos = torch.cos(angles).to(dtype=dtype)
-        sin = torch.sin(angles).to(dtype=dtype)
-        return cos.to(device), sin.to(device)
+        # Convert dtype and device once up front so cos and sin share a single conversion
+        angles_device = angles.to(dtype=dtype, device=device)
+        cos = torch.cos(angles_device)
+        sin = torch.sin(angles_device)
+        return cos, sin
     else:
         # fallback for non-tensor inputs: compute via PyTorch on CPU for consistency
         # Determine desired torch dtype
@@ -153,7 +156,7 @@ def _trig_tables(angles, dtype=_DTYPE, device=None):
                 np.float64: torch.float64,
             }
             torch_dtype = _NP_TO_TORCH.get(dtype, torch.float32)
-        # Convert input angles to a CPU torch tensor
+        # Convert input angles to a CPU torch tensor, then compute cos and sin from it
         angles_cpu = torch.tensor(angles, dtype=torch_dtype)
         cos_cpu = torch.cos(angles_cpu)
         sin_cpu = torch.sin(angles_cpu)
@@ -182,18 +185,23 @@ def _validate_3d_memory_layout(tensor, expected_order='DHW'):
     ValueError
         If tensor has unexpected memory layout or is non-contiguous
     """
-    if len(tensor.shape) != 3:
-        raise ValueError(f"Expected 3D tensor, got {len(tensor.shape)}D")
-    # Check if tensor is contiguous to avoid memory duplication
-    if not tensor.is_contiguous():
-        raise ValueError(
-            "Input tensor must be contiguous. Call .contiguous() before passing to "
-            "cone beam functions to avoid memory duplication and ensure correct results."
-        )
+    shape = tensor.shape
+    if len(shape) != 3:
+        raise ValueError(f"Expected 3D tensor, got {len(shape)}D")
+    # Early return for common case - contiguous tensor with expected ordering
+    if tensor.is_contiguous() and expected_order in ('DHW', 'VHW'):
+        # For DHW and VHW, the expected order matches memory layout when contiguous
+        return
     # Only check memory order for DHW and VHW, not for internal WHD layout
     if expected_order in ('DHW', 'VHW'):
+        if not tensor.is_contiguous():
+            raise ValueError(
+                "Input tensor must be contiguous. Call .contiguous() before passing to "
+                "cone beam functions to avoid memory duplication and ensure correct results."
+            )
         strides = tensor.stride()
         order_mapping = {
             'DHW': (0, 1, 2),  # Depth, Height, Width
@@ -210,15 +218,15 @@ def _validate_3d_memory_layout(tensor, expected_order='DHW'):
         if actual_order != expected_stride_order:
             # Create appropriate error message based on context
             if expected_order == 'VHW':
-                actual_str = f"({tensor.shape[0]}, {tensor.shape[1]}, {tensor.shape[2]})"
+                actual_str = f"({shape[0]}, {shape[1]}, {shape[2]})"
                 expected_str = "(Views, Height, Width)"
                 fix_str = "ensure your sinogram has shape (num_views, det_v, det_u)"
             elif expected_order == 'DHW':
-                actual_str = f"({tensor.shape[0]}, {tensor.shape[1]}, {tensor.shape[2]})"
+                actual_str = f"({shape[0]}, {shape[1]}, {shape[2]})"
                 expected_str = "(Depth, Height, Width)"
                 fix_str = "ensure your volume has shape (D, H, W)"
             else:
-                actual_str = str(tuple(tensor.shape))
+                actual_str = str(tuple(shape))
                 expected_str = expected_order
                 fix_str = "check tensor dimensions"
@@ -316,7 +324,7 @@ def _parallel_2d_forward_kernel(
 ):
     """Compute the 2D parallel beam forward projection.
-    This CUDA kernel implements the Siddon-Joseph ray-tracing algorithm for
+    This CUDA kernel implements the Siddon ray-tracing method with interpolation for
     2D parallel beam forward projection.
     Parameters
@@ -348,7 +356,7 @@ def _parallel_2d_forward_kernel(
     Notes
     -----
-    The Siddon-Joseph algorithm provides accurate ray-volume intersection by:
+    The Siddon method with interpolation provides accurate ray-volume intersection by:
       - Calculating ray-volume boundary intersections to define traversal limits.
       - Iterating through voxels along the ray path via parametric equations.
       - Determining bilinear interpolation weights for sub-voxel sampling.
@@ -407,7 +415,7 @@ def _parallel_2d_forward_kernel(
     if t_min >= t_max:
         d_sino[iang, idet] = 0.0; return
-    # === SIDDON-JOSEPH VOXEL TRAVERSAL INITIALIZATION ===
+    # === SIDDON METHOD VOXEL TRAVERSAL INITIALIZATION ===
     accum = 0.0  # Accumulated projection value along ray
     t = t_min    # Current ray parameter (distance from ray start)
@@ -417,12 +425,15 @@ def _parallel_2d_forward_kernel(
     # Determine traversal direction and step sizes for each axis
     step_x, step_y = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1)  # Voxel stepping direction
-    dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF  # Parameter increment to cross one voxel in x
-    dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF  # Parameter increment to cross one voxel in y
-    # Calculate parameter values for next voxel boundary crossings
-    tx = ((ix + (step_x > 0)) - cx - pnt_x) / dir_x if abs(dir_x) > _EPSILON else _INF  # Next x-boundary crossing
-    ty = ((iy + (step_y > 0)) - cy - pnt_y) / dir_y if abs(dir_y) > _EPSILON else _INF  # Next y-boundary crossing
+    # Precompute reciprocal directions once so boundary crossings multiply instead of divide
+    inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
+    inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
+    dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF
+    dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF
+    # Calculate parameter values for next voxel boundary crossings using inv_dir_*
+    tx = ((ix + (step_x > 0)) - cx - pnt_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
+    ty = ((iy + (step_y > 0)) - cy - pnt_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
     # === MAIN RAY TRAVERSAL LOOP ===
     # Step through voxels along ray path, accumulating weighted contributions
@@ -437,9 +448,10 @@ def _parallel_2d_forward_kernel(
                 # === BILINEAR INTERPOLATION SAMPLING ===
                 # Sample volume at ray segment midpoint for accurate integration
                 # Mathematical basis: Midpoint rule for numerical integration along ray segments
-                mid_x = pnt_x + (t + seg_len * 0.5) * dir_x + cx  # Midpoint x-coordinate in image space
-                mid_y = pnt_y + (t + seg_len * 0.5) * dir_y + cy  # Midpoint y-coordinate in image space
+                t_mid = t + seg_len * 0.5
+                mid_x = pnt_x + t_mid * dir_x + cx  # Midpoint x-coordinate in image space
+                mid_y = pnt_y + t_mid * dir_y + cy  # Midpoint y-coordinate in image space
                 # Convert continuous coordinates to discrete voxel indices and fractional weights
                 # Floor operation gives base voxel index, fractional part gives interpolation weights
                 ix0, iy0 = int(math.floor(mid_x)), int(math.floor(mid_y))  # Base voxel indices (bottom-left corner)
@@ -476,7 +488,7 @@ def _parallel_2d_forward_kernel(
     d_sino[iang, idet] = accum
-@_NON_FASTMATH_DECORATOR
+@_FASTMATH_DECORATOR
 def _parallel_2d_backward_kernel(
     d_sino, n_ang, n_det,
     d_image, Nx, Ny,
@@ -484,8 +496,8 @@ def _parallel_2d_backward_kernel(
 ):
     """Compute the 2D parallel beam backprojection.
-    This CUDA kernel implements the Siddon-Joseph algorithm for 2D parallel
-    beam backprojection.
+    This CUDA kernel implements the Siddon ray-tracing method with interpolation for
+    2D parallel beam backprojection.
     Parameters
     ----------
@@ -549,16 +561,18 @@ def _parallel_2d_backward_kernel(
     if t_min >= t_max: return
-    # === SIDDON-JOSEPH TRAVERSAL INITIALIZATION ===
+    # === SIDDON METHOD TRAVERSAL INITIALIZATION ===
     t = t_min
     ix = int(math.floor(pnt_x + t * dir_x + cx))
     iy = int(math.floor(pnt_y + t * dir_y + cy))
     step_x, step_y = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1)
-    dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF
-    dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF
-    tx = ((ix + (step_x > 0)) - cx - pnt_x) / dir_x if abs(dir_x) > _EPSILON else _INF
-    ty = ((iy + (step_y > 0)) - cy - pnt_y) / dir_y if abs(dir_y) > _EPSILON else _INF
+    inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
+    inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
+    dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF
+    dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF
+    tx = ((ix + (step_x > 0)) - cx - pnt_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
+    ty = ((iy + (step_y > 0)) - cy - pnt_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
     # === BACKPROJECTION TRAVERSAL LOOP ===
     # Distribute sinogram value along ray path using bilinear interpolation
@@ -568,8 +582,9 @@ def _parallel_2d_backward_kernel(
             seg_len = t_next - t
             if seg_len > _EPSILON:
                 # Sample at ray segment midpoint (same as forward projection)
-                mid_x = pnt_x + (t + seg_len * 0.5) * dir_x + cx
-                mid_y = pnt_y + (t + seg_len * 0.5) * dir_y + cy
+                t_mid = t + seg_len * 0.5
+                mid_x = pnt_x + t_mid * dir_x + cx
+                mid_y = pnt_y + t_mid * dir_y + cy
                 ix0, iy0 = int(math.floor(mid_x)), int(math.floor(mid_y))
                 dx, dy = mid_x - ix0, mid_y - iy0
@@ -585,10 +600,12 @@ def _parallel_2d_backward_kernel(
                 # Performance impact: Atomic operations are slower than regular writes but necessary for correctness
                 # Memory access pattern: Global memory atomics with potential write contention, but unavoidable
                 cval = val * seg_len  # Contribution value for this ray segment
-                cuda.atomic.add(d_image, (iy0,     ix0),     cval * (1 - dx) * (1 - dy))
-                cuda.atomic.add(d_image, (iy0,     ix0 + 1), cval * dx       * (1 - dy))
-                cuda.atomic.add(d_image, (iy0 + 1, ix0),     cval * (1 - dx) * dy)
-                cuda.atomic.add(d_image, (iy0 + 1, ix0 + 1), cval * dx       * dy)
+                one_minus_dx = 1.0 - dx
+                one_minus_dy = 1.0 - dy
+                cuda.atomic.add(d_image, (iy0,     ix0),     cval * one_minus_dx * one_minus_dy)
+                cuda.atomic.add(d_image, (iy0,     ix0 + 1), cval * dx          * one_minus_dy)
+                cuda.atomic.add(d_image, (iy0 + 1, ix0),     cval * one_minus_dx * dy)
+                cuda.atomic.add(d_image, (iy0 + 1, ix0 + 1), cval * dx          * dy)
         # Advance to next voxel (identical logic to forward projection)
         if tx <= ty:
@@ -613,8 +630,8 @@ def _fan_2d_forward_kernel(
 ):
     """Compute the 2D fan beam forward projection.
-    This CUDA kernel implements the Siddon-Joseph algorithm for 2D fan beam
-    forward projection.
+    This CUDA kernel implements the Siddon ray-tracing method with interpolation for
+    2D fan beam forward projection.
     Parameters
     ----------
@@ -704,7 +721,7 @@ def _fan_2d_forward_kernel(
     if t_min >= t_max:  # No valid intersection
         d_sino[iang, idet] = 0.0; return
-    # === SIDDON-JOSEPH TRAVERSAL (same algorithm as parallel beam) ===
+    # === SIDDON METHOD TRAVERSAL (same algorithm as parallel beam) ===
     accum = 0.0  # Accumulated projection value
     t = t_min    # Current ray parameter
@@ -714,10 +731,12 @@ def _fan_2d_forward_kernel(
     # Traversal parameters (identical to parallel beam implementation)
     step_x, step_y = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1)
-    dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF
-    dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF
-    tx = ((ix + (step_x > 0)) - cx - src_x) / dir_x if abs(dir_x) > _EPSILON else _INF
-    ty = ((iy + (step_y > 0)) - cy - src_y) / dir_y if abs(dir_y) > _EPSILON else _INF
+    inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
+    inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
+    dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF
+    dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF
+    tx = ((ix + (step_x > 0)) - cx - src_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
+    ty = ((iy + (step_y > 0)) - cy - src_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
     # Main traversal loop with bilinear interpolation (identical to parallel beam)
     while t < t_max:
@@ -726,8 +745,9 @@ def _fan_2d_forward_kernel(
             seg_len = t_next - t
             if seg_len > _EPSILON:
                 # Sample at midpoint using source as ray origin
-                mid_x = src_x + (t + seg_len * 0.5) * dir_x + cx
-                mid_y = src_y + (t + seg_len * 0.5) * dir_y + cy
+                t_mid = t + seg_len * 0.5
+                mid_x = src_x + t_mid * dir_x + cx
+                mid_y = src_y + t_mid * dir_y + cy
                 ix0, iy0 = int(math.floor(mid_x)), int(math.floor(mid_y))
                 dx, dy = mid_x - ix0, mid_y - iy0
@@ -756,7 +776,7 @@ def _fan_2d_forward_kernel(
     d_sino[iang, idet] = accum
-@_NON_FASTMATH_DECORATOR
+@_FASTMATH_DECORATOR
 def _fan_2d_backward_kernel(
     d_sino, n_ang, n_det,
     d_image, Nx, Ny,
@@ -765,8 +785,8 @@ def _fan_2d_backward_kernel(
 ):
     """Compute the 2D fan beam backprojection.
-    This CUDA kernel implements the Siddon-Joseph algorithm for 2D fan beam
-    backprojection.
+    This CUDA kernel implements the Siddon ray-tracing method with interpolation for
+    2D fan beam backprojection.
     Parameters
     ----------
@@ -851,16 +871,18 @@ def _fan_2d_backward_kernel(
     if t_min >= t_max: return
-    # === SIDDON-JOSEPH TRAVERSAL INITIALIZATION ===
+    # === SIDDON METHOD TRAVERSAL INITIALIZATION ===
     t = t_min
     ix = int(math.floor(src_x + t * dir_x + cx))
     iy = int(math.floor(src_y + t * dir_y + cy))
     step_x, step_y = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1)
-    dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF
-    dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF
-    tx = ((ix + (step_x > 0)) - cx - src_x) / dir_x if abs(dir_x) > _EPSILON else _INF
-    ty = ((iy + (step_y > 0)) - cy - src_y) / dir_y if abs(dir_y) > _EPSILON else _INF
+    inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
+    inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
+    dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF
+    dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF
+    tx = ((ix + (step_x > 0)) - cx - src_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
+    ty = ((iy + (step_y > 0)) - cy - src_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
     # === FAN BEAM BACKPROJECTION TRAVERSAL LOOP ===
     # Distribute sinogram value along divergent ray path using bilinear interpolation
@@ -870,8 +892,9 @@ def _fan_2d_backward_kernel(
             seg_len = t_next - t
             if seg_len > _EPSILON:
                 # Sample at ray segment midpoint using source as ray origin
-                mid_x = src_x + (t + seg_len * 0.5) * dir_x + cx
-                mid_y = src_y + (t + seg_len * 0.5) * dir_y + cy
+                t_mid = t + seg_len * 0.5
+                mid_x = src_x + t_mid * dir_x + cx
+                mid_y = src_y + t_mid * dir_y + cy
                 ix0, iy0 = int(math.floor(mid_x)), int(math.floor(mid_y))
                 dx, dy = mid_x - ix0, mid_y - iy0
@@ -886,10 +909,12 @@ def _fan_2d_backward_kernel(
                 # Atomic operations prevent race conditions when multiple divergent rays write to same voxel
                 # Performance consideration: Fan beam geometry may have more atomic contention than parallel beam
                 cval = val * seg_len  # Contribution value for this ray segment
-                cuda.atomic.add(d_image, (iy0,     ix0),     cval * (1 - dx) * (1 - dy))
-                cuda.atomic.add(d_image, (iy0,     ix0 + 1), cval * dx       * (1 - dy))
-                cuda.atomic.add(d_image, (iy0 + 1, ix0),     cval * (1 - dx) * dy)
-                cuda.atomic.add(d_image, (iy0 + 1, ix0 + 1), cval * dx       * dy)
+                one_minus_dx = 1.0 - dx
+                one_minus_dy = 1.0 - dy
+                cuda.atomic.add(d_image, (iy0,     ix0),     cval * one_minus_dx * one_minus_dy)
+                cuda.atomic.add(d_image, (iy0,     ix0 + 1), cval * dx          * one_minus_dy)
+                cuda.atomic.add(d_image, (iy0 + 1, ix0),     cval * one_minus_dx * dy)
+                cuda.atomic.add(d_image, (iy0 + 1, ix0 + 1), cval * dx          * dy)
         # === VOXEL BOUNDARY CROSSING LOGIC ===
         # Advance to next voxel based on which boundary is crossed first
@@ -915,8 +940,8 @@ def _cone_3d_forward_kernel(
 ):
     """Compute the 3D cone-beam forward projection.
-    This CUDA kernel implements the Siddon-Joseph algorithm for 3D cone-beam
-    forward projection.
+    This CUDA kernel implements the Siddon ray-tracing method with interpolation for
+    3D cone-beam forward projection.
     Parameters
     ----------
@@ -1025,7 +1050,7 @@ def _cone_3d_forward_kernel(
     if t_min >= t_max:  # No valid 3D intersection
         d_sino[iview, iu, iv] = 0.0; return
-    # === 3D SIDDON-JOSEPH TRAVERSAL INITIALIZATION ===
+    # === 3D SIDDON METHOD TRAVERSAL INITIALIZATION ===
     accum = 0.0  # Accumulated projection value
     t = t_min    # Current ray parameter
@@ -1036,14 +1061,17 @@ def _cone_3d_forward_kernel(
     # 3D traversal parameters (extends 2D algorithm)
     step_x, step_y, step_z = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1), (1 if dir_z >= 0 else -1)
-    dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF  # Parameter increment per x-voxel
-    dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF  # Parameter increment per y-voxel
-    dt_z = abs(1.0 / dir_z) if abs(dir_z) > _EPSILON else _INF  # Parameter increment per z-voxel
+    inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
+    inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
+    inv_dir_z = (1.0 / dir_z) if abs(dir_z) > _EPSILON else 0.0
+    dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF  # Parameter increment per x-voxel
+    dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF  # Parameter increment per y-voxel
+    dt_z = abs(inv_dir_z) if abs(dir_z) > _EPSILON else _INF  # Parameter increment per z-voxel
     # Calculate parameter values for next 3D voxel boundary crossings
-    tx = ((ix + (step_x > 0)) - cx - src_x) / dir_x if abs(dir_x) > _EPSILON else _INF
-    ty = ((iy + (step_y > 0)) - cy - src_y) / dir_y if abs(dir_y) > _EPSILON else _INF
-    tz = ((iz + (step_z > 0)) - cz - src_z) / dir_z if abs(dir_z) > _EPSILON else _INF
+    tx = ((ix + (step_x > 0)) - cx - src_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
+    ty = ((iy + (step_y > 0)) - cy - src_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
+    tz = ((iz + (step_z > 0)) - cz - src_z) * inv_dir_z if abs(dir_z) > _EPSILON else _INF
     # === 3D TRAVERSAL LOOP WITH TRILINEAR INTERPOLATION ===
     while t < t_max:
@@ -1056,34 +1084,35 @@ def _cone_3d_forward_kernel(
                 # === TRILINEAR INTERPOLATION SAMPLING ===
                 # Sample 3D volume at ray segment midpoint for accurate integration
                 # Mathematical basis: Midpoint rule for numerical integration along 3D ray segments
-                mid_x = src_x + (t + seg_len * 0.5) * dir_x + cx  # Midpoint x-coordinate in volume space
-                mid_y = src_y + (t + seg_len * 0.5) * dir_y + cy  # Midpoint y-coordinate in volume space
-                mid_z = src_z + (t + seg_len * 0.5) * dir_z + cz  # Midpoint z-coordinate in volume space
+                t_mid = t + seg_len * 0.5
+                mid_x = src_x + t_mid * dir_x + cx  # Midpoint x-coordinate in volume space
+                mid_y = src_y + t_mid * dir_y + cy  # Midpoint y-coordinate in volume space
+                mid_z = src_z + t_mid * dir_z + cz  # Midpoint z-coordinate in volume space
                 # Convert continuous 3D coordinates to discrete voxel indices and fractional weights
-                # Floor operation gives base voxel index, fractional part gives interpolation weights
-                ix0, iy0, iz0 = int(math.floor(mid_x)), int(math.floor(mid_y)), int(math.floor(mid_z))  # Base voxel indices (corner 0,0,0)
-                dx, dy, dz = mid_x - ix0, mid_y - iy0, mid_z - iz0  # Fractional parts: distance from base voxel center [0,1]
+                ix0, iy0, iz0 = int(math.floor(mid_x)), int(math.floor(mid_y)), int(math.floor(mid_z))
+                dx, dy, dz = mid_x - ix0, mid_y - iy0, mid_z - iz0
                 # Clamp indices to stay in-bounds during interpolation
                 ix0 = max(0, min(ix0, Nx - 2))
                 iy0 = max(0, min(iy0, Ny - 2))
                 iz0 = max(0, min(iz0, Nz - 2))
+                # Precompute complements
+                omdx = 1.0 - dx
+                omdy = 1.0 - dy
+                omdz = 1.0 - dz
                 # === TRILINEAR INTERPOLATION WEIGHT CALCULATION ===
-                # Mathematical basis: Trilinear interpolation formula f(x,y,z) = Σ f(xi,yi,zi) * wi(x,y,z)
-                # where wi(x,y,z) are the trilinear basis functions for each corner voxel of the 3D cube
-                # Weights are products of 1D linear interpolation weights: (1-dx) or dx, (1-dy) or dy, (1-dz) or dz
-                # Each of the 8 cube corners gets a weight proportional to its distance from the sample point
                 val = (
-                    d_vol[ix0,     iy0,     iz0]     * (1-dx)*(1-dy)*(1-dz) +  # Corner (0,0,0): weight = product of distances from opposite faces
-                    d_vol[ix0 + 1, iy0,     iz0]     * dx*(1-dy)*(1-dz) +     # Corner (1,0,0): weight = dx * (1-dy) * (1-dz)
-                    d_vol[ix0,     iy0 + 1, iz0]     * (1-dx)*dy*(1-dz) +     # Corner (0,1,0): weight = (1-dx) * dy * (1-dz)
-                    d_vol[ix0,     iy0,     iz0 + 1] * (1-dx)*(1-dy)*dz +     # Corner (0,0,1): weight = (1-dx) * (1-dy) * dz
-                    d_vol[ix0 + 1, iy0 + 1, iz0]     * dx*dy*(1-dz) +         # Corner (1,1,0): weight = dx * dy * (1-dz)
-                    d_vol[ix0 + 1, iy0,     iz0 + 1] * dx*(1-dy)*dz +         # Corner (1,0,1): weight = dx * (1-dy) * dz
-                    d_vol[ix0,     iy0 + 1, iz0 + 1] * (1-dx)*dy*dz +         # Corner (0,1,1): weight = (1-dx) * dy * dz
-                    d_vol[ix0 + 1, iy0 + 1, iz0 + 1] * dx*dy*dz               # Corner (1,1,1): weight = dx * dy * dz
+                    d_vol[ix0,     iy0,     iz0]     * omdx*omdy*omdz +
+                    d_vol[ix0 + 1, iy0,     iz0]     * dx  *omdy*omdz +
+                    d_vol[ix0,     iy0 + 1, iz0]     * omdx*dy  *omdz +
+                    d_vol[ix0,     iy0,     iz0 + 1] * omdx*omdy*dz   +
+                    d_vol[ix0 + 1, iy0 + 1, iz0]     * dx  *dy  *omdz +
+                    d_vol[ix0 + 1, iy0,     iz0 + 1] * dx  *omdy*dz   +
+                    d_vol[ix0,     iy0 + 1, iz0 + 1] * omdx*dy  *dz   +
+                    d_vol[ix0 + 1, iy0 + 1, iz0 + 1] * dx  *dy  *dz
                 )
                 # Accumulate contribution weighted by 3D ray segment length (discrete line integral approximation)
                 # This implements the 3D Radon transform: integral of f(x,y,z) along the ray path
@@ -1106,7 +1135,7 @@ def _cone_3d_forward_kernel(
     d_sino[iview, iu, iv] = accum
-@_NON_FASTMATH_DECORATOR
+@_FASTMATH_DECORATOR
 def _cone_3d_backward_kernel(
     d_sino, n_views, n_u, n_v,
     d_vol, Nx, Ny, Nz,
@@ -1115,8 +1144,8 @@ def _cone_3d_backward_kernel(
 ):
     """Compute the 3D cone-beam backprojection.
-    This CUDA kernel implements the Siddon-Joseph algorithm for 3D cone-beam
-    backprojection.
+    This CUDA kernel implements the Siddon ray-tracing method with interpolation for
+    3D cone-beam backprojection.
     Parameters
     ----------
@@ -1219,7 +1248,7 @@ def _cone_3d_backward_kernel(
     if t_min >= t_max: return
-    # === 3D SIDDON-JOSEPH TRAVERSAL INITIALIZATION ===
+    # === 3D SIDDON METHOD TRAVERSAL INITIALIZATION ===
     t = t_min
     ix = int(math.floor(src_x + t * dir_x + cx))  # Current voxel x-index
     iy = int(math.floor(src_y + t * dir_y + cy))  # Current voxel y-index
@@ -1227,14 +1256,17 @@ def _cone_3d_backward_kernel(
     # 3D traversal parameters (extends 2D algorithm)
     step_x, step_y, step_z = (1 if dir_x >= 0 else -1), (1 if dir_y >= 0 else -1), (1 if dir_z >= 0 else -1)
-    dt_x = abs(1.0 / dir_x) if abs(dir_x) > _EPSILON else _INF  # Parameter increment per x-voxel
-    dt_y = abs(1.0 / dir_y) if abs(dir_y) > _EPSILON else _INF  # Parameter increment per y-voxel
-    dt_z = abs(1.0 / dir_z) if abs(dir_z) > _EPSILON else _INF  # Parameter increment per z-voxel
+    inv_dir_x = (1.0 / dir_x) if abs(dir_x) > _EPSILON else 0.0
+    inv_dir_y = (1.0 / dir_y) if abs(dir_y) > _EPSILON else 0.0
+    inv_dir_z = (1.0 / dir_z) if abs(dir_z) > _EPSILON else 0.0
+    dt_x = abs(inv_dir_x) if abs(dir_x) > _EPSILON else _INF  # Parameter increment per x-voxel
+    dt_y = abs(inv_dir_y) if abs(dir_y) > _EPSILON else _INF  # Parameter increment per y-voxel
+    dt_z = abs(inv_dir_z) if abs(dir_z) > _EPSILON else _INF  # Parameter increment per z-voxel
     # Calculate parameter values for next 3D voxel boundary crossings
-    tx = ((ix + (step_x > 0)) - cx - src_x) / dir_x if abs(dir_x) > _EPSILON else _INF
-    ty = ((iy + (step_y > 0)) - cy - src_y) / dir_y if abs(dir_y) > _EPSILON else _INF
-    tz = ((iz + (step_z > 0)) - cz - src_z) / dir_z if abs(dir_z) > _EPSILON else _INF
+    tx = ((ix + (step_x > 0)) - cx - src_x) * inv_dir_x if abs(dir_x) > _EPSILON else _INF
+    ty = ((iy + (step_y > 0)) - cy - src_y) * inv_dir_y if abs(dir_y) > _EPSILON else _INF
+    tz = ((iz + (step_z > 0)) - cz - src_z) * inv_dir_z if abs(dir_z) > _EPSILON else _INF
     # === 3D CONE BEAM BACKPROJECTION TRAVERSAL LOOP ===
     # Distribute sinogram value along divergent 3D ray path using trilinear interpolation
@@ -1247,35 +1279,35 @@ def _cone_3d_backward_kernel(
             if seg_len > _EPSILON:
                 # === TRILINEAR INTERPOLATION SAMPLING ===
                 # Sample 3D volume at ray segment midpoint using source as ray origin
-                mid_x = src_x + (t + seg_len * 0.5) * dir_x + cx  # Midpoint x-coordinate
-                mid_y = src_y + (t + seg_len * 0.5) * dir_y + cy  # Midpoint y-coordinate
-                mid_z = src_z + (t + seg_len * 0.5) * dir_z + cz  # Midpoint z-coordinate
+                t_mid = t + seg_len * 0.5
+                mid_x = src_x + t_mid * dir_x + cx
+                mid_y = src_y + t_mid * dir_y + cy
+                mid_z = src_z + t_mid * dir_z + cz
                 # Convert continuous 3D coordinates to voxel indices and interpolation weights
                 ix0, iy0, iz0 = int(math.floor(mid_x)), int(math.floor(mid_y)), int(math.floor(mid_z))
-                dx, dy, dz = mid_x - ix0, mid_y - iy0, mid_z - iz0  # Fractional parts for 3D weights
+                dx, dy, dz = mid_x - ix0, mid_y - iy0, mid_z - iz0
                 # Clamp indices to stay in-bounds during interpolation
                 ix0 = max(0, min(ix0, Nx - 2))
                 iy0 = max(0, min(iy0, Ny - 2))
                 iz0 = max(0, min(iz0, Nz - 2))
+                # Precompute complements and contribution
+                omdx = 1.0 - dx
+                omdy = 1.0 - dy
+                omdz = 1.0 - dz
+                cval = g * seg_len
                 # === ATOMIC BACKPROJECTION WITH TRILINEAR WEIGHTS ===
-                # Distribute contribution weighted by segment length and interpolation weights
-                # CUDA 3D ATOMIC OPERATIONS: Most complex atomic pattern in cone beam backprojection
-                # 8 atomic writes per ray segment (one per cube corner) increases memory contention significantly
-                # Cone beam geometry creates maximum ray convergence, highest probability of write conflicts
-                # Performance impact: 3D atomics are most expensive due to volume of concurrent writes
-                # Memory bandwidth: 8 atomic operations per interpolation point can saturate memory subsystem
-                cval = g * seg_len  # Contribution value for this ray segment
-                cuda.atomic.add(d_vol, (ix0,     iy0,     iz0),     cval * (1-dx)*(1-dy)*(1-dz))  # Corner (0,0,0) - atomic write
-                cuda.atomic.add(d_vol, (ix0 + 1, iy0,     iz0),     cval * dx*(1-dy)*(1-dz))      # Corner (1,0,0) - atomic write
-                cuda.atomic.add(d_vol, (ix0,     iy0 + 1, iz0),     cval * (1-dx)*dy*(1-dz))      # Corner (0,1,0) - atomic write
-                cuda.atomic.add(d_vol, (ix0,     iy0,     iz0 + 1), cval * (1-dx)*(1-dy)*dz)      # Corner (0,0,1) - atomic write
-                cuda.atomic.add(d_vol, (ix0 + 1, iy0 + 1, iz0),     cval * dx*dy*(1-dz))          # Corner (1,1,0) - atomic write
-                cuda.atomic.add(d_vol, (ix0 + 1, iy0,     iz0 + 1), cval * dx*(1-dy)*dz)          # Corner (1,0,1) - atomic write
-                cuda.atomic.add(d_vol, (ix0,     iy0 + 1, iz0 + 1), cval * (1-dx)*dy*dz)          # Corner (0,1,1) - atomic write
-                cuda.atomic.add(d_vol, (ix0 + 1, iy0 + 1, iz0 + 1), cval * dx*dy*dz)              # Corner (1,1,1) - atomic write
+                cuda.atomic.add(d_vol, (ix0,     iy0,     iz0),     cval * omdx*omdy*omdz)
+                cuda.atomic.add(d_vol, (ix0 + 1, iy0,     iz0),     cval * dx  *omdy*omdz)
+                cuda.atomic.add(d_vol, (ix0,     iy0 + 1, iz0),     cval * omdx*dy  *omdz)
+                cuda.atomic.add(d_vol, (ix0,     iy0,     iz0 + 1), cval * omdx*omdy*dz)
+                cuda.atomic.add(d_vol, (ix0 + 1, iy0 + 1, iz0),     cval * dx  *dy  *omdz)
+                cuda.atomic.add(d_vol, (ix0 + 1, iy0,     iz0 + 1), cval * dx  *omdy*dz)
+                cuda.atomic.add(d_vol, (ix0,     iy0 + 1, iz0 + 1), cval * omdx*dy  *dz)
+                cuda.atomic.add(d_vol, (ix0 + 1, iy0 + 1, iz0 + 1), cval * dx  *dy  *dz)
         # === 3D VOXEL BOUNDARY CROSSING LOGIC ===
         # Advance to next voxel based on which boundary is crossed first in 3D
@@ -1305,8 +1337,8 @@ class ParallelProjectorFunction(torch.autograd.Function):
     Notes
     -----
-    Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph
-    ray-tracing algorithm for parallel beam CT geometry. The forward pass computes
+    Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
+    method with interpolation for parallel beam CT geometry. The forward pass computes
     the sinogram from a 2D image using parallel beam geometry. The backward pass
     computes gradients using the adjoint backprojection operation. Requires
     CUDA-capable hardware and a properly configured CUDA environment; all input
@@ -1358,7 +1390,7 @@ class ParallelProjectorFunction(torch.autograd.Function):
         -----
         - All input tensors must be on the same CUDA device.
         - The operation is fully differentiable and supports autograd.
-        - Uses the Siddon-Joseph algorithm for accurate ray tracing and bilinear interpolation.
+        - Uses the Siddon method for accurate ray tracing, combined with bilinear interpolation.
         Examples
         --------
@@ -1394,11 +1426,12 @@ class ParallelProjectorFunction(torch.autograd.Function):
         grid, tpb = _grid_2d(n_angles, num_detectors)
         cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
-        _parallel_2d_forward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _parallel_2d_forward_kernel[grid, tpb, numba_stream](
             d_image, Nx, Ny, d_sino, n_angles, num_detectors,
             _DTYPE(detector_spacing), d_cos_arr, d_sin_arr, cx, cy, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         ctx.save_for_backward(angles)
         ctx.intermediate = (num_detectors, detector_spacing, Ny, Nx, voxel_spacing)
@@ -1427,12 +1460,13 @@ class ParallelProjectorFunction(torch.autograd.Function):
         grid, tpb = _grid_2d(n_angles, num_detectors)
         cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
-        _parallel_2d_backward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _parallel_2d_backward_kernel[grid, tpb, numba_stream](
             d_grad_sino, n_angles, num_detectors,
             d_img_grad, Nx, Ny,
             _DTYPE(detector_spacing), d_cos_arr, d_sin_arr, cx, cy, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         return grad_image, None, None, None, None
@@ -1445,8 +1479,8 @@ class ParallelBackprojectorFunction(torch.autograd.Function):
     Notes
     -----
-    Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph ray-tracing
-    algorithm for parallel beam backprojection. The forward pass computes a 2D
+    Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
+    method with interpolation for parallel beam backprojection. The forward pass computes a 2D
     reconstruction from sinogram data using parallel beam backprojection, and the
     backward pass computes gradients via forward projection as the adjoint operation.
     Requires CUDA-capable hardware and consistent device placements.
@@ -1493,7 +1527,7 @@ class ParallelBackprojectorFunction(torch.autograd.Function):
         -----
         - All input tensors must be on the same CUDA device.
         - The operation is fully differentiable and supports autograd.
-        - Uses the Siddon-Joseph algorithm for accurate ray tracing and bilinear interpolation.
+        - Uses the Siddon method for accurate ray tracing, combined with bilinear interpolation.
         Examples
         --------
@@ -1529,11 +1563,12 @@ class ParallelBackprojectorFunction(torch.autograd.Function):
         grid, tpb = _grid_2d(n_ang, n_det)
         cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
-        _parallel_2d_backward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _parallel_2d_backward_kernel[grid, tpb, numba_stream](
             d_sino, n_ang, n_det, d_reco, Nx, Ny,
             _DTYPE(detector_spacing), d_cos_arr, d_sin_arr, cx, cy, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         ctx.save_for_backward(angles)
         ctx.intermediate = (H, W, detector_spacing, sinogram.shape[0], sinogram.shape[1], voxel_spacing)
@@ -1567,11 +1602,12 @@ class ParallelBackprojectorFunction(torch.autograd.Function):
         grid, tpb = _grid_2d(n_ang, n_det)
         cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
-        _parallel_2d_forward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _parallel_2d_forward_kernel[grid, tpb, numba_stream](
             d_grad_out, Nx, Ny, d_sino_grad, n_ang, n_det,
             _DTYPE(detector_spacing), d_cos, d_sin, cx, cy, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         return grad_sino, None, None, None, None, None
@@ -1584,8 +1620,8 @@ class FanProjectorFunction(torch.autograd.Function):
     Notes
     -----
-    Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph
-    ray-tracing algorithm for fan beam geometry, where rays diverge from a point
+    Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
+    method with interpolation for fan beam geometry, where rays diverge from a point
     X-ray source to a linear detector array. The forward pass computes sinograms
     using divergent beam geometry, and the backward pass computes gradients via
     adjoint backprojection.
@@ -1637,7 +1673,7 @@ class FanProjectorFunction(torch.autograd.Function):
         - All input tensors must be on the same CUDA device.
         - The operation is fully differentiable and supports autograd.
         - Fan beam geometry uses divergent rays from a point source to the detector.
-        - Uses the Siddon-Joseph algorithm for accurate ray tracing and bilinear interpolation.
+        - Uses the Siddon method for accurate ray tracing, combined with bilinear interpolation.
         Examples
         --------
@@ -1668,12 +1704,13 @@ class FanProjectorFunction(torch.autograd.Function):
         grid, tpb = _grid_2d(n_ang, num_detectors)
         cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
-        _fan_2d_forward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _fan_2d_forward_kernel[grid, tpb, numba_stream](
             d_image, Nx, Ny, d_sino, n_ang, num_detectors,
             _DTYPE(detector_spacing), d_cos_arr, d_sin_arr,
             _DTYPE(sdd), _DTYPE(sid), cx, cy, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         ctx.save_for_backward(angles)
         ctx.intermediate = (num_detectors, detector_spacing, Ny, Nx,
@@ -1703,12 +1740,13 @@ class FanProjectorFunction(torch.autograd.Function):
         grid, tpb = _grid_2d(n_ang, n_det)
         cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
-        _fan_2d_backward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _fan_2d_backward_kernel[grid, tpb, numba_stream](
             d_grad_sino, n_ang, n_det, d_img_grad, Nx, Ny,
             _DTYPE(det_spacing), d_cos_arr, d_sin_arr,
             _DTYPE(sdd), _DTYPE(sid), cx, cy, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         return grad_img, None, None, None, None, None, None
@@ -1721,8 +1759,8 @@ class FanBackprojectorFunction(torch.autograd.Function):
     Notes
     -----
-    Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph
-    ray-tracing algorithm for fan beam backprojection. Implements the adjoint
+    Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
+    method with interpolation for fan beam backprojection. Implements the adjoint
     of the fan beam projection operator, distributing sinogram values back into
     the reconstruction volume along divergent ray paths. The forward pass
     computes reconstruction from sinogram data, and the backward pass computes
@@ -1777,7 +1815,7 @@ class FanBackprojectorFunction(torch.autograd.Function):
         - All input tensors must be on the same CUDA device.
         - The operation is fully differentiable and supports autograd.
         - Fan beam geometry uses divergent rays from a point source to the detector.
-        - Uses the Siddon-Joseph algorithm for accurate ray tracing and bilinear interpolation.
+        - Uses the Siddon method for accurate ray tracing, combined with bilinear interpolation.
         Examples
         --------
@@ -1808,12 +1846,13 @@ class FanBackprojectorFunction(torch.autograd.Function):
         grid, tpb = _grid_2d(n_ang, n_det)
         cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
-        _fan_2d_backward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _fan_2d_backward_kernel[grid, tpb, numba_stream](
             d_sino, n_ang, n_det, d_reco, Nx, Ny,
             _DTYPE(detector_spacing), d_cos_arr, d_sin_arr,
             _DTYPE(sdd), _DTYPE(sid), cx, cy, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         ctx.save_for_backward(angles)
         ctx.intermediate = (H, W, detector_spacing, n_ang, n_det, sdd, sid, voxel_spacing)
@@ -1843,12 +1882,13 @@ class FanBackprojectorFunction(torch.autograd.Function):
         grid, tpb = _grid_2d(n_ang, n_det)
         cx, cy = _DTYPE(Nx * 0.5), _DTYPE(Ny * 0.5)
-        _fan_2d_forward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _fan_2d_forward_kernel[grid, tpb, numba_stream](
             d_grad_out, Nx, Ny, d_sino_grad, n_ang, n_det,
             _DTYPE(det_spacing), d_cos_arr, d_sin_arr,
             _DTYPE(sdd), _DTYPE(sid), cx, cy, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         return grad_sino, None, None, None, None, None, None, None
@@ -1861,8 +1901,8 @@ class ConeProjectorFunction(torch.autograd.Function):
     Notes
     -----
-    Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph
-    ray-tracing algorithm for 3D cone beam geometry. Rays emanate from a point
+    Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
+    method with interpolation for 3D cone beam geometry. Rays emanate from a point
     X-ray source to a 2D detector array capturing volumetric projection data.
     The forward pass computes 3D projections, and the backward pass computes
     gradients via adjoint 3D backprojection. Requires significant GPU memory.
@@ -1918,7 +1958,7 @@ class ConeProjectorFunction(torch.autograd.Function):
         - All input tensors must be on the same CUDA device.
         - The operation is fully differentiable and supports autograd.
         - Cone beam geometry uses a point source and a 2D detector array.
-        - Uses the Siddon-Joseph algorithm for accurate 3D ray tracing and trilinear interpolation.
+        - Uses the Siddon method for accurate 3D ray tracing, combined with trilinear interpolation.
         Examples
         --------
@@ -1953,13 +1993,14 @@ class ConeProjectorFunction(torch.autograd.Function):
         grid, tpb = _grid_3d(n_views, det_u, det_v)
         cx, cy, cz = _DTYPE(W * 0.5), _DTYPE(H * 0.5), _DTYPE(D * 0.5)
-        _cone_3d_forward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _cone_3d_forward_kernel[grid, tpb, numba_stream](
             d_vol, W, H, D, d_sino, n_views, det_u, det_v,
             _DTYPE(du), _DTYPE(dv), d_cos_arr, d_sin_arr,
             _DTYPE(sdd), _DTYPE(sid),
             cx, cy, cz, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         ctx.save_for_backward(angles)
         ctx.intermediate = (D, H, W, det_u, det_v, du, dv,
@@ -1991,12 +2032,13 @@ class ConeProjectorFunction(torch.autograd.Function):
         grid, tpb = _grid_3d(n_views, det_u, det_v)
         cx, cy, cz = _DTYPE(W * 0.5), _DTYPE(H * 0.5), _DTYPE(D * 0.5)
-        _cone_3d_backward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _cone_3d_backward_kernel[grid, tpb, numba_stream](
             d_grad_sino, n_views, det_u, det_v, d_vol_grad, W, H, D,
             _DTYPE(du), _DTYPE(dv), d_cos_arr, d_sin_arr,
             _DTYPE(sdd), _DTYPE(sid), cx, cy, cz, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         grad_vol = grad_vol_perm.permute(2, 1, 0).contiguous()
         return grad_vol, None, None, None, None, None, None, None, None
@@ -2010,8 +2052,8 @@ class ConeBackprojectorFunction(torch.autograd.Function):
     Notes
     -----
-    Provides a differentiable interface to the CUDA-accelerated Siddon-Joseph
-    ray-tracing algorithm for 3D cone beam backprojection. The forward pass
+    Provides a differentiable interface to the CUDA-accelerated Siddon ray-tracing
+    method with interpolation for 3D cone beam backprojection. The forward pass
     computes a 3D reconstruction from cone beam projection data using
     backprojection as the adjoint operation. The backward pass computes gradients
     via 3D cone beam forward projection. Requires CUDA-capable hardware and
@@ -2078,7 +2120,7 @@ class ConeBackprojectorFunction(torch.autograd.Function):
         - All input tensors must be on the same CUDA device.
         - The operation is fully differentiable and supports autograd.
         - Cone beam geometry uses a point source and a 2D detector array.
-        - Uses the Siddon-Joseph algorithm for accurate 3D ray tracing and trilinear interpolation.
+        - Uses the Siddon method for accurate 3D ray tracing, combined with trilinear interpolation.
         Examples
         --------
@@ -2111,12 +2153,13 @@ class ConeBackprojectorFunction(torch.autograd.Function):
         grid, tpb = _grid_3d(n_views, n_u, n_v)
         cx, cy, cz = _DTYPE(W * 0.5), _DTYPE(H * 0.5), _DTYPE(D * 0.5)
-        _cone_3d_backward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _cone_3d_backward_kernel[grid, tpb, numba_stream](
             d_sino, n_views, n_u, n_v, d_reco, W, H, D,
             _DTYPE(du), _DTYPE(dv), d_cos_arr, d_sin_arr,
             _DTYPE(sdd), _DTYPE(sid), cx, cy, cz, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         ctx.save_for_backward(angles)
         ctx.intermediate = (D, H, W, n_u, n_v, du, dv,
@@ -2150,11 +2193,12 @@ class ConeBackprojectorFunction(torch.autograd.Function):
         grid, tpb = _grid_3d(n_views, n_u, n_v)
         cx, cy, cz = _DTYPE(W * 0.5), _DTYPE(H * 0.5), _DTYPE(D * 0.5)
-        _cone_3d_forward_kernel[grid, tpb](
+        pt_stream = torch.cuda.current_stream()
+        numba_stream = cuda.external_stream(pt_stream.cuda_stream)
+        _cone_3d_forward_kernel[grid, tpb, numba_stream](
             d_grad_out, W, H, D, d_sino_grad, n_views, n_u, n_v,
             _DTYPE(du), _DTYPE(dv), d_cos_arr, d_sin_arr,
             _DTYPE(sdd), _DTYPE(sid), cx, cy, cz, _DTYPE(voxel_spacing)
         )
-        torch.cuda.synchronize()
         return grad_sino, None, None, None, None, None, None, None, None, None

{diffct-1.2.6 → diffct-1.2.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "diffct"
-version = "1.2.6"
+version = "1.2.7"
 description = "A CUDA-based library for computed tomography (CT) projection and reconstruction with differentiable operators"
 readme = "README.md"
 authors = [
@@ -36,5 +36,4 @@ where = ["."]
 [tool.hatch.envs.default]
 python = "python"
-[tool.hatch.envs.default.env-vars]
-PYTHONDONTWRITEBYTECODE = "1"
+[tool.hatch.envs.default.env-vars]