cuslines 2.0.0__tar.gz → 2.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. cuslines-2.2.1/.github/workflows/dockerbuild.yml +37 -0
  2. cuslines-2.2.1/.github/workflows/publish_pypi.yml +33 -0
  3. cuslines-2.2.1/.gitignore +15 -0
  4. cuslines-2.2.1/.pre-commit-config.yaml +18 -0
  5. cuslines-2.2.1/CLAUDE.md +121 -0
  6. cuslines-2.2.1/Dockerfile +19 -0
  7. {cuslines-2.0.0/cuslines.egg-info → cuslines-2.2.1}/PKG-INFO +46 -12
  8. cuslines-2.0.0/PKG-INFO → cuslines-2.2.1/README.md +23 -25
  9. cuslines-2.2.1/cuslines/__init__.py +88 -0
  10. cuslines-2.2.1/cuslines/boot_utils.py +94 -0
  11. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/boot.cu +55 -141
  12. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/generate_streamlines_cuda.cu +55 -128
  13. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/globals.h +5 -17
  14. cuslines-2.2.1/cuslines/cuda_c/ptt.cu +474 -0
  15. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/ptt.cuh +4 -4
  16. cuslines-2.2.1/cuslines/cuda_c/ptt_init.cu +63 -0
  17. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/tracking_helpers.cu +48 -93
  18. cuslines-2.2.1/cuslines/cuda_python/_globals.py +8 -0
  19. cuslines-2.2.1/cuslines/cuda_python/cu_direction_getters.py +648 -0
  20. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_python/cu_propagate_seeds.py +28 -25
  21. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_python/cu_tractography.py +135 -150
  22. cuslines-2.2.1/cuslines/cuda_python/cutils.py +128 -0
  23. cuslines-2.2.1/cuslines/generic_tracker.py +119 -0
  24. cuslines-2.2.1/cuslines/metal/README.md +127 -0
  25. cuslines-2.2.1/cuslines/metal/__init__.py +13 -0
  26. cuslines-2.2.1/cuslines/metal/mt_direction_getters.py +465 -0
  27. cuslines-2.2.1/cuslines/metal/mt_propagate_seeds.py +201 -0
  28. cuslines-2.2.1/cuslines/metal/mt_tractography.py +170 -0
  29. cuslines-2.2.1/cuslines/metal/mutils.py +142 -0
  30. cuslines-2.2.1/cuslines/metal_shaders/boot.metal +869 -0
  31. cuslines-2.2.1/cuslines/metal_shaders/disc.h +1890 -0
  32. cuslines-2.2.1/cuslines/metal_shaders/generate_streamlines_metal.metal +400 -0
  33. cuslines-2.2.1/cuslines/metal_shaders/globals.h +67 -0
  34. cuslines-2.2.1/cuslines/metal_shaders/philox_rng.h +152 -0
  35. cuslines-2.2.1/cuslines/metal_shaders/ptt.metal +1061 -0
  36. cuslines-2.2.1/cuslines/metal_shaders/tracking_helpers.metal +221 -0
  37. cuslines-2.2.1/cuslines/metal_shaders/types.h +50 -0
  38. cuslines-2.2.1/cuslines/metal_shaders/utils.metal +107 -0
  39. cuslines-2.2.1/cuslines/metal_shaders/warp_sort.metal +109 -0
  40. cuslines-2.2.1/cuslines/numba/__init__.py +13 -0
  41. cuslines-2.2.1/cuslines/numba/nu_globals.py +19 -0
  42. cuslines-2.2.1/cuslines/numba/nu_tractography.py +241 -0
  43. cuslines-2.2.1/cuslines/numba_njit/generate_streamlines_numba.py +284 -0
  44. cuslines-2.2.1/cuslines/numba_njit/num_streamlines_numba.py +170 -0
  45. cuslines-2.2.1/cuslines/numba_njit/tracking_helpers.py +62 -0
  46. cuslines-2.2.1/cuslines/webgpu/README.md +147 -0
  47. cuslines-2.2.1/cuslines/webgpu/__init__.py +19 -0
  48. cuslines-2.2.1/cuslines/webgpu/benchmark.py +486 -0
  49. cuslines-2.2.1/cuslines/webgpu/wg_direction_getters.py +478 -0
  50. cuslines-2.2.1/cuslines/webgpu/wg_propagate_seeds.py +205 -0
  51. cuslines-2.2.1/cuslines/webgpu/wg_tractography.py +203 -0
  52. cuslines-2.2.1/cuslines/webgpu/wgutils.py +127 -0
  53. cuslines-2.2.1/cuslines/wgsl_shaders/boot.wgsl +843 -0
  54. cuslines-2.2.1/cuslines/wgsl_shaders/disc.wgsl +74 -0
  55. cuslines-2.2.1/cuslines/wgsl_shaders/generate_streamlines.wgsl +418 -0
  56. cuslines-2.2.1/cuslines/wgsl_shaders/globals.wgsl +38 -0
  57. cuslines-2.2.1/cuslines/wgsl_shaders/philox_rng.wgsl +189 -0
  58. cuslines-2.2.1/cuslines/wgsl_shaders/ptt.wgsl +1153 -0
  59. cuslines-2.2.1/cuslines/wgsl_shaders/tracking_helpers.wgsl +261 -0
  60. cuslines-2.2.1/cuslines/wgsl_shaders/types.wgsl +19 -0
  61. cuslines-2.2.1/cuslines/wgsl_shaders/utils.wgsl +78 -0
  62. cuslines-2.2.1/cuslines/wgsl_shaders/warp_sort.wgsl +75 -0
  63. cuslines-2.2.1/cuslines.egg-info/PKG-INFO +124 -0
  64. cuslines-2.2.1/cuslines.egg-info/SOURCES.txt +74 -0
  65. cuslines-2.2.1/cuslines.egg-info/requires.txt +31 -0
  66. cuslines-2.2.1/pyproject.toml +88 -0
  67. cuslines-2.2.1/run_gpu_streamlines.py +462 -0
  68. cuslines-2.0.0/README.md +0 -73
  69. cuslines-2.0.0/cuslines/__init__.py +0 -13
  70. cuslines-2.0.0/cuslines/cuda_c/ptt.cu +0 -559
  71. cuslines-2.0.0/cuslines/cuda_python/_globals.py +0 -10
  72. cuslines-2.0.0/cuslines/cuda_python/cu_direction_getters.py +0 -472
  73. cuslines-2.0.0/cuslines/cuda_python/cutils.py +0 -64
  74. cuslines-2.0.0/cuslines.egg-info/SOURCES.txt +0 -26
  75. cuslines-2.0.0/cuslines.egg-info/requires.txt +0 -8
  76. cuslines-2.0.0/pyproject.toml +0 -24
  77. cuslines-2.0.0/setup.py +0 -50
  78. {cuslines-2.0.0 → cuslines-2.2.1}/LICENSE +0 -0
  79. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/cudamacro.h +0 -0
  80. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/cuwsort.cuh +0 -0
  81. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/disc.h +0 -0
  82. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/utils.cu +0 -0
  83. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_python/__init__.py +2 -2
  84. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines.egg-info/dependency_links.txt +0 -0
  85. {cuslines-2.0.0 → cuslines-2.2.1}/cuslines.egg-info/top_level.txt +0 -0
  86. {cuslines-2.0.0 → cuslines-2.2.1}/setup.cfg +0 -0
@@ -0,0 +1,37 @@
1
+ name: Build and Push Docker Image
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - 'master'
7
+ pull_request:
8
+ branches:
9
+ - 'master'
10
+
11
+ jobs:
12
+ build:
13
+ runs-on: ubuntu-latest
14
+ permissions:
15
+ packages: write
16
+ contents: read
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+
20
+ - name: Set up Docker Buildx
21
+ uses: docker/setup-buildx-action@v3
22
+
23
+ - name: Login to GHCR
24
+ uses: docker/login-action@v3
25
+ with:
26
+ registry: ghcr.io
27
+ username: ${{ github.actor }}
28
+ password: ${{ secrets.GITHUB_TOKEN }}
29
+
30
+ - name: Build and push Docker image
31
+ uses: docker/build-push-action@v6
32
+ with:
33
+ context: .
34
+ push: true
35
+ tags: |
36
+ ghcr.io/dipy/gpustreamlines:${{ github.sha }}
37
+ ghcr.io/dipy/gpustreamlines:latest
@@ -0,0 +1,33 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ build-n-publish:
9
+ name: Build and publish Python distro to PyPI
10
+ runs-on: ubuntu-latest
11
+ environment:
12
+ name: pypi
13
+ url: https://pypi.org/p/cuslines
14
+ permissions:
15
+ id-token: write
16
+
17
+ steps:
18
+ - name: Checkout code
19
+ uses: actions/checkout@v4
20
+
21
+ - name: Set up Python
22
+ uses: actions/setup-python@v5
23
+ with:
24
+ python-version: "3.x"
25
+
26
+ - name: Install build dependencies
27
+ run: python -m pip install build
28
+
29
+ - name: Build binary wheel and source tarball
30
+ run: python -m build
31
+
32
+ - name: Publish to PyPI
33
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,15 @@
1
+ # Python bytecode
2
+ **/*.pyc
3
+ **/__pycache__/
4
+ *.pyo
5
+ *.pyd
6
+
7
+ # Build artifacts
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+
12
+ # Test outputs
13
+ *.trk
14
+ *.trx
15
+ *.nii.gz
@@ -0,0 +1,18 @@
1
+ default_language_version:
2
+ python: python3
3
+
4
+ repos:
5
+ - repo: https://github.com/astral-sh/ruff-pre-commit
6
+ rev: v0.14.10
7
+ hooks:
8
+ # Run the linter
9
+ - id: ruff
10
+ args: [ --fix, --config, pyproject.toml ]
11
+ # Run the formatter
12
+ - id: ruff-format
13
+ - repo: https://github.com/codespell-project/codespell
14
+ rev: v2.3.0
15
+ hooks:
16
+ - id: codespell
17
+ additional_dependencies:
18
+ - tomli
@@ -0,0 +1,121 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ GPUStreamlines (`cuslines`) is a GPU-accelerated tractography package for diffusion MRI. It supports **three GPU backends**: NVIDIA CUDA, Apple Metal (Apple Silicon), and WebGPU (cross-platform via wgpu-py). The backend is auto-detected at import time in `cuslines/__init__.py` (priority: Metal → CUDA → WebGPU); if none is available, the package falls back to the CPU implementation in `cuslines/numba`. Kernels are compiled at runtime (NVRTC for CUDA, `MTLDevice.newLibraryWithSource` for Metal, `device.create_shader_module` for WebGPU/WGSL).
8
+
9
+ ## Build & Run
10
+
11
+ ```bash
12
+ # Install (pick your backend)
13
+ pip install ".[cu13]" # CUDA 13
14
+ pip install ".[cu12]" # CUDA 12
15
+ pip install ".[metal]" # Apple Metal (Apple Silicon)
16
+ pip install ".[webgpu]" # WebGPU (cross-platform: NVIDIA, AMD, Intel, Apple)
17
+
18
+ # From PyPI
19
+ pip install "cuslines[cu13]"
20
+ pip install "cuslines[metal]"
21
+ pip install "cuslines[webgpu]"
22
+
23
+ # GPU run (downloads HARDI dataset if no data passed)
24
+ python run_gpu_streamlines.py --output-prefix small --nseeds 1000 --ngpus 1
25
+
26
+ # Force a specific backend
27
+ python run_gpu_streamlines.py --device=webgpu --output-prefix small --nseeds 1000
28
+
29
+ # CPU reference run (for comparison/debugging)
30
+ python run_gpu_streamlines.py --device=cpu --output-prefix small --nseeds 1000
31
+
32
+ # Docker
33
+ docker build -t gpustreamlines .
34
+ ```
35
+
36
+ There is no dedicated test suite; linting and formatting run through pre-commit (ruff, codespell). Validate changes by comparing CPU and GPU outputs on the same seeds.
37
+
38
+ ## Architecture
39
+
40
+ **Two-layer design**: Python orchestration + GPU kernels compiled at runtime. Three parallel backend implementations share the same API surface.
41
+
42
+ ```
43
+ run_gpu_streamlines.py # CLI entry: DIPY model fitting → CPU or GPU tracking
44
+ cuslines/
45
+ __init__.py # Auto-detects Metal → CUDA → WebGPU backend at import
46
+ boot_utils.py # Shared bootstrap matrix preparation (OPDT/CSA) for all backends
47
+ cuda_python/ # CUDA backend
48
+ cu_tractography.py # GPUTracker: context manager, multi-GPU allocation
49
+ cu_propagate_seeds.py # SeedBatchPropagator: chunked seed processing
50
+ cu_direction_getters.py # Direction getter ABC + Boot/Prob/PTT implementations
51
+ cutils.py # REAL_DTYPE, REAL3_DTYPE, checkCudaErrors(), ModelType enum
52
+ _globals.py # Global constants useful for all languages
53
+ cuda_c/ # CUDA kernel source
54
+ globals.h # CUDA specific global constants
55
+ generate_streamlines_cuda.cu, boot.cu, ptt.cu, tracking_helpers.cu, utils.cu
56
+ cudamacro.h, cuwsort.cuh, ptt.cuh, disc.h
57
+ metal/ # Metal backend (mirrors cuda_python/)
58
+ mt_tractography.py, mt_propagate_seeds.py, mt_direction_getters.py, mutils.py
59
+ metal_shaders/ # MSL kernel source (mirrors cuda_c/)
60
+ globals.h, types.h, philox_rng.h
61
+ generate_streamlines_metal.metal, boot.metal, ptt.metal
62
+ tracking_helpers.metal, utils.metal, warp_sort.metal
63
+ webgpu/ # WebGPU backend (mirrors metal/)
64
+ wg_tractography.py, wg_propagate_seeds.py, wg_direction_getters.py, wgutils.py
65
+ benchmark.py # Cross-backend benchmark: python -m cuslines.webgpu.benchmark
66
+ wgsl_shaders/ # WGSL kernel source (mirrors metal_shaders/)
67
+ globals.wgsl, types.wgsl, philox_rng.wgsl
68
+ utils.wgsl, warp_sort.wgsl, tracking_helpers.wgsl
69
+ generate_streamlines.wgsl # Prob/PTT buffer bindings + Prob getNum/gen kernels
70
+ boot.wgsl # Boot direction getter kernels (standalone module)
71
+ disc.wgsl, ptt.wgsl # PTT support
72
+ ```
73
+
74
+ **Data flow**: DIPY preprocessing → seed generation → GPUTracker context → SeedBatchPropagator chunks seeds across GPUs → kernel launch → stream results to TRK/TRX output.
75
+
76
+ **Direction getters** (subclasses of `GPUDirectionGetter`):
77
+ - `BootDirectionGetter` — bootstrap sampling from SH coefficients (OPDT/CSA models)
78
+ - `ProbDirectionGetter` — probabilistic selection from ODF/PMF (CSD model)
79
+ - `PttDirectionGetter` — Probabilistic Tracking with Turning (CSD model)
80
+
81
+ Each has `from_dipy_*()` class methods for initialization from DIPY models.
82
+
83
+ ## Critical Conventions
84
+
85
+ - **GPU arrays must be C-contiguous** — always use `np.ascontiguousarray()` and project scalar types (`REAL_DTYPE`, `REAL_SIZE` from `cutils.py` or `mutils.py`).
86
+ - **All CUDA API calls must be wrapped** with `checkCudaErrors()`.
87
+ - **Angle units**: CLI accepts degrees, internals convert to radians before the GPU layer.
88
+ - **Multi-GPU**: CUDA uses explicit `cudaSetDevice()` calls; Metal and WebGPU are single-GPU only.
89
+ - **CPU/GPU parity**: `run_gpu_streamlines.py` maintains parallel CPU and GPU code paths — keep both in sync when changing arguments or model-selection logic.
90
+ - **Logger**: use `logging.getLogger("GPUStreamlines")`.
91
+ - **Kernel compilation**: CUDA uses `cuda.core.Program` with NVIDIA headers. Metal uses `MTLDevice.newLibraryWithSource_options_error_()` with MSL source concatenated from `metal_shaders/`. WebGPU uses `device.create_shader_module()` with WGSL source concatenated from `wgsl_shaders/`.
92
+
93
+ ## Metal Backend Notes
94
+
95
+ - **Unified memory**: Metal buffers use `storageModeShared` — numpy arrays are directly GPU-accessible (zero memcpy per batch, vs ~6 in CUDA).
96
+ - **float3 alignment**: All buffers use `packed_float3` (12 bytes) with `load_f3()`/`store_f3()` helpers. Metal `float3` is 16 bytes in registers.
97
+ - **Page alignment**: Use `aligned_array()` from `mutils.py` for arrays passed to `newBufferWithBytesNoCopy`.
98
+ - **No double precision**: Only `REAL_SIZE=4` (float32) is ported.
99
+ - **Warp primitives**: `__shfl_sync` → `simd_shuffle`, `__ballot_sync` → `simd_ballot`. SIMD width = 32.
100
+ - **SH basis**: Always use `real_sh_descoteaux(legacy=True)` for all matrices. See `boot_utils.py`.
101
+
102
+ ## WebGPU Backend Notes
103
+
104
+ - **Cross-platform**: wgpu-py maps to Metal (macOS), Vulkan (Linux/Windows), D3D12 (Windows). Install: `pip install "cuslines[webgpu]"`.
105
+ - **Explicit readbacks**: `device.queue.read_buffer()` for GPU→CPU (~3 per seed batch, matching CUDA's cudaMemcpy pattern).
106
+ - **WGSL shaders**: Concatenated in dependency order by `compile_program()`. Boot compiles standalone; Prob/PTT share `generate_streamlines.wgsl`.
107
+ - **Buffer binding**: Boot needs 17 buffers across 3 bind groups. Prob/PTT use 2 bind groups. `layout="auto"` only includes reachable bindings.
108
+ - **Subgroups required**: Device feature `"subgroup"` (singular, not `"subgroups"`). Naga does NOT support `enable subgroups;` directive.
109
+ - **WGSL constraints**: No `ptr<storage>` parameters (use module-scope accessors). `var<workgroup>` sizes must be compile-time constants. PhiloxState is pass-by-value (return result structs).
110
+ - **Boot standalone module**: `_kernel_files()` returns `[]` to avoid `params` struct redefinition.
111
+ - **Benchmark**: `python -m cuslines.webgpu.benchmark --nseeds 10000` — auto-detects all backends.
112
+
113
+ ## Key Dependencies
114
+
115
+ - `dipy` — diffusion models, CPU direction getters, seeding, stopping criteria
116
+ - `nibabel` — NIfTI/TRK file I/O (`StatefulTractogram`)
117
+ - `trx-python` — TRX format support (memory-mapped, for large outputs)
118
+ - `cuda-python` / `cuda-core` / `cuda-cccl` — CUDA Python bindings, kernel compilation, C++ headers
119
+ - `pyobjc-framework-Metal` / `pyobjc-framework-MetalPerformanceShaders` — Metal Python bindings (macOS only)
120
+ - `wgpu` — WebGPU Python bindings (wgpu-native, cross-platform)
121
+ - `numpy` — array operations throughout
@@ -0,0 +1,19 @@
1
+ ARG NVIDIAVERSION=12.0.1-devel-ubuntu20.04
2
+ FROM nvidia/cuda:${NVIDIAVERSION}
3
+
4
+ SHELL ["/bin/bash", "-c"]
5
+
6
+ ENV DEBIAN_FRONTEND=noninteractive
7
+
8
+ RUN apt-get update && apt-get install --assume-yes curl git
9
+
10
+ RUN curl -L "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" \
11
+ -o "/tmp/Miniconda3.sh"
12
+ RUN bash /tmp/Miniconda3.sh -b -p /opt/anaconda
13
+ RUN rm -rf /tmp/Miniconda3.sh
14
+ RUN cd /opt && eval "$(/opt/anaconda/bin/conda shell.bash hook)"
15
+ ENV PATH=/opt/anaconda/bin:${PATH}
16
+ ENV LD_LIBRARY_PATH=/opt/anaconda/lib:${LD_LIBRARY_PATH}
17
+
18
+ COPY . /opt/GPUStreamlines/
19
+ RUN cd /opt/GPUStreamlines && pip install .
@@ -1,29 +1,62 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cuslines
3
- Version: 2.0.0
3
+ Version: 2.2.1
4
4
  Summary: GPU-accelerated tractography package
5
+ Project-URL: Homepage, https://github.com/dipy/GPUStreamlines
5
6
  Requires-Python: >=3.7
6
7
  Description-Content-Type: text/markdown
7
8
  License-File: LICENSE
8
9
  Requires-Dist: numpy
10
+ Requires-Dist: numba
9
11
  Requires-Dist: nibabel
10
12
  Requires-Dist: tqdm
11
13
  Requires-Dist: dipy
12
14
  Requires-Dist: trx-python
13
- Requires-Dist: cuda-python
14
- Requires-Dist: cuda-core
15
- Requires-Dist: cuda-cccl
15
+ Requires-Dist: scipy
16
+ Provides-Extra: cu13
17
+ Requires-Dist: nvidia-cuda-runtime; extra == "cu13"
18
+ Requires-Dist: nvidia-curand-cu12; extra == "cu13"
19
+ Requires-Dist: cuda-python<14; extra == "cu13"
20
+ Requires-Dist: cuda-core[cu13]; extra == "cu13"
21
+ Requires-Dist: cuda-cccl[cu13]; extra == "cu13"
22
+ Provides-Extra: cu12
23
+ Requires-Dist: nvidia-cuda-runtime-cu12; extra == "cu12"
24
+ Requires-Dist: nvidia-curand-cu12; extra == "cu12"
25
+ Requires-Dist: cuda-python<13; extra == "cu12"
26
+ Requires-Dist: cuda-core[cu12]; extra == "cu12"
27
+ Requires-Dist: cuda-cccl[cu12]; extra == "cu12"
28
+ Provides-Extra: metal
29
+ Requires-Dist: pyobjc-framework-Metal; extra == "metal"
30
+ Requires-Dist: pyobjc-framework-MetalPerformanceShaders; extra == "metal"
31
+ Provides-Extra: webgpu
32
+ Requires-Dist: wgpu>=0.18; extra == "webgpu"
33
+ Provides-Extra: dev
34
+ Requires-Dist: ruff>=0.14.10; extra == "dev"
16
35
  Dynamic: license-file
17
36
 
18
37
  # GPUStreamlines
19
38
 
20
39
  ## Installation
21
- To install, simply run `pip install .` in the top-level repository directory.
40
+ To install from PyPI:
41
+ ```
42
+ pip install "cuslines[cu13]" # CUDA 13 (NVIDIA)
43
+ pip install "cuslines[cu12]" # CUDA 12 (NVIDIA)
44
+ pip install "cuslines[metal]" # Apple Metal (Apple Silicon)
45
+ pip install "cuslines[webgpu]" # WebGPU (cross-platform: NVIDIA, AMD, Intel, Apple)
46
+ ```
47
+
48
+ To install a development version from a local checkout of the repository:
49
+ ```
50
+ pip install ".[cu13]" # CUDA 13
51
+ pip install ".[cu12]" # CUDA 12
52
+ pip install ".[metal]" # Apple Metal
53
+ pip install ".[webgpu]" # WebGPU (any GPU)
54
+ ```
22
55
 
23
56
  ## Running the examples
24
57
  This repository contains several example usage scripts.
25
58
 
26
- The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If not data is passed, it will donaload and use the HARDI dataset.
59
+ The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If no data is passed, it will download and use the HARDI dataset.
27
60
 
28
61
  To run the baseline CPU example on a random set of 1000 seeds, this is the command and example output:
29
62
  ```
@@ -67,6 +100,12 @@ Note that if you experience memory errors, you can adjust the `--chunk-size` fla
67
100
 
68
101
  To run on more seeds, we suggest setting the `--write-method trx` flag in the GPU script to not get bottlenecked by writing files.
69
102
 
103
+ ## GPU vs CPU differences
104
+
105
+ GPU backends (CUDA, Metal, and WebGPU) operate in float32 while DIPY uses float64. This causes slightly different peak selection at fiber crossings where ODF peaks have similar magnitudes. In practice the GPU produces comparable streamline counts and commissural fiber density, with modestly longer fibers on average. See [cuslines/webgpu/README.md](cuslines/webgpu/README.md) for cross-platform benchmarks and [cuslines/metal/README.md](cuslines/metal/README.md) for Metal-specific details.
106
+
107
+ The WebGPU backend runs on any GPU (NVIDIA, AMD, Intel, Apple) via [wgpu-py](https://github.com/pygfx/wgpu-py). It is auto-detected when no vendor-specific backend is available. See `python -m cuslines.webgpu.benchmark` for a self-contained benchmark across all available backends.
108
+
70
109
  ## Running on AWS with Docker
71
110
  First, set up an AWS instance with GPU and ssh into it (we recommend a P3 instance with at least 1 V100 16 GB GPU and a Deep Learning AMI Ubuntu 18.04 v 33.0.). Then do the following:
72
111
  1. Log in to GitHub docker registry:
@@ -81,10 +120,5 @@ $ docker pull docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest
81
120
  4. Run the code, mounting the current directory into the container for easy result retrieval:
82
121
  ```
83
122
  $ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
84
- python run_gpu_streamlines.py --ngpus 1 --output-prefix output/hardi_gpu_full --use-fast-write
85
- ```
86
- 5. The code produces a number of independent track files (one per processed "chunk"), but we have provided a merge script to combine them into a single trk file. To merge files, run:
87
- ```
88
- $ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
89
- ./merge_trk.sh -o output/hardi_tracks.trk output/hardi_gpu_full*
123
+ python /opt/GPUStreamlines/run_gpu_streamlines.py --ngpus 1 --output-prefix /opt/exec/output/hardi_gpu_full
90
124
  ```
@@ -1,29 +1,26 @@
1
- Metadata-Version: 2.4
2
- Name: cuslines
3
- Version: 2.0.0
4
- Summary: GPU-accelerated tractography package
5
- Requires-Python: >=3.7
6
- Description-Content-Type: text/markdown
7
- License-File: LICENSE
8
- Requires-Dist: numpy
9
- Requires-Dist: nibabel
10
- Requires-Dist: tqdm
11
- Requires-Dist: dipy
12
- Requires-Dist: trx-python
13
- Requires-Dist: cuda-python
14
- Requires-Dist: cuda-core
15
- Requires-Dist: cuda-cccl
16
- Dynamic: license-file
17
-
18
1
  # GPUStreamlines
19
2
 
20
3
  ## Installation
21
- To install, simply run `pip install .` in the top-level repository directory.
4
+ To install from PyPI:
5
+ ```
6
+ pip install "cuslines[cu13]" # CUDA 13 (NVIDIA)
7
+ pip install "cuslines[cu12]" # CUDA 12 (NVIDIA)
8
+ pip install "cuslines[metal]" # Apple Metal (Apple Silicon)
9
+ pip install "cuslines[webgpu]" # WebGPU (cross-platform: NVIDIA, AMD, Intel, Apple)
10
+ ```
11
+
12
+ To install a development version from a local checkout of the repository:
13
+ ```
14
+ pip install ".[cu13]" # CUDA 13
15
+ pip install ".[cu12]" # CUDA 12
16
+ pip install ".[metal]" # Apple Metal
17
+ pip install ".[webgpu]" # WebGPU (any GPU)
18
+ ```
22
19
 
23
20
  ## Running the examples
24
21
  This repository contains several example usage scripts.
25
22
 
26
- The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If not data is passed, it will donaload and use the HARDI dataset.
23
+ The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If no data is passed, it will download and use the HARDI dataset.
27
24
 
28
25
  To run the baseline CPU example on a random set of 1000 seeds, this is the command and example output:
29
26
  ```
@@ -67,6 +64,12 @@ Note that if you experience memory errors, you can adjust the `--chunk-size` fla
67
64
 
68
65
  To run on more seeds, we suggest setting the `--write-method trx` flag in the GPU script to not get bottlenecked by writing files.
69
66
 
67
+ ## GPU vs CPU differences
68
+
69
+ GPU backends (CUDA, Metal, and WebGPU) operate in float32 while DIPY uses float64. This causes slightly different peak selection at fiber crossings where ODF peaks have similar magnitudes. In practice the GPU produces comparable streamline counts and commissural fiber density, with modestly longer fibers on average. See [cuslines/webgpu/README.md](cuslines/webgpu/README.md) for cross-platform benchmarks and [cuslines/metal/README.md](cuslines/metal/README.md) for Metal-specific details.
70
+
71
+ The WebGPU backend runs on any GPU (NVIDIA, AMD, Intel, Apple) via [wgpu-py](https://github.com/pygfx/wgpu-py). It is auto-detected when no vendor-specific backend is available. See `python -m cuslines.webgpu.benchmark` for a self-contained benchmark across all available backends.
72
+
70
73
  ## Running on AWS with Docker
71
74
  First, set up an AWS instance with GPU and ssh into it (we recommend a P3 instance with at least 1 V100 16 GB GPU and a Deep Learning AMI Ubuntu 18.04 v 33.0.). Then do the following:
72
75
  1. Log in to GitHub docker registry:
@@ -81,10 +84,5 @@ $ docker pull docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest
81
84
  4. Run the code, mounting the current directory into the container for easy result retrieval:
82
85
  ```
83
86
  $ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
84
- python run_gpu_streamlines.py --ngpus 1 --output-prefix output/hardi_gpu_full --use-fast-write
85
- ```
86
- 5. The code produces a number of independent track files (one per processed "chunk"), but we have provided a merge script to combine them into a single trk file. To merge files, run:
87
- ```
88
- $ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
89
- ./merge_trk.sh -o output/hardi_tracks.trk output/hardi_gpu_full*
87
+ python /opt/GPUStreamlines/run_gpu_streamlines.py --ngpus 1 --output-prefix /opt/exec/output/hardi_gpu_full
90
88
  ```
@@ -0,0 +1,88 @@
1
+ import platform as _platform
2
+
3
+
4
+ def _detect_backend():
5
+ """Auto-detect the best available GPU backend."""
6
+ system = _platform.system()
7
+ if system == "Darwin":
8
+ try:
9
+ import Metal
10
+
11
+ if Metal.MTLCreateSystemDefaultDevice() is not None:
12
+ return "metal"
13
+ except ImportError:
14
+ pass
15
+ try:
16
+ from cuda.bindings import runtime
17
+
18
+ count = runtime.cudaGetDeviceCount()
19
+ if count[1] > 0:
20
+ return "cuda"
21
+ except (ImportError, Exception):
22
+ pass
23
+ try:
24
+ import wgpu
25
+
26
+ adapter = wgpu.gpu.request_adapter_sync()
27
+ if adapter is not None:
28
+ return "webgpu"
29
+ except (ImportError, Exception):
30
+ pass
31
+ return None
32
+
33
+ BACKEND = _detect_backend()
34
+
35
+ if BACKEND == "metal":
36
+ from cuslines.metal import (
37
+ MetalBootDirectionGetter as BootDirectionGetter,
38
+ )
39
+ from cuslines.metal import (
40
+ MetalGPUTracker as Tracker,
41
+ )
42
+ from cuslines.metal import (
43
+ MetalProbDirectionGetter as ProbDirectionGetter,
44
+ )
45
+ from cuslines.metal import (
46
+ MetalPttDirectionGetter as PttDirectionGetter,
47
+ )
48
+ elif BACKEND == "cuda":
49
+ from cuslines.cuda_python import (
50
+ BootDirectionGetter,
51
+ GPUTracker as Tracker,
52
+ ProbDirectionGetter,
53
+ PttDirectionGetter,
54
+ )
55
+ elif BACKEND == "webgpu":
56
+ from cuslines.webgpu import (
57
+ WebGPUBootDirectionGetter as BootDirectionGetter,
58
+ )
59
+ from cuslines.webgpu import (
60
+ WebGPUProbDirectionGetter as ProbDirectionGetter,
61
+ )
62
+ from cuslines.webgpu import (
63
+ WebGPUPttDirectionGetter as PttDirectionGetter,
64
+ )
65
+ from cuslines.webgpu import (
66
+ WebGPUTracker as Tracker,
67
+ )
68
+ else:
69
+ from cuslines.numba import (
70
+ CPUBootDirectionGetter as BootDirectionGetter,
71
+ )
72
+ from cuslines.numba import (
73
+ CPUProbDirectionGetter as ProbDirectionGetter,
74
+ )
75
+ from cuslines.numba import (
76
+ CPUPttDirectionGetter as PttDirectionGetter,
77
+ )
78
+ from cuslines.numba import (
79
+ CPUTracker as Tracker,
80
+ )
81
+
82
+ __all__ = [
83
+ "Tracker",
84
+ "ProbDirectionGetter",
85
+ "PttDirectionGetter",
86
+ "BootDirectionGetter",
87
+ "BACKEND",
88
+ ]
@@ -0,0 +1,94 @@
1
+ """Shared utilities for bootstrap direction getters (CUDA and Metal).
2
+
3
+ Extracts DIPY model matrices (H, R, delta_b, delta_q, sampling_matrix)
4
+ for OPDT and CSA models. Both backends need the same matrices — only
5
+ the GPU dispatch differs.
6
+ """
7
+
8
+ from dipy.reconst import shm
9
+
10
+
11
+ def prepare_opdt(
12
+ gtab, sphere, sh_order_max=6, full_basis=False, sh_lambda=0.006, min_signal=1
13
+ ):
14
+ """Build bootstrap matrices for the OPDT model.
15
+
16
+ Returns dict with keys: model_type, min_signal, H, R, delta_b,
17
+ delta_q, sampling_matrix, b0s_mask.
18
+ """
19
+ sampling_matrix, _, _ = shm.real_sh_descoteaux(
20
+ sh_order_max,
21
+ sphere.theta,
22
+ sphere.phi,
23
+ full_basis=full_basis,
24
+ legacy=True,
25
+ )
26
+ model = shm.OpdtModel(
27
+ gtab,
28
+ sh_order_max=sh_order_max,
29
+ smooth=sh_lambda,
30
+ min_signal=min_signal,
31
+ )
32
+ delta_b, delta_q = model._fit_matrix
33
+
34
+ H, R = _hat_and_lcr(gtab, model, sh_order_max)
35
+
36
+ return dict(
37
+ model_type="OPDT",
38
+ min_signal=min_signal,
39
+ H=H,
40
+ R=R,
41
+ delta_b=delta_b,
42
+ delta_q=delta_q,
43
+ sampling_matrix=sampling_matrix,
44
+ b0s_mask=gtab.b0s_mask,
45
+ )
46
+
47
+
48
+ def prepare_csa(
49
+ gtab, sphere, sh_order_max=6, full_basis=False, sh_lambda=0.006, min_signal=1
50
+ ):
51
+ """Build bootstrap matrices for the CSA model.
52
+
53
+ Returns dict with keys: model_type, min_signal, H, R, delta_b,
54
+ delta_q, sampling_matrix, b0s_mask.
55
+ """
56
+ sampling_matrix, _, _ = shm.real_sh_descoteaux(
57
+ sh_order_max,
58
+ sphere.theta,
59
+ sphere.phi,
60
+ full_basis=full_basis,
61
+ legacy=True,
62
+ )
63
+ model = shm.CsaOdfModel(
64
+ gtab,
65
+ sh_order_max=sh_order_max,
66
+ smooth=sh_lambda,
67
+ min_signal=min_signal,
68
+ )
69
+ delta_b = model._fit_matrix
70
+ delta_q = model._fit_matrix
71
+
72
+ H, R = _hat_and_lcr(gtab, model, sh_order_max)
73
+
74
+ return dict(
75
+ model_type="CSA",
76
+ min_signal=min_signal,
77
+ H=H,
78
+ R=R,
79
+ delta_b=delta_b,
80
+ delta_q=delta_q,
81
+ sampling_matrix=sampling_matrix,
82
+ b0s_mask=gtab.b0s_mask,
83
+ )
84
+
85
+
86
+ def _hat_and_lcr(gtab, model, sh_order_max):
87
+ """Compute hat matrix H and leveraged centered residuals matrix R."""
88
+ dwi_mask = ~gtab.b0s_mask
89
+ x, y, z = model.gtab.gradients[dwi_mask].T
90
+ _, theta, phi = shm.cart2sphere(x, y, z)
91
+ B, _, _ = shm.real_sh_descoteaux(sh_order_max, theta, phi, legacy=True)
92
+ H = shm.hat(B)
93
+ R = shm.lcr_matrix(H)
94
+ return H, R