cuslines 2.0.0__tar.gz → 2.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cuslines-2.2.1/.github/workflows/dockerbuild.yml +37 -0
- cuslines-2.2.1/.github/workflows/publish_pypi.yml +33 -0
- cuslines-2.2.1/.gitignore +15 -0
- cuslines-2.2.1/.pre-commit-config.yaml +18 -0
- cuslines-2.2.1/CLAUDE.md +121 -0
- cuslines-2.2.1/Dockerfile +19 -0
- {cuslines-2.0.0/cuslines.egg-info → cuslines-2.2.1}/PKG-INFO +46 -12
- cuslines-2.0.0/PKG-INFO → cuslines-2.2.1/README.md +23 -25
- cuslines-2.2.1/cuslines/__init__.py +88 -0
- cuslines-2.2.1/cuslines/boot_utils.py +94 -0
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/boot.cu +55 -141
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/generate_streamlines_cuda.cu +55 -128
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/globals.h +5 -17
- cuslines-2.2.1/cuslines/cuda_c/ptt.cu +474 -0
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/ptt.cuh +4 -4
- cuslines-2.2.1/cuslines/cuda_c/ptt_init.cu +63 -0
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/tracking_helpers.cu +48 -93
- cuslines-2.2.1/cuslines/cuda_python/_globals.py +8 -0
- cuslines-2.2.1/cuslines/cuda_python/cu_direction_getters.py +648 -0
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_python/cu_propagate_seeds.py +28 -25
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_python/cu_tractography.py +135 -150
- cuslines-2.2.1/cuslines/cuda_python/cutils.py +128 -0
- cuslines-2.2.1/cuslines/generic_tracker.py +119 -0
- cuslines-2.2.1/cuslines/metal/README.md +127 -0
- cuslines-2.2.1/cuslines/metal/__init__.py +13 -0
- cuslines-2.2.1/cuslines/metal/mt_direction_getters.py +465 -0
- cuslines-2.2.1/cuslines/metal/mt_propagate_seeds.py +201 -0
- cuslines-2.2.1/cuslines/metal/mt_tractography.py +170 -0
- cuslines-2.2.1/cuslines/metal/mutils.py +142 -0
- cuslines-2.2.1/cuslines/metal_shaders/boot.metal +869 -0
- cuslines-2.2.1/cuslines/metal_shaders/disc.h +1890 -0
- cuslines-2.2.1/cuslines/metal_shaders/generate_streamlines_metal.metal +400 -0
- cuslines-2.2.1/cuslines/metal_shaders/globals.h +67 -0
- cuslines-2.2.1/cuslines/metal_shaders/philox_rng.h +152 -0
- cuslines-2.2.1/cuslines/metal_shaders/ptt.metal +1061 -0
- cuslines-2.2.1/cuslines/metal_shaders/tracking_helpers.metal +221 -0
- cuslines-2.2.1/cuslines/metal_shaders/types.h +50 -0
- cuslines-2.2.1/cuslines/metal_shaders/utils.metal +107 -0
- cuslines-2.2.1/cuslines/metal_shaders/warp_sort.metal +109 -0
- cuslines-2.2.1/cuslines/numba/__init__.py +13 -0
- cuslines-2.2.1/cuslines/numba/nu_globals.py +19 -0
- cuslines-2.2.1/cuslines/numba/nu_tractography.py +241 -0
- cuslines-2.2.1/cuslines/numba_njit/generate_streamlines_numba.py +284 -0
- cuslines-2.2.1/cuslines/numba_njit/num_streamlines_numba.py +170 -0
- cuslines-2.2.1/cuslines/numba_njit/tracking_helpers.py +62 -0
- cuslines-2.2.1/cuslines/webgpu/README.md +147 -0
- cuslines-2.2.1/cuslines/webgpu/__init__.py +19 -0
- cuslines-2.2.1/cuslines/webgpu/benchmark.py +486 -0
- cuslines-2.2.1/cuslines/webgpu/wg_direction_getters.py +478 -0
- cuslines-2.2.1/cuslines/webgpu/wg_propagate_seeds.py +205 -0
- cuslines-2.2.1/cuslines/webgpu/wg_tractography.py +203 -0
- cuslines-2.2.1/cuslines/webgpu/wgutils.py +127 -0
- cuslines-2.2.1/cuslines/wgsl_shaders/boot.wgsl +843 -0
- cuslines-2.2.1/cuslines/wgsl_shaders/disc.wgsl +74 -0
- cuslines-2.2.1/cuslines/wgsl_shaders/generate_streamlines.wgsl +418 -0
- cuslines-2.2.1/cuslines/wgsl_shaders/globals.wgsl +38 -0
- cuslines-2.2.1/cuslines/wgsl_shaders/philox_rng.wgsl +189 -0
- cuslines-2.2.1/cuslines/wgsl_shaders/ptt.wgsl +1153 -0
- cuslines-2.2.1/cuslines/wgsl_shaders/tracking_helpers.wgsl +261 -0
- cuslines-2.2.1/cuslines/wgsl_shaders/types.wgsl +19 -0
- cuslines-2.2.1/cuslines/wgsl_shaders/utils.wgsl +78 -0
- cuslines-2.2.1/cuslines/wgsl_shaders/warp_sort.wgsl +75 -0
- cuslines-2.2.1/cuslines.egg-info/PKG-INFO +124 -0
- cuslines-2.2.1/cuslines.egg-info/SOURCES.txt +74 -0
- cuslines-2.2.1/cuslines.egg-info/requires.txt +31 -0
- cuslines-2.2.1/pyproject.toml +88 -0
- cuslines-2.2.1/run_gpu_streamlines.py +462 -0
- cuslines-2.0.0/README.md +0 -73
- cuslines-2.0.0/cuslines/__init__.py +0 -13
- cuslines-2.0.0/cuslines/cuda_c/ptt.cu +0 -559
- cuslines-2.0.0/cuslines/cuda_python/_globals.py +0 -10
- cuslines-2.0.0/cuslines/cuda_python/cu_direction_getters.py +0 -472
- cuslines-2.0.0/cuslines/cuda_python/cutils.py +0 -64
- cuslines-2.0.0/cuslines.egg-info/SOURCES.txt +0 -26
- cuslines-2.0.0/cuslines.egg-info/requires.txt +0 -8
- cuslines-2.0.0/pyproject.toml +0 -24
- cuslines-2.0.0/setup.py +0 -50
- {cuslines-2.0.0 → cuslines-2.2.1}/LICENSE +0 -0
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/cudamacro.h +0 -0
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/cuwsort.cuh +0 -0
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/disc.h +0 -0
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_c/utils.cu +0 -0
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines/cuda_python/__init__.py +2 -2
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines.egg-info/dependency_links.txt +0 -0
- {cuslines-2.0.0 → cuslines-2.2.1}/cuslines.egg-info/top_level.txt +0 -0
- {cuslines-2.0.0 → cuslines-2.2.1}/setup.cfg +0 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
name: Build and Push Docker Image

on:
  push:
    branches:
      - 'master'
  pull_request:
    branches:
      - 'master'

jobs:
  build:
    runs-on: ubuntu-latest
    permissions:
      packages: write
      contents: read
    steps:
      - uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Pull requests only verify that the image builds. Registry login is
      # skipped for them because nothing is pushed from a PR run.
      - name: Login to GHCR
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v6
        with:
          context: .
          # Publish only for pushes to master; a pull_request run must not
          # overwrite the shared `latest` tag with unreviewed code.
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
            ghcr.io/dipy/gpustreamlines:${{ github.sha }}
            ghcr.io/dipy/gpustreamlines:latest
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build-n-publish:
|
|
9
|
+
name: Build and publish Python distro to PyPI
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
environment:
|
|
12
|
+
name: pypi
|
|
13
|
+
url: https://pypi.org/p/cuslines
|
|
14
|
+
permissions:
|
|
15
|
+
id-token: write
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- name: Checkout code
|
|
19
|
+
uses: actions/checkout@v4
|
|
20
|
+
|
|
21
|
+
- name: Set up Python
|
|
22
|
+
uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: "3.x"
|
|
25
|
+
|
|
26
|
+
- name: Install build dependencies
|
|
27
|
+
run: python -m pip install build
|
|
28
|
+
|
|
29
|
+
- name: Build binary wheel and source tarball
|
|
30
|
+
run: python -m build
|
|
31
|
+
|
|
32
|
+
- name: Publish to PyPI
|
|
33
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
default_language_version:
|
|
2
|
+
python: python3
|
|
3
|
+
|
|
4
|
+
repos:
|
|
5
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
6
|
+
rev: v0.14.10
|
|
7
|
+
hooks:
|
|
8
|
+
# Run the linter
|
|
9
|
+
- id: ruff
|
|
10
|
+
args: [ --fix, --config, pyproject.toml ]
|
|
11
|
+
# Run the formatter
|
|
12
|
+
- id: ruff-format
|
|
13
|
+
- repo: https://github.com/codespell-project/codespell
|
|
14
|
+
rev: v2.3.0
|
|
15
|
+
hooks:
|
|
16
|
+
- id: codespell
|
|
17
|
+
additional_dependencies:
|
|
18
|
+
- tomli
|
cuslines-2.2.1/CLAUDE.md
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
GPUStreamlines (`cuslines`) is a GPU-accelerated tractography package for diffusion MRI. It supports **three GPU backends**: NVIDIA CUDA, Apple Metal (Apple Silicon), and WebGPU (cross-platform via wgpu-py). The backend is auto-detected at import time in `cuslines/__init__.py` (priority: Metal → CUDA → WebGPU; if none of them is usable, the Numba CPU implementation in `cuslines/numba` is used instead). GPU kernels are compiled at runtime (NVRTC for CUDA, `MTLDevice.newLibraryWithSource` for Metal, `device.create_shader_module` for WebGPU/WGSL).
|
|
8
|
+
|
|
9
|
+
## Build & Run
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Install (pick your backend)
|
|
13
|
+
pip install ".[cu13]" # CUDA 13
|
|
14
|
+
pip install ".[cu12]" # CUDA 12
|
|
15
|
+
pip install ".[metal]" # Apple Metal (Apple Silicon)
|
|
16
|
+
pip install ".[webgpu]" # WebGPU (cross-platform: NVIDIA, AMD, Intel, Apple)
|
|
17
|
+
|
|
18
|
+
# From PyPI
|
|
19
|
+
pip install "cuslines[cu13]"
|
|
20
|
+
pip install "cuslines[metal]"
|
|
21
|
+
pip install "cuslines[webgpu]"
|
|
22
|
+
|
|
23
|
+
# GPU run (downloads HARDI dataset if no data passed)
|
|
24
|
+
python run_gpu_streamlines.py --output-prefix small --nseeds 1000 --ngpus 1
|
|
25
|
+
|
|
26
|
+
# Force a specific backend
|
|
27
|
+
python run_gpu_streamlines.py --device=webgpu --output-prefix small --nseeds 1000
|
|
28
|
+
|
|
29
|
+
# CPU reference run (for comparison/debugging)
|
|
30
|
+
python run_gpu_streamlines.py --device=cpu --output-prefix small --nseeds 1000
|
|
31
|
+
|
|
32
|
+
# Docker
|
|
33
|
+
docker build -t gpustreamlines .
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
There is no dedicated test suite; linting runs through pre-commit (ruff and codespell, see `.pre-commit-config.yaml`). Validate changes by comparing CPU vs GPU outputs on the same seeds.
|
|
37
|
+
|
|
38
|
+
## Architecture
|
|
39
|
+
|
|
40
|
+
**Two-layer design**: Python orchestration + GPU kernels compiled at runtime. Three parallel backend implementations share the same API surface.
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
run_gpu_streamlines.py # CLI entry: DIPY model fitting → CPU or GPU tracking
|
|
44
|
+
cuslines/
|
|
45
|
+
__init__.py # Auto-detects Metal → CUDA → WebGPU backend at import
|
|
46
|
+
boot_utils.py # Shared bootstrap matrix preparation (OPDT/CSA) for all backends
|
|
47
|
+
cuda_python/ # CUDA backend
|
|
48
|
+
cu_tractography.py # GPUTracker: context manager, multi-GPU allocation
|
|
49
|
+
cu_propagate_seeds.py # SeedBatchPropagator: chunked seed processing
|
|
50
|
+
cu_direction_getters.py # Direction getter ABC + Boot/Prob/PTT implementations
|
|
51
|
+
cutils.py # REAL_DTYPE, REAL3_DTYPE, checkCudaErrors(), ModelType enum
|
|
52
|
+
_globals.py # Global constants useful for all languages
|
|
53
|
+
cuda_c/ # CUDA kernel source
|
|
54
|
+
globals.h # CUDA specific global constants
|
|
55
|
+
generate_streamlines_cuda.cu, boot.cu, ptt.cu, tracking_helpers.cu, utils.cu
|
|
56
|
+
cudamacro.h, cuwsort.cuh, ptt.cuh, disc.h
|
|
57
|
+
metal/ # Metal backend (mirrors cuda_python/)
|
|
58
|
+
mt_tractography.py, mt_propagate_seeds.py, mt_direction_getters.py, mutils.py
|
|
59
|
+
metal_shaders/ # MSL kernel source (mirrors cuda_c/)
|
|
60
|
+
globals.h, types.h, philox_rng.h
|
|
61
|
+
generate_streamlines_metal.metal, boot.metal, ptt.metal
|
|
62
|
+
tracking_helpers.metal, utils.metal, warp_sort.metal
|
|
63
|
+
webgpu/ # WebGPU backend (mirrors metal/)
|
|
64
|
+
wg_tractography.py, wg_propagate_seeds.py, wg_direction_getters.py, wgutils.py
|
|
65
|
+
benchmark.py # Cross-backend benchmark: python -m cuslines.webgpu.benchmark
|
|
66
|
+
wgsl_shaders/ # WGSL kernel source (mirrors metal_shaders/)
|
|
67
|
+
globals.wgsl, types.wgsl, philox_rng.wgsl
|
|
68
|
+
utils.wgsl, warp_sort.wgsl, tracking_helpers.wgsl
|
|
69
|
+
generate_streamlines.wgsl # Prob/PTT buffer bindings + Prob getNum/gen kernels
|
|
70
|
+
boot.wgsl # Boot direction getter kernels (standalone module)
|
|
71
|
+
disc.wgsl, ptt.wgsl # PTT support
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**Data flow**: DIPY preprocessing → seed generation → GPUTracker context → SeedBatchPropagator chunks seeds across GPUs → kernel launch → stream results to TRK/TRX output.
|
|
75
|
+
|
|
76
|
+
**Direction getters** (subclasses of `GPUDirectionGetter`):
|
|
77
|
+
- `BootDirectionGetter` — bootstrap sampling from SH coefficients (OPDT/CSA models)
|
|
78
|
+
- `ProbDirectionGetter` — probabilistic selection from ODF/PMF (CSD model)
|
|
79
|
+
- `PttDirectionGetter` — Probabilistic Tracking with Turning (CSD model)
|
|
80
|
+
|
|
81
|
+
Each has `from_dipy_*()` class methods for initialization from DIPY models.
|
|
82
|
+
|
|
83
|
+
## Critical Conventions
|
|
84
|
+
|
|
85
|
+
- **GPU arrays must be C-contiguous** — always use `np.ascontiguousarray()` and project scalar types (`REAL_DTYPE`, `REAL_SIZE` from `cutils.py` or `mutils.py`).
|
|
86
|
+
- **All CUDA API calls must be wrapped** with `checkCudaErrors()`.
|
|
87
|
+
- **Angle units**: CLI accepts degrees, internals convert to radians before the GPU layer.
|
|
88
|
+
- **Multi-GPU**: CUDA uses explicit `cudaSetDevice()` calls; Metal and WebGPU are single-GPU only.
|
|
89
|
+
- **CPU/GPU parity**: `run_gpu_streamlines.py` maintains parallel CPU and GPU code paths — keep both in sync when changing arguments or model-selection logic.
|
|
90
|
+
- **Logger**: use `logging.getLogger("GPUStreamlines")`.
|
|
91
|
+
- **Kernel compilation**: CUDA uses `cuda.core.Program` with NVIDIA headers. Metal uses `MTLDevice.newLibraryWithSource_options_error_()` with MSL source concatenated from `metal_shaders/`. WebGPU uses `device.create_shader_module()` with WGSL source concatenated from `wgsl_shaders/`.
|
|
92
|
+
|
|
93
|
+
## Metal Backend Notes
|
|
94
|
+
|
|
95
|
+
- **Unified memory**: Metal buffers use `storageModeShared` — numpy arrays are directly GPU-accessible (zero memcpy per batch, vs ~6 in CUDA).
|
|
96
|
+
- **float3 alignment**: All buffers use `packed_float3` (12 bytes) with `load_f3()`/`store_f3()` helpers. Metal `float3` is 16 bytes in registers.
|
|
97
|
+
- **Page alignment**: Use `aligned_array()` from `mutils.py` for arrays passed to `newBufferWithBytesNoCopy`.
|
|
98
|
+
- **No double precision**: Only `REAL_SIZE=4` (float32) is ported.
|
|
99
|
+
- **Warp primitives**: `__shfl_sync` → `simd_shuffle`, `__ballot_sync` → `simd_ballot`. SIMD width = 32.
|
|
100
|
+
- **SH basis**: Always use `real_sh_descoteaux(legacy=True)` for all matrices. See `boot_utils.py`.
|
|
101
|
+
|
|
102
|
+
## WebGPU Backend Notes
|
|
103
|
+
|
|
104
|
+
- **Cross-platform**: wgpu-py maps to Metal (macOS), Vulkan (Linux/Windows), D3D12 (Windows). Install: `pip install "cuslines[webgpu]"`.
|
|
105
|
+
- **Explicit readbacks**: `device.queue.read_buffer()` for GPU→CPU (~3 per seed batch, matching CUDA's cudaMemcpy pattern).
|
|
106
|
+
- **WGSL shaders**: Concatenated in dependency order by `compile_program()`. Boot compiles standalone; Prob/PTT share `generate_streamlines.wgsl`.
|
|
107
|
+
- **Buffer binding**: Boot needs 17 buffers across 3 bind groups. Prob/PTT use 2 bind groups. `layout="auto"` only includes reachable bindings.
|
|
108
|
+
- **Subgroups required**: Device feature `"subgroup"` (singular, not `"subgroups"`). Naga does NOT support `enable subgroups;` directive.
|
|
109
|
+
- **WGSL constraints**: No `ptr<storage>` parameters (use module-scope accessors). `var<workgroup>` sizes must be compile-time constants. PhiloxState is pass-by-value (return result structs).
|
|
110
|
+
- **Boot standalone module**: `_kernel_files()` returns `[]` to avoid `params` struct redefinition.
|
|
111
|
+
- **Benchmark**: `python -m cuslines.webgpu.benchmark --nseeds 10000` — auto-detects all backends.
|
|
112
|
+
|
|
113
|
+
## Key Dependencies
|
|
114
|
+
|
|
115
|
+
- `dipy` — diffusion models, CPU direction getters, seeding, stopping criteria
|
|
116
|
+
- `nibabel` — NIfTI/TRK file I/O (`StatefulTractogram`)
|
|
117
|
+
- `trx-python` — TRX format support (memory-mapped, for large outputs)
|
|
118
|
+
- `cuda-python` / `cuda-core` / `cuda-cccl` — CUDA Python bindings, kernel compilation, C++ headers
|
|
119
|
+
- `pyobjc-framework-Metal` / `pyobjc-framework-MetalPerformanceShaders` — Metal Python bindings (macOS only)
|
|
120
|
+
- `wgpu` — WebGPU Python bindings (wgpu-native, cross-platform)
|
|
121
|
+
- `numpy` — array operations throughout
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Tag of the nvidia/cuda base image; override with
#   docker build --build-arg NVIDIAVERSION=<tag> .
ARG NVIDIAVERSION=12.0.1-devel-ubuntu20.04
FROM nvidia/cuda:${NVIDIAVERSION}

# Optional-dependency group of cuslines to install (see pyproject extras).
# The default matches the CUDA 12 base image above; pass
#   --build-arg CUSLINES_EXTRA=cu13
# together with a CUDA 13 NVIDIAVERSION.
ARG CUSLINES_EXTRA=cu12

SHELL ["/bin/bash", "-c"]

ENV DEBIAN_FRONTEND=noninteractive

# Remove the apt package lists in the same layer that created them so they
# do not end up in the image.
RUN apt-get update \
    && apt-get install --assume-yes curl git \
    && rm -rf /var/lib/apt/lists/*

# Download, run, and delete the Miniconda installer in ONE layer: deleting it
# in a later RUN cannot shrink the image because the file is already stored
# in the earlier layer. `curl -f` makes an HTTP error fail the build instead
# of saving an error page as the installer.
RUN curl -fL "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" \
        -o "/tmp/Miniconda3.sh" \
    && bash /tmp/Miniconda3.sh -b -p /opt/anaconda \
    && rm -f /tmp/Miniconda3.sh

# Conda is made available through PATH rather than `conda shell.bash hook`:
# every RUN starts a fresh shell, so an eval'd hook would not persist.
ENV PATH=/opt/anaconda/bin:${PATH}
ENV LD_LIBRARY_PATH=/opt/anaconda/lib:${LD_LIBRARY_PATH}

COPY . /opt/GPUStreamlines/

# The CUDA Python bindings are optional extras of cuslines, so a plain
# `pip install .` would leave the image without a CUDA backend.
RUN cd /opt/GPUStreamlines && pip install ".[${CUSLINES_EXTRA}]"
|
|
@@ -1,29 +1,62 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cuslines
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.2.1
|
|
4
4
|
Summary: GPU-accelerated tractography package
|
|
5
|
+
Project-URL: Homepage, https://github.com/dipy/GPUStreamlines
|
|
5
6
|
Requires-Python: >=3.7
|
|
6
7
|
Description-Content-Type: text/markdown
|
|
7
8
|
License-File: LICENSE
|
|
8
9
|
Requires-Dist: numpy
|
|
10
|
+
Requires-Dist: numba
|
|
9
11
|
Requires-Dist: nibabel
|
|
10
12
|
Requires-Dist: tqdm
|
|
11
13
|
Requires-Dist: dipy
|
|
12
14
|
Requires-Dist: trx-python
|
|
13
|
-
Requires-Dist:
|
|
14
|
-
|
|
15
|
-
Requires-Dist: cuda-
|
|
15
|
+
Requires-Dist: scipy
|
|
16
|
+
Provides-Extra: cu13
|
|
17
|
+
Requires-Dist: nvidia-cuda-runtime; extra == "cu13"
|
|
18
|
+
Requires-Dist: nvidia-curand-cu12; extra == "cu13"
|
|
19
|
+
Requires-Dist: cuda-python<14; extra == "cu13"
|
|
20
|
+
Requires-Dist: cuda-core[cu13]; extra == "cu13"
|
|
21
|
+
Requires-Dist: cuda-cccl[cu13]; extra == "cu13"
|
|
22
|
+
Provides-Extra: cu12
|
|
23
|
+
Requires-Dist: nvidia-cuda-runtime-cu12; extra == "cu12"
|
|
24
|
+
Requires-Dist: nvidia-curand-cu12; extra == "cu12"
|
|
25
|
+
Requires-Dist: cuda-python<13; extra == "cu12"
|
|
26
|
+
Requires-Dist: cuda-core[cu12]; extra == "cu12"
|
|
27
|
+
Requires-Dist: cuda-cccl[cu12]; extra == "cu12"
|
|
28
|
+
Provides-Extra: metal
|
|
29
|
+
Requires-Dist: pyobjc-framework-Metal; extra == "metal"
|
|
30
|
+
Requires-Dist: pyobjc-framework-MetalPerformanceShaders; extra == "metal"
|
|
31
|
+
Provides-Extra: webgpu
|
|
32
|
+
Requires-Dist: wgpu>=0.18; extra == "webgpu"
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: ruff>=0.14.10; extra == "dev"
|
|
16
35
|
Dynamic: license-file
|
|
17
36
|
|
|
18
37
|
# GPUStreamlines
|
|
19
38
|
|
|
20
39
|
## Installation
|
|
21
|
-
To install
|
|
40
|
+
To install from PyPI:
|
|
41
|
+
```
|
|
42
|
+
pip install "cuslines[cu13]" # CUDA 13 (NVIDIA)
|
|
43
|
+
pip install "cuslines[cu12]" # CUDA 12 (NVIDIA)
|
|
44
|
+
pip install "cuslines[metal]" # Apple Metal (Apple Silicon)
|
|
45
|
+
pip install "cuslines[webgpu]" # WebGPU (cross-platform: NVIDIA, AMD, Intel, Apple)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
To install a development version from a local checkout of the repository:
|
|
49
|
+
```
|
|
50
|
+
pip install ".[cu13]" # CUDA 13
|
|
51
|
+
pip install ".[cu12]" # CUDA 12
|
|
52
|
+
pip install ".[metal]" # Apple Metal
|
|
53
|
+
pip install ".[webgpu]" # WebGPU (any GPU)
|
|
54
|
+
```
|
|
22
55
|
|
|
23
56
|
## Running the examples
|
|
24
57
|
This repository contains several example usage scripts.
|
|
25
58
|
|
|
26
|
-
The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If
|
|
59
|
+
The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If no data is passed, it will download and use the HARDI dataset.
|
|
27
60
|
|
|
28
61
|
To run the baseline CPU example on a random set of 1000 seeds, this is the command and example output:
|
|
29
62
|
```
|
|
@@ -67,6 +100,12 @@ Note that if you experience memory errors, you can adjust the `--chunk-size` fla
|
|
|
67
100
|
|
|
68
101
|
To run on more seeds, we suggest setting the `--write-method trx` flag in the GPU script to not get bottlenecked by writing files.
|
|
69
102
|
|
|
103
|
+
## GPU vs CPU differences
|
|
104
|
+
|
|
105
|
+
GPU backends (CUDA, Metal, and WebGPU) operate in float32 while DIPY uses float64. This causes slightly different peak selection at fiber crossings where ODF peaks have similar magnitudes. In practice the GPU produces comparable streamline counts and commissural fiber density, with modestly longer fibers on average. See [cuslines/webgpu/README.md](cuslines/webgpu/README.md) for cross-platform benchmarks and [cuslines/metal/README.md](cuslines/metal/README.md) for Metal-specific details.
|
|
106
|
+
|
|
107
|
+
The WebGPU backend runs on any GPU (NVIDIA, AMD, Intel, Apple) via [wgpu-py](https://github.com/pygfx/wgpu-py). It is auto-detected when no vendor-specific backend is available. See `python -m cuslines.webgpu.benchmark` for a self-contained benchmark across all available backends.
|
|
108
|
+
|
|
70
109
|
## Running on AWS with Docker
|
|
71
110
|
First, set up an AWS instance with GPU and ssh into it (we recommend a P3 instance with at least 1 V100 16 GB GPU and a Deep Learning AMI Ubuntu 18.04 v 33.0.). Then do the following:
|
|
72
111
|
1. Log in to GitHub docker registry:
|
|
@@ -81,10 +120,5 @@ $ docker pull docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest
|
|
|
81
120
|
4. Run the code, mounting the current directory into the container for easy result retrieval:
|
|
82
121
|
```
|
|
83
122
|
$ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
|
|
84
|
-
python run_gpu_streamlines.py --ngpus 1 --output-prefix output/hardi_gpu_full
|
|
85
|
-
```
|
|
86
|
-
5. The code produces a number of independent track files (one per processed "chunk"), but we have provided a merge script to combine them into a single trk file. To merge files, run:
|
|
87
|
-
```
|
|
88
|
-
$ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
|
|
89
|
-
./merge_trk.sh -o output/hardi_tracks.trk output/hardi_gpu_full*
|
|
123
|
+
python /opt/GPUStreamlines/run_gpu_streamlines.py --ngpus 1 --output-prefix /opt/exec/output/hardi_gpu_full
|
|
90
124
|
```
|
|
@@ -1,29 +1,26 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: cuslines
|
|
3
|
-
Version: 2.0.0
|
|
4
|
-
Summary: GPU-accelerated tractography package
|
|
5
|
-
Requires-Python: >=3.7
|
|
6
|
-
Description-Content-Type: text/markdown
|
|
7
|
-
License-File: LICENSE
|
|
8
|
-
Requires-Dist: numpy
|
|
9
|
-
Requires-Dist: nibabel
|
|
10
|
-
Requires-Dist: tqdm
|
|
11
|
-
Requires-Dist: dipy
|
|
12
|
-
Requires-Dist: trx-python
|
|
13
|
-
Requires-Dist: cuda-python
|
|
14
|
-
Requires-Dist: cuda-core
|
|
15
|
-
Requires-Dist: cuda-cccl
|
|
16
|
-
Dynamic: license-file
|
|
17
|
-
|
|
18
1
|
# GPUStreamlines
|
|
19
2
|
|
|
20
3
|
## Installation
|
|
21
|
-
To install
|
|
4
|
+
To install from PyPI:
|
|
5
|
+
```
|
|
6
|
+
pip install "cuslines[cu13]" # CUDA 13 (NVIDIA)
|
|
7
|
+
pip install "cuslines[cu12]" # CUDA 12 (NVIDIA)
|
|
8
|
+
pip install "cuslines[metal]" # Apple Metal (Apple Silicon)
|
|
9
|
+
pip install "cuslines[webgpu]" # WebGPU (cross-platform: NVIDIA, AMD, Intel, Apple)
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
To install a development version from a local checkout of the repository:
|
|
13
|
+
```
|
|
14
|
+
pip install ".[cu13]" # CUDA 13
|
|
15
|
+
pip install ".[cu12]" # CUDA 12
|
|
16
|
+
pip install ".[metal]" # Apple Metal
|
|
17
|
+
pip install ".[webgpu]" # WebGPU (any GPU)
|
|
18
|
+
```
|
|
22
19
|
|
|
23
20
|
## Running the examples
|
|
24
21
|
This repository contains several example usage scripts.
|
|
25
22
|
|
|
26
|
-
The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If
|
|
23
|
+
The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If no data is passed, it will download and use the HARDI dataset.
|
|
27
24
|
|
|
28
25
|
To run the baseline CPU example on a random set of 1000 seeds, this is the command and example output:
|
|
29
26
|
```
|
|
@@ -67,6 +64,12 @@ Note that if you experience memory errors, you can adjust the `--chunk-size` fla
|
|
|
67
64
|
|
|
68
65
|
To run on more seeds, we suggest setting the `--write-method trx` flag in the GPU script to not get bottlenecked by writing files.
|
|
69
66
|
|
|
67
|
+
## GPU vs CPU differences
|
|
68
|
+
|
|
69
|
+
GPU backends (CUDA, Metal, and WebGPU) operate in float32 while DIPY uses float64. This causes slightly different peak selection at fiber crossings where ODF peaks have similar magnitudes. In practice the GPU produces comparable streamline counts and commissural fiber density, with modestly longer fibers on average. See [cuslines/webgpu/README.md](cuslines/webgpu/README.md) for cross-platform benchmarks and [cuslines/metal/README.md](cuslines/metal/README.md) for Metal-specific details.
|
|
70
|
+
|
|
71
|
+
The WebGPU backend runs on any GPU (NVIDIA, AMD, Intel, Apple) via [wgpu-py](https://github.com/pygfx/wgpu-py). It is auto-detected when no vendor-specific backend is available. See `python -m cuslines.webgpu.benchmark` for a self-contained benchmark across all available backends.
|
|
72
|
+
|
|
70
73
|
## Running on AWS with Docker
|
|
71
74
|
First, set up an AWS instance with GPU and ssh into it (we recommend a P3 instance with at least 1 V100 16 GB GPU and a Deep Learning AMI Ubuntu 18.04 v 33.0.). Then do the following:
|
|
72
75
|
1. Log in to GitHub docker registry:
|
|
@@ -81,10 +84,5 @@ $ docker pull docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest
|
|
|
81
84
|
4. Run the code, mounting the current directory into the container for easy result retrieval:
|
|
82
85
|
```
|
|
83
86
|
$ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
|
|
84
|
-
python run_gpu_streamlines.py --ngpus 1 --output-prefix output/hardi_gpu_full
|
|
85
|
-
```
|
|
86
|
-
5. The code produces a number of independent track files (one per processed "chunk"), but we have provided a merge script to combine them into a single trk file. To merge files, run:
|
|
87
|
-
```
|
|
88
|
-
$ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
|
|
89
|
-
./merge_trk.sh -o output/hardi_tracks.trk output/hardi_gpu_full*
|
|
87
|
+
python /opt/GPUStreamlines/run_gpu_streamlines.py --ngpus 1 --output-prefix /opt/exec/output/hardi_gpu_full
|
|
90
88
|
```
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import platform as _platform
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _detect_backend():
    """Auto-detect the best available GPU backend.

    The probes run in priority order and the first one that reports a
    usable device wins:

    1. ``"metal"``  -- attempted only when ``platform.system()`` is
       ``"Darwin"``; requires ``MTLCreateSystemDefaultDevice()`` to return
       a device.
    2. ``"cuda"``   -- requires ``cudaGetDeviceCount()`` to succeed and
       report at least one device.
    3. ``"webgpu"`` -- requires ``wgpu`` to hand back a non-``None`` adapter.

    Detection is best effort: a missing package or a failing probe never
    raises, it only moves on to the next candidate. The reason is logged at
    DEBUG level on the ``"GPUStreamlines"`` logger so that an unexpected
    fallback can be diagnosed.

    Returns
    -------
    str or None
        ``"metal"``, ``"cuda"`` or ``"webgpu"``; ``None`` when no probe
        succeeded.
    """
    import logging

    logger = logging.getLogger("GPUStreamlines")

    if _platform.system() == "Darwin":
        try:
            import Metal

            if Metal.MTLCreateSystemDefaultDevice() is not None:
                return "metal"
        except Exception as exc:
            # Same policy as the probes below: any failure while probing
            # (not just a missing package) must not abort the import.
            logger.debug("Metal backend unavailable: %s", exc)

    try:
        from cuda.bindings import runtime

        # The call returns a (status, count) pair; the count is only
        # trusted when the status reports success.
        status, device_count = runtime.cudaGetDeviceCount()
        if status == runtime.cudaError_t.cudaSuccess and device_count > 0:
            return "cuda"
    except Exception as exc:
        logger.debug("CUDA backend unavailable: %s", exc)

    try:
        import wgpu

        if wgpu.gpu.request_adapter_sync() is not None:
            return "webgpu"
    except Exception as exc:
        logger.debug("WebGPU backend unavailable: %s", exc)

    return None
|
|
32
|
+
|
|
33
|
+
# Resolved once, at import time. Every public name below is bound to the
# implementation that belongs to this backend.
BACKEND = _detect_backend()

if BACKEND == "metal":
    from cuslines.metal import (
        MetalBootDirectionGetter as BootDirectionGetter,
        MetalGPUTracker as Tracker,
        MetalProbDirectionGetter as ProbDirectionGetter,
        MetalPttDirectionGetter as PttDirectionGetter,
    )
elif BACKEND == "cuda":
    from cuslines.cuda_python import (
        BootDirectionGetter,
        GPUTracker as Tracker,
        ProbDirectionGetter,
        PttDirectionGetter,
    )
elif BACKEND == "webgpu":
    from cuslines.webgpu import (
        WebGPUBootDirectionGetter as BootDirectionGetter,
        WebGPUProbDirectionGetter as ProbDirectionGetter,
        WebGPUPttDirectionGetter as PttDirectionGetter,
        WebGPUTracker as Tracker,
    )
else:
    # No GPU backend was detected (BACKEND is None): use the Numba CPU
    # implementations under the same public names.
    from cuslines.numba import (
        CPUBootDirectionGetter as BootDirectionGetter,
        CPUProbDirectionGetter as ProbDirectionGetter,
        CPUPttDirectionGetter as PttDirectionGetter,
        CPUTracker as Tracker,
    )

# Backend-neutral public API of the package.
__all__ = [
    "Tracker",
    "ProbDirectionGetter",
    "PttDirectionGetter",
    "BootDirectionGetter",
    "BACKEND",
]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Backend-independent utilities for bootstrap direction getters.
|
|
2
|
+
|
|
3
|
+
Extracts DIPY model matrices (H, R, delta_b, delta_q, sampling_matrix)
|
|
4
|
+
for OPDT and CSA models. All backends need the same matrices — only
|
|
5
|
+
the GPU dispatch differs.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from dipy.reconst import shm
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def prepare_opdt(
    gtab, sphere, sh_order_max=6, full_basis=False, sh_lambda=0.006, min_signal=1
):
    """Assemble the bootstrap matrices for the OPDT model.

    Parameters
    ----------
    gtab
        Gradient table handed to ``shm.OpdtModel``; its ``b0s_mask`` is also
        returned unchanged in the result.
    sphere
        Object whose ``theta`` and ``phi`` attributes define where the SH
        basis is sampled.
    sh_order_max
        Maximum SH order, forwarded to the sampling matrix, the model and
        the hat/residual computation.
    full_basis
        Forwarded to ``shm.real_sh_descoteaux`` for the sampling matrix.
        NOTE(review): it is not forwarded to the model or to
        ``_hat_and_lcr`` -- confirm ``full_basis=True`` is supported.
    sh_lambda
        Passed to the model as ``smooth``.
    min_signal
        Passed to the model and echoed back in the result.

    Returns
    -------
    dict
        Keys ``model_type`` (``"OPDT"``), ``min_signal``, ``H``, ``R``,
        ``delta_b``, ``delta_q``, ``sampling_matrix`` and ``b0s_mask``.
    """
    # Only the first element of the returned triple (the basis matrix) is
    # needed. The legacy basis definition is requested explicitly.
    sh_basis = shm.real_sh_descoteaux(
        sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=True
    )
    sampling_matrix = sh_basis[0]

    opdt_model = shm.OpdtModel(
        gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal
    )
    # For OPDT the model's fit matrix is a pair.
    delta_b, delta_q = opdt_model._fit_matrix

    hat_matrix, residual_matrix = _hat_and_lcr(gtab, opdt_model, sh_order_max)

    return {
        "model_type": "OPDT",
        "min_signal": min_signal,
        "H": hat_matrix,
        "R": residual_matrix,
        "delta_b": delta_b,
        "delta_q": delta_q,
        "sampling_matrix": sampling_matrix,
        "b0s_mask": gtab.b0s_mask,
    }
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def prepare_csa(
    gtab, sphere, sh_order_max=6, full_basis=False, sh_lambda=0.006, min_signal=1
):
    """Assemble the bootstrap matrices for the CSA model.

    Parameters
    ----------
    gtab
        Gradient table handed to ``shm.CsaOdfModel``; its ``b0s_mask`` is
        also returned unchanged in the result.
    sphere
        Object whose ``theta`` and ``phi`` attributes define where the SH
        basis is sampled.
    sh_order_max
        Maximum SH order, forwarded to the sampling matrix, the model and
        the hat/residual computation.
    full_basis
        Forwarded to ``shm.real_sh_descoteaux`` for the sampling matrix.
        NOTE(review): it is not forwarded to the model or to
        ``_hat_and_lcr`` -- confirm ``full_basis=True`` is supported.
    sh_lambda
        Passed to the model as ``smooth``.
    min_signal
        Passed to the model and echoed back in the result.

    Returns
    -------
    dict
        Keys ``model_type`` (``"CSA"``), ``min_signal``, ``H``, ``R``,
        ``delta_b``, ``delta_q``, ``sampling_matrix`` and ``b0s_mask``.
    """
    # Only the first element of the returned triple (the basis matrix) is
    # needed. The legacy basis definition is requested explicitly.
    sh_basis = shm.real_sh_descoteaux(
        sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=True
    )
    sampling_matrix = sh_basis[0]

    csa_model = shm.CsaOdfModel(
        gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal
    )
    # Unlike OPDT, the same fit matrix is used for both delta_b and delta_q.
    # The attribute is read twice, as separate lookups.
    delta_b = csa_model._fit_matrix
    delta_q = csa_model._fit_matrix

    hat_matrix, residual_matrix = _hat_and_lcr(gtab, csa_model, sh_order_max)

    return {
        "model_type": "CSA",
        "min_signal": min_signal,
        "H": hat_matrix,
        "R": residual_matrix,
        "delta_b": delta_b,
        "delta_q": delta_q,
        "sampling_matrix": sampling_matrix,
        "b0s_mask": gtab.b0s_mask,
    }
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _hat_and_lcr(gtab, model, sh_order_max):
    """Return the hat matrix ``H`` and the matrix ``R`` from ``shm.lcr_matrix(H)``.

    Both are built from the SH basis (legacy definition) evaluated at the
    gradient directions of the non-b0 volumes.

    Parameters
    ----------
    gtab
        Gradient table; only ``b0s_mask`` is read, to select non-b0 volumes.
    model
        Fitted-model object; the gradient vectors come from
        ``model.gtab.gradients``.
        NOTE(review): the mask comes from ``gtab`` but the gradients from
        ``model.gtab`` -- presumably the same table; verify against callers.
    sh_order_max
        Maximum SH order of the basis.

    Returns
    -------
    tuple
        ``(H, R)``.
    """
    not_b0 = ~gtab.b0s_mask
    dwi_gradients = model.gtab.gradients[not_b0]
    gx, gy, gz = dwi_gradients.T

    # Only the angular coordinates are used; the radius is discarded.
    _, theta, phi = shm.cart2sphere(gx, gy, gz)

    basis, _, _ = shm.real_sh_descoteaux(sh_order_max, theta, phi, legacy=True)

    hat_matrix = shm.hat(basis)
    return hat_matrix, shm.lcr_matrix(hat_matrix)
|