binary-thinning-3d-cuda 1.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- binary_thinning_3d_cuda-1.1.8/LICENSE +21 -0
- binary_thinning_3d_cuda-1.1.8/PKG-INFO +109 -0
- binary_thinning_3d_cuda-1.1.8/README.md +75 -0
- binary_thinning_3d_cuda-1.1.8/binary_thinning_3d/__init__.py +41 -0
- binary_thinning_3d_cuda-1.1.8/binary_thinning_3d_cuda.egg-info/PKG-INFO +109 -0
- binary_thinning_3d_cuda-1.1.8/binary_thinning_3d_cuda.egg-info/SOURCES.txt +13 -0
- binary_thinning_3d_cuda-1.1.8/binary_thinning_3d_cuda.egg-info/dependency_links.txt +1 -0
- binary_thinning_3d_cuda-1.1.8/binary_thinning_3d_cuda.egg-info/requires.txt +6 -0
- binary_thinning_3d_cuda-1.1.8/binary_thinning_3d_cuda.egg-info/top_level.txt +1 -0
- binary_thinning_3d_cuda-1.1.8/csrc/thinning.cpp +8 -0
- binary_thinning_3d_cuda-1.1.8/csrc/thinning_kernel.cu +511 -0
- binary_thinning_3d_cuda-1.1.8/pyproject.toml +3 -0
- binary_thinning_3d_cuda-1.1.8/setup.cfg +4 -0
- binary_thinning_3d_cuda-1.1.8/setup.py +57 -0
- binary_thinning_3d_cuda-1.1.8/tests/test_thinning.py +29 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shiyang Chen
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: binary_thinning_3d_cuda
|
|
3
|
+
Version: 1.1.8
|
|
4
|
+
Summary: A fast 3D binary thinning implementation using CUDA and PyTorch.
|
|
5
|
+
Home-page: https://github.com/sychen52/binary_thinning_3d_cuda
|
|
6
|
+
Author: Shiyang Chen
|
|
7
|
+
Author-email: sychen52@gmail.com
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/sychen52/binary_thinning_3d_cuda/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Image Processing
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: torch
|
|
18
|
+
Requires-Dist: numpy
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: SimpleITK; extra == "dev"
|
|
21
|
+
Requires-Dist: itk-thickness3d; extra == "dev"
|
|
22
|
+
Dynamic: author
|
|
23
|
+
Dynamic: author-email
|
|
24
|
+
Dynamic: classifier
|
|
25
|
+
Dynamic: description
|
|
26
|
+
Dynamic: description-content-type
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
Dynamic: project-url
|
|
30
|
+
Dynamic: provides-extra
|
|
31
|
+
Dynamic: requires-dist
|
|
32
|
+
Dynamic: requires-python
|
|
33
|
+
Dynamic: summary
|
|
34
|
+
|
|
35
|
+
# Binary Thinning 3D CUDA
|
|
36
|
+
|
|
37
|
+
This package provides a blazing fast, memory-efficient GPU implementation of 3D Binary Thinning (skeletonization) using CUDA and PyTorch.
|
|
38
|
+
|
|
39
|
+
It is based on the [3D thinning algorithm by Lee and Kashyap (1994)](https://doi.org/10.1006/cvgi.1994.1039), which uses Euler characteristic invariance and 26-connectivity checks to safely erode a 3D binary volume down to a 1-pixel wide skeleton without altering its fundamental topology.
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
This implementation provides two topologically safe operating modes to suit your needs:
|
|
44
|
+
|
|
45
|
+
1. **Mode 0: GPU Subgrid 8-Color Parallel (`mode=0`, Default)**
|
|
46
|
+
* **Speed:** Extremely Fast (~200x speedup over CPU)
|
|
47
|
+
* **Behavior:** Operates entirely on the GPU. It avoids race conditions by partitioning the image into an 8-color 3D checkerboard. It re-checks and deletes pixels of the same color in parallel because they are mathematically guaranteed not to touch each other.
|
|
48
|
+
* **Topology:** **Topologically Safe**. Produces a mathematically valid skeleton. *Note: Because the deletion order differs slightly from a strict CPU raster-scan, the exact pixel placement may differ very slightly from ITK (e.g. 0.003% difference), but the overall global topology is preserved perfectly.*
|
|
49
|
+
2. **Mode 1: Hybrid CPU-GPU Sequential (`mode=1`)**
|
|
50
|
+
* **Speed:** Fast (~80x speedup over CPU)
|
|
51
|
+
* **Behavior:** Calculates Euler invariance on the GPU in parallel, but performs the final 26-connectivity re-checks strictly sequentially on the CPU (using zero-overhead memory compaction).
|
|
52
|
+
* **Topology:** **100% Identical to ITK**. Guaranteed to produce the exact same pixel output as standard sequential CPU implementations like `itk.BinaryThinningImageFilter3D`.
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
### Dependencies
|
|
57
|
+
* Python 3.8+
|
|
58
|
+
* PyTorch (with CUDA support)
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
git clone https://github.com/sychen52/binary_thinning_3d_cuda.git
|
|
62
|
+
cd binary_thinning_3d_cuda
|
|
63
|
+
|
|
64
|
+
# Standard install
|
|
65
|
+
pip install --no-build-isolation -e .
|
|
66
|
+
|
|
67
|
+
# Install with development dependencies (for running benchmarks)
|
|
68
|
+
pip install --no-build-isolation -e ".[dev]"
|
|
69
|
+
```
|
|
70
|
+
*(Note: `itk-thickness3d` and `SimpleITK` are **not** hard dependencies. They are only included in the `[dev]` extras for the purpose of benchmarking and validating against the CPU implementation).*
|
|
71
|
+
|
|
72
|
+
## Usage
|
|
73
|
+
|
|
74
|
+
The input must be a 3D contiguous PyTorch `uint8` (Byte) tensor located on a CUDA device. All non-zero values are treated as foreground (`0` for background, `>0` for foreground).
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import torch
|
|
78
|
+
from binary_thinning_3d import binary_thinning
|
|
79
|
+
|
|
80
|
+
# Create or load a 3D binary mask on the GPU
|
|
81
|
+
tensor = torch.zeros((100, 100, 100), dtype=torch.uint8, device='cuda')
|
|
82
|
+
tensor[25:75, 25:75, 25:75] = 1 # Solid block
|
|
83
|
+
|
|
84
|
+
# 1. GPU Subgrid (Default, Max Speed, Topologically Safe)
|
|
85
|
+
# Modifies the tensor in-place
|
|
86
|
+
binary_thinning(tensor, mode=0)
|
|
87
|
+
|
|
88
|
+
# 2. Hybrid CPU-GPU (Exact ITK Match)
|
|
89
|
+
binary_thinning(tensor, mode=1)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Benchmark
|
|
93
|
+
|
|
94
|
+
The following benchmark was run on a `(767, 512, 512)` NIfTI volume (CT Airways Label) containing `451,530` foreground voxels.
|
|
95
|
+
|
|
96
|
+
The benchmark compares this CUDA implementation against `itk.BinaryThinningImageFilter3D` (which is run sequentially on the CPU).
|
|
97
|
+
|
|
98
|
+
| Method | Output Voxel Count | Time (Seconds) | Speedup vs ITK | Matches ITK CPU? |
|
|
99
|
+
| :--- | :--- | :--- | :--- | :--- |
|
|
100
|
+
| **Mode 0 (GPU Subgrid)** | 4,286 | **0.72 s** | **194x** | Topologically equivalent |
|
|
101
|
+
| **Mode 1 (Hybrid CPU)** | 4,281 | 1.82 s | 77x | **Yes (100% Identical)** |
|
|
102
|
+
| **ITK (CPU Baseline)** | 4,281 | 140.27 s | 1x | Baseline |
|
|
103
|
+
|
|
104
|
+
To reproduce these benchmarks yourself:
|
|
105
|
+
```bash
|
|
106
|
+
# Ensure you installed with dev dependencies: pip install -e ".[dev]"
|
|
107
|
+
python examples/process_nifti.py
|
|
108
|
+
```
|
|
109
|
+
*(The script will cache the slow ITK result to disk on the first run, so subsequent runs finish instantly).*
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Binary Thinning 3D CUDA
|
|
2
|
+
|
|
3
|
+
This package provides a blazing fast, memory-efficient GPU implementation of 3D Binary Thinning (skeletonization) using CUDA and PyTorch.
|
|
4
|
+
|
|
5
|
+
It is based on the [3D thinning algorithm by Lee and Kashyap (1994)](https://doi.org/10.1006/cvgi.1994.1039), which uses Euler characteristic invariance and 26-connectivity checks to safely erode a 3D binary volume down to a 1-pixel wide skeleton without altering its fundamental topology.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
This implementation provides two topologically safe operating modes to suit your needs:
|
|
10
|
+
|
|
11
|
+
1. **Mode 0: GPU Subgrid 8-Color Parallel (`mode=0`, Default)**
|
|
12
|
+
* **Speed:** Extremely Fast (~200x speedup over CPU)
|
|
13
|
+
* **Behavior:** Operates entirely on the GPU. It avoids race conditions by partitioning the image into an 8-color 3D checkerboard. It re-checks and deletes pixels of the same color in parallel because they are mathematically guaranteed not to touch each other.
|
|
14
|
+
* **Topology:** **Topologically Safe**. Produces a mathematically valid skeleton. *Note: Because the deletion order differs slightly from a strict CPU raster-scan, the exact pixel placement may differ very slightly from ITK (e.g. 0.003% difference), but the overall global topology is preserved perfectly.*
|
|
15
|
+
2. **Mode 1: Hybrid CPU-GPU Sequential (`mode=1`)**
|
|
16
|
+
* **Speed:** Fast (~80x speedup over CPU)
|
|
17
|
+
* **Behavior:** Calculates Euler invariance on the GPU in parallel, but performs the final 26-connectivity re-checks strictly sequentially on the CPU (using zero-overhead memory compaction).
|
|
18
|
+
* **Topology:** **100% Identical to ITK**. Guaranteed to produce the exact same pixel output as standard sequential CPU implementations like `itk.BinaryThinningImageFilter3D`.
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
### Dependencies
|
|
23
|
+
* Python 3.8+
|
|
24
|
+
* PyTorch (with CUDA support)
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
git clone https://github.com/sychen52/binary_thinning_3d_cuda.git
|
|
28
|
+
cd binary_thinning_3d_cuda
|
|
29
|
+
|
|
30
|
+
# Standard install
|
|
31
|
+
pip install --no-build-isolation -e .
|
|
32
|
+
|
|
33
|
+
# Install with development dependencies (for running benchmarks)
|
|
34
|
+
pip install --no-build-isolation -e ".[dev]"
|
|
35
|
+
```
|
|
36
|
+
*(Note: `itk-thickness3d` and `SimpleITK` are **not** hard dependencies. They are only included in the `[dev]` extras for the purpose of benchmarking and validating against the CPU implementation).*
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
The input must be a 3D contiguous PyTorch `uint8` (Byte) tensor located on a CUDA device. All non-zero values are treated as foreground (`0` for background, `>0` for foreground).
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import torch
|
|
44
|
+
from binary_thinning_3d import binary_thinning
|
|
45
|
+
|
|
46
|
+
# Create or load a 3D binary mask on the GPU
|
|
47
|
+
tensor = torch.zeros((100, 100, 100), dtype=torch.uint8, device='cuda')
|
|
48
|
+
tensor[25:75, 25:75, 25:75] = 1 # Solid block
|
|
49
|
+
|
|
50
|
+
# 1. GPU Subgrid (Default, Max Speed, Topologically Safe)
|
|
51
|
+
# Modifies the tensor in-place
|
|
52
|
+
binary_thinning(tensor, mode=0)
|
|
53
|
+
|
|
54
|
+
# 2. Hybrid CPU-GPU (Exact ITK Match)
|
|
55
|
+
binary_thinning(tensor, mode=1)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Benchmark
|
|
59
|
+
|
|
60
|
+
The following benchmark was run on a `(767, 512, 512)` NIfTI volume (CT Airways Label) containing `451,530` foreground voxels.
|
|
61
|
+
|
|
62
|
+
The benchmark compares this CUDA implementation against `itk.BinaryThinningImageFilter3D` (which is run sequentially on the CPU).
|
|
63
|
+
|
|
64
|
+
| Method | Output Voxel Count | Time (Seconds) | Speedup vs ITK | Matches ITK CPU? |
|
|
65
|
+
| :--- | :--- | :--- | :--- | :--- |
|
|
66
|
+
| **Mode 0 (GPU Subgrid)** | 4,286 | **0.72 s** | **194x** | Topologically equivalent |
|
|
67
|
+
| **Mode 1 (Hybrid CPU)** | 4,281 | 1.82 s | 77x | **Yes (100% Identical)** |
|
|
68
|
+
| **ITK (CPU Baseline)** | 4,281 | 140.27 s | 1x | Baseline |
|
|
69
|
+
|
|
70
|
+
To reproduce these benchmarks yourself:
|
|
71
|
+
```bash
|
|
72
|
+
# Ensure you installed with dev dependencies: pip install -e ".[dev]"
|
|
73
|
+
python examples/process_nifti.py
|
|
74
|
+
```
|
|
75
|
+
*(The script will cache the slow ITK result to disk on the first run, so subsequent runs finish instantly).*
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
from . import cuda_thinning_ext
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def binary_thinning(tensor: torch.Tensor, mode: int = 0) -> torch.Tensor:
    """Skeletonize a 3D binary volume on the GPU (Lee & Kashyap 3D thinning).

    If ``tensor`` is already a contiguous ``uint8`` CUDA tensor, the operation
    is performed in-place and ``tensor`` itself is returned. A non-contiguous
    or non-uint8 CUDA tensor is processed on a binarized working copy whose
    result is copied back into ``tensor``. A CPU tensor is copied to CUDA for
    processing and a NEW thinned CPU tensor is returned (the input is not
    modified).

    Args:
        tensor (torch.Tensor): A 3D tensor. All non-zero values are treated
            as foreground.
        mode (int):
            0 = GPU Subgrid (Fastest, preserves topology, fully GPU)
            1 = CPU Sequential Re-check (Matches ITK exactly, slower)

    Returns:
        torch.Tensor: The thinned binary (0/1) tensor.

    Raises:
        ValueError: If ``tensor`` is not 3D or ``mode`` is not 0 or 1.
    """
    if tensor.dim() != 3:
        raise ValueError("Tensor must be 3D.")
    if mode not in (0, 1):
        raise ValueError("Mode must be 0 (GPU Subgrid) or 1 (CPU Sequential).")

    # The CUDA kernel requires a contiguous uint8 tensor containing strictly
    # 0/1 values: it reuses the value 2 internally as a "candidate for
    # deletion" marker, so any non-binary input (e.g. a label map containing
    # the value 2) must be binarized first or it would be misinterpreted.
    if not tensor.is_cuda:
        # BUG FIX: the CPU path previously only converted device/dtype and
        # never binarized, so input values of 2 collided with the kernel's
        # internal marker. Binarize while copying to the GPU.
        work_tensor = (tensor != 0).to(device="cuda", dtype=torch.uint8).contiguous()
    elif tensor.dtype != torch.uint8 or not tensor.is_contiguous():
        work_tensor = (tensor != 0).to(torch.uint8).contiguous()
    else:
        work_tensor = tensor
        tensor[tensor != 0] = 1  # Ensure binary before the kernel runs in-place

    cuda_thinning_ext.binary_thinning(work_tensor, mode)

    if not tensor.is_cuda:
        # CPU input: return the result as a new CPU tensor.
        return work_tensor.cpu()

    if work_tensor is not tensor:
        # CUDA input that needed a working copy: propagate the result back.
        tensor.copy_(work_tensor)

    return tensor
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: binary_thinning_3d_cuda
|
|
3
|
+
Version: 1.1.8
|
|
4
|
+
Summary: A fast 3D binary thinning implementation using CUDA and PyTorch.
|
|
5
|
+
Home-page: https://github.com/sychen52/binary_thinning_3d_cuda
|
|
6
|
+
Author: Shiyang Chen
|
|
7
|
+
Author-email: sychen52@gmail.com
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/sychen52/binary_thinning_3d_cuda/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Image Processing
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: torch
|
|
18
|
+
Requires-Dist: numpy
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: SimpleITK; extra == "dev"
|
|
21
|
+
Requires-Dist: itk-thickness3d; extra == "dev"
|
|
22
|
+
Dynamic: author
|
|
23
|
+
Dynamic: author-email
|
|
24
|
+
Dynamic: classifier
|
|
25
|
+
Dynamic: description
|
|
26
|
+
Dynamic: description-content-type
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
Dynamic: project-url
|
|
30
|
+
Dynamic: provides-extra
|
|
31
|
+
Dynamic: requires-dist
|
|
32
|
+
Dynamic: requires-python
|
|
33
|
+
Dynamic: summary
|
|
34
|
+
|
|
35
|
+
# Binary Thinning 3D CUDA
|
|
36
|
+
|
|
37
|
+
This package provides a blazing fast, memory-efficient GPU implementation of 3D Binary Thinning (skeletonization) using CUDA and PyTorch.
|
|
38
|
+
|
|
39
|
+
It is based on the [3D thinning algorithm by Lee and Kashyap (1994)](https://doi.org/10.1006/cvgi.1994.1039), which uses Euler characteristic invariance and 26-connectivity checks to safely erode a 3D binary volume down to a 1-pixel wide skeleton without altering its fundamental topology.
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
This implementation provides two topologically safe operating modes to suit your needs:
|
|
44
|
+
|
|
45
|
+
1. **Mode 0: GPU Subgrid 8-Color Parallel (`mode=0`, Default)**
|
|
46
|
+
* **Speed:** Extremely Fast (~200x speedup over CPU)
|
|
47
|
+
* **Behavior:** Operates entirely on the GPU. It avoids race conditions by partitioning the image into an 8-color 3D checkerboard. It re-checks and deletes pixels of the same color in parallel because they are mathematically guaranteed not to touch each other.
|
|
48
|
+
* **Topology:** **Topologically Safe**. Produces a mathematically valid skeleton. *Note: Because the deletion order differs slightly from a strict CPU raster-scan, the exact pixel placement may differ very slightly from ITK (e.g. 0.003% difference), but the overall global topology is preserved perfectly.*
|
|
49
|
+
2. **Mode 1: Hybrid CPU-GPU Sequential (`mode=1`)**
|
|
50
|
+
* **Speed:** Fast (~80x speedup over CPU)
|
|
51
|
+
* **Behavior:** Calculates Euler invariance on the GPU in parallel, but performs the final 26-connectivity re-checks strictly sequentially on the CPU (using zero-overhead memory compaction).
|
|
52
|
+
* **Topology:** **100% Identical to ITK**. Guaranteed to produce the exact same pixel output as standard sequential CPU implementations like `itk.BinaryThinningImageFilter3D`.
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
### Dependencies
|
|
57
|
+
* Python 3.8+
|
|
58
|
+
* PyTorch (with CUDA support)
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
git clone https://github.com/sychen52/binary_thinning_3d_cuda.git
|
|
62
|
+
cd binary_thinning_3d_cuda
|
|
63
|
+
|
|
64
|
+
# Standard install
|
|
65
|
+
pip install --no-build-isolation -e .
|
|
66
|
+
|
|
67
|
+
# Install with development dependencies (for running benchmarks)
|
|
68
|
+
pip install --no-build-isolation -e ".[dev]"
|
|
69
|
+
```
|
|
70
|
+
*(Note: `itk-thickness3d` and `SimpleITK` are **not** hard dependencies. They are only included in the `[dev]` extras for the purpose of benchmarking and validating against the CPU implementation).*
|
|
71
|
+
|
|
72
|
+
## Usage
|
|
73
|
+
|
|
74
|
+
The input must be a 3D contiguous PyTorch `uint8` (Byte) tensor located on a CUDA device. All non-zero values are treated as foreground (`0` for background, `>0` for foreground).
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import torch
|
|
78
|
+
from binary_thinning_3d import binary_thinning
|
|
79
|
+
|
|
80
|
+
# Create or load a 3D binary mask on the GPU
|
|
81
|
+
tensor = torch.zeros((100, 100, 100), dtype=torch.uint8, device='cuda')
|
|
82
|
+
tensor[25:75, 25:75, 25:75] = 1 # Solid block
|
|
83
|
+
|
|
84
|
+
# 1. GPU Subgrid (Default, Max Speed, Topologically Safe)
|
|
85
|
+
# Modifies the tensor in-place
|
|
86
|
+
binary_thinning(tensor, mode=0)
|
|
87
|
+
|
|
88
|
+
# 2. Hybrid CPU-GPU (Exact ITK Match)
|
|
89
|
+
binary_thinning(tensor, mode=1)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Benchmark
|
|
93
|
+
|
|
94
|
+
The following benchmark was run on a `(767, 512, 512)` NIfTI volume (CT Airways Label) containing `451,530` foreground voxels.
|
|
95
|
+
|
|
96
|
+
The benchmark compares this CUDA implementation against `itk.BinaryThinningImageFilter3D` (which is run sequentially on the CPU).
|
|
97
|
+
|
|
98
|
+
| Method | Output Voxel Count | Time (Seconds) | Speedup vs ITK | Matches ITK CPU? |
|
|
99
|
+
| :--- | :--- | :--- | :--- | :--- |
|
|
100
|
+
| **Mode 0 (GPU Subgrid)** | 4,286 | **0.72 s** | **194x** | Topologically equivalent |
|
|
101
|
+
| **Mode 1 (Hybrid CPU)** | 4,281 | 1.82 s | 77x | **Yes (100% Identical)** |
|
|
102
|
+
| **ITK (CPU Baseline)** | 4,281 | 140.27 s | 1x | Baseline |
|
|
103
|
+
|
|
104
|
+
To reproduce these benchmarks yourself:
|
|
105
|
+
```bash
|
|
106
|
+
# Ensure you installed with dev dependencies: pip install -e ".[dev]"
|
|
107
|
+
python examples/process_nifti.py
|
|
108
|
+
```
|
|
109
|
+
*(The script will cache the slow ITK result to disk on the first run, so subsequent runs finish instantly).*
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
binary_thinning_3d/__init__.py
|
|
6
|
+
binary_thinning_3d_cuda.egg-info/PKG-INFO
|
|
7
|
+
binary_thinning_3d_cuda.egg-info/SOURCES.txt
|
|
8
|
+
binary_thinning_3d_cuda.egg-info/dependency_links.txt
|
|
9
|
+
binary_thinning_3d_cuda.egg-info/requires.txt
|
|
10
|
+
binary_thinning_3d_cuda.egg-info/top_level.txt
|
|
11
|
+
csrc/thinning.cpp
|
|
12
|
+
csrc/thinning_kernel.cu
|
|
13
|
+
tests/test_thinning.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
binary_thinning_3d
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
#include <torch/extension.h>
|
|
2
|
+
|
|
3
|
+
// Forward declaration of the CUDA implementation (defined in
// csrc/thinning_kernel.cu). Runs 3D binary thinning in-place on `image`.
void binary_thinning_cuda(torch::Tensor image, int mode);

// Python bindings: exposes `binary_thinning(image, mode=0)` where mode 0 is
// the GPU subgrid variant and mode 1 the hybrid CPU-sequential variant.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("binary_thinning", &binary_thinning_cuda, "3D Binary Thinning (CUDA)",
        pybind11::arg("image"), pybind11::arg("mode") = 0);
}
|
|
@@ -0,0 +1,511 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 3D Binary Thinning CUDA Kernel
|
|
3
|
+
*
|
|
4
|
+
* Voxel value assumptions:
|
|
5
|
+
* - 0: Background
|
|
6
|
+
* - 1: Foreground (object)
|
|
7
|
+
* - 2: Internal marker for "candidate for deletion" during a thinning
|
|
8
|
+
* iteration.
|
|
9
|
+
*
|
|
10
|
+
* The algorithm iteratively identifies border points that are "simple" (can be
|
|
11
|
+
* removed without changing the topology) and marks them with 2. It then
|
|
12
|
+
* resolves these candidates either sequentially (Mode 1) or in parallel
|
|
13
|
+
 * subgrids (Mode 0) to ensure topological correctness.
|
|
14
|
+
*/
|
|
15
|
+
#include <cuda.h>
|
|
16
|
+
#include <cuda_runtime.h>
|
|
17
|
+
#include <thrust/copy.h>
|
|
18
|
+
#include <thrust/device_ptr.h>
|
|
19
|
+
#include <thrust/execution_policy.h>
|
|
20
|
+
#include <thrust/iterator/counting_iterator.h>
|
|
21
|
+
#include <torch/extension.h>
|
|
22
|
+
#include <vector>
|
|
23
|
+
|
|
24
|
+
// Euler characteristic increment look-up table, indexed by an 8-bit octant
// occupancy mask (bit 0 is always set for the centre voxel, so only odd
// indices are ever read; the even entries are 0 padding). Summing the table
// value over all 8 octants of the 26-neighbourhood gives the change in Euler
// characteristic if the centre voxel were deleted; a total of 0 means the
// deletion is Euler-invariant. Presumably the table from Lee, Kashyap & Chu
// (1994), which the README cites -- TODO confirm against the published table.
__constant__ int d_eulerLUT[256] = {
    0, 1, 0, -1, 0, -1, 0, 1, 0, -3, 0, -1, 0, -1, 0, 1, 0, -1, 0, 1, 0, 1,
    0, -1, 0, 3, 0, 1, 0, 1, 0, -1, 0, -3, 0, -1, 0, 3, 0, 1, 0, 1, 0, -1,
    0, 3, 0, 1, 0, -1, 0, 1, 0, 1, 0, -1, 0, 3, 0, 1, 0, 1, 0, -1, 0, -3,
    0, 3, 0, -1, 0, 1, 0, 1, 0, 3, 0, -1, 0, 1, 0, -1, 0, 1, 0, 1, 0, -1,
    0, 3, 0, 1, 0, 1, 0, -1, 0, 1, 0, 3, 0, 3, 0, 1, 0, 5, 0, 3, 0, 3,
    0, 1, 0, -1, 0, 1, 0, 1, 0, -1, 0, 3, 0, 1, 0, 1, 0, -1, 0, -7, 0, -1,
    0, -1, 0, 1, 0, -3, 0, -1, 0, -1, 0, 1, 0, -1, 0, 1, 0, 1, 0, -1, 0, 3,
    0, 1, 0, 1, 0, -1, 0, -3, 0, -1, 0, 3, 0, 1, 0, 1, 0, -1, 0, 3, 0, 1,
    0, -1, 0, 1, 0, 1, 0, -1, 0, 3, 0, 1, 0, 1, 0, -1, 0, -3, 0, 3, 0, -1,
    0, 1, 0, 1, 0, 3, 0, -1, 0, 1, 0, -1, 0, 1, 0, 1, 0, -1, 0, 3, 0, 1,
    0, 1, 0, -1, 0, 1, 0, 3, 0, 3, 0, 1, 0, 5, 0, 3, 0, 3, 0, 1, 0, -1,
    0, 1, 0, 1, 0, -1, 0, 3, 0, 1, 0, 1, 0, -1};
|
|
37
|
+
|
|
38
|
+
// Returns true when deleting the centre voxel leaves the local Euler
// characteristic unchanged. The 26-neighbourhood (flat 3x3x3 indexing,
// centre = 13) is split into its 8 octants; each octant contributes
// d_eulerLUT[mask], where `mask` encodes which of the octant's 7 outer
// voxels are foreground. A zero total means Euler invariance.
__device__ bool is_euler_invariant(const int *neighbors) {
  // Per-octant neighbour indices in bit order 128, 64, 32, 16, 8, 4, 2
  // (bit 1 is reserved for the centre voxel and always set below).
  // Octant order: SWU, SEU, NWU, NEU, SWB, SEB, NWB, NEB.
  const int octant_idx[8][7] = {
      {24, 25, 15, 16, 21, 22, 12},  // SWU
      {26, 23, 17, 14, 25, 22, 16},  // SEU
      {18, 21, 9, 12, 19, 22, 10},   // NWU
      {20, 23, 19, 22, 11, 14, 10},  // NEU
      {6, 15, 7, 16, 3, 12, 4},      // SWB
      {8, 7, 17, 16, 5, 4, 14},      // SEB
      {0, 9, 3, 12, 1, 10, 4},       // NWB
      {2, 1, 11, 10, 5, 4, 14},      // NEB
  };

  int eulerChar = 0;
  for (int oct = 0; oct < 8; ++oct) {
    unsigned char mask = 1;  // centre voxel bit, always set
    for (int bit = 0; bit < 7; ++bit) {
      if (neighbors[octant_idx[oct][bit]] == 1)
        mask |= (unsigned char)(128 >> bit);
    }
    eulerChar += d_eulerLUT[mask];
  }
  return eulerChar == 0;
}
|
|
188
|
+
|
|
189
|
+
// Union-find "find" with path halving: walks parent links up to the root,
// shortcutting each visited node to its grandparent along the way so that
// later lookups stay cheap.
__host__ __device__ int uf_find(int i, int *parent) {
  int node = i;
  while (parent[node] != node) {
    parent[node] = parent[parent[node]];  // path halving
    node = parent[node];
  }
  return node;
}
|
|
196
|
+
|
|
197
|
+
// Union-find "union": merges the sets containing i and j by attaching the
// root of i's tree to the root of j's tree. No union-by-rank; path halving
// in uf_find keeps the trees shallow enough for the tiny 27-element universe.
__host__ __device__ void uf_union(int i, int j, int *parent) {
  const int root_a = uf_find(i, parent);
  const int root_b = uf_find(j, parent);
  if (root_a == root_b)
    return;
  parent[root_a] = root_b;
}
|
|
204
|
+
|
|
205
|
+
// A point is "simple" when its foreground 26-neighbours form at most one
// 26-connected component, so removing it cannot split the local object.
// `neighbors` is the flat 3x3x3 neighbourhood (index 13 = centre, excluded).
__host__ __device__ bool is_simple_point(const int *neighbors) {
  int parent[27];
  for (int k = 0; k < 27; ++k)
    parent[k] = k;

  // Union every pair of foreground neighbours that are 26-adjacent
  // (Chebyshev distance <= 1 inside the 3x3x3 cube).
  for (int a = 0; a < 27; ++a) {
    if (a == 13 || neighbors[a] != 1)
      continue;
    const int ax = a % 3;
    const int ay = (a / 3) % 3;
    const int az = a / 9;

    for (int b = a + 1; b < 27; ++b) {
      if (b == 13 || neighbors[b] != 1)
        continue;
      const int bx = b % 3;
      const int by = (b / 3) % 3;
      const int bz = b / 9;

      const bool touching =
          abs(ax - bx) <= 1 && abs(ay - by) <= 1 && abs(az - bz) <= 1;
      if (touching)
        uf_union(a, b, parent);
    }
  }

  // Each foreground neighbour that is its own root is one component.
  int components = 0;
  for (int k = 0; k < 27; ++k) {
    if (k == 13)
      continue;
    if (neighbors[k] == 1 && parent[k] == k)
      ++components;
  }

  return components <= 1;
}
|
|
242
|
+
|
|
243
|
+
// Phase 1 of a thinning sub-iteration: each thread inspects one voxel and,
// if it lies on the `currentBorder` face (1=N, 2=S, 3=E, 4=W, 5=U, 6=B),
// is not an arc end point, and passes the Euler-invariance and simple-point
// tests, marks it with the value 2 as a deletion candidate. Candidates are
// resolved later (sequentially or per subgrid colour) before actual removal.
__global__ void mark_deletable_points_kernel(unsigned char *img, int d, int h,
                                             int w, int currentBorder) {
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  int z = blockIdx.z * blockDim.z + threadIdx.z;

  // Threads outside the volume (launch grid is rounded up) do nothing.
  if (x >= w || y >= h || z >= d)
    return;

  size_t idx = (size_t)z * (h * w) + y * w + x;
  // Skip background voxels and voxels already marked as candidates.
  if (img[idx] == 0 || img[idx] == 2)
    return;

  int neighbors[27];
  // Starts at -1 because the scan below also counts the centre voxel
  // (which is foreground here); the net result is the neighbour count only.
  int num_neighbors = -1;
  bool isBorderPoint = false;

  // Gather the full 3x3x3 neighbourhood; out-of-bounds voxels read as 0.
  for (int dz = -1; dz <= 1; ++dz) {
    for (int dy = -1; dy <= 1; ++dy) {
      for (int dx = -1; dx <= 1; ++dx) {
        int nx = x + dx;
        int ny = y + dy;
        int nz = z + dz;

        int n_idx = (dz + 1) * 9 + (dy + 1) * 3 + (dx + 1);

        int val = 0;
        if (nx >= 0 && nx < w && ny >= 0 && ny < h && nz >= 0 && nz < d) {
          size_t flat_n_idx = (size_t)nz * (h * w) + ny * w + nx;
          val = img[flat_n_idx];
        }

        // Any non-zero value (1 or the candidate marker 2) counts as
        // foreground for the topology tests.
        int binary_val = (val > 0) ? 1 : 0;
        neighbors[n_idx] = binary_val;

        if (binary_val == 1) {
          num_neighbors++;
        }

        // The voxel is a border point for this pass iff its face neighbour
        // in the direction selected by currentBorder is background.
        if (dx == 0 && dy == -1 && dz == 0 && currentBorder == 1 && val == 0)
          isBorderPoint = true; // N
        if (dx == 0 && dy == 1 && dz == 0 && currentBorder == 2 && val == 0)
          isBorderPoint = true; // S
        if (dx == 1 && dy == 0 && dz == 0 && currentBorder == 3 && val == 0)
          isBorderPoint = true; // E
        if (dx == -1 && dy == 0 && dz == 0 && currentBorder == 4 && val == 0)
          isBorderPoint = true; // W
        if (dx == 0 && dy == 0 && dz == 1 && currentBorder == 5 && val == 0)
          isBorderPoint = true; // U
        if (dx == 0 && dy == 0 && dz == -1 && currentBorder == 6 && val == 0)
          isBorderPoint = true; // B
      }
    }
  }

  // Only border points of the current direction are eligible.
  if (!isBorderPoint)
    return;
  // Arc end points (exactly one foreground neighbour) are never deleted.
  if (num_neighbors == 1)
    return;
  // Deletion must leave the Euler characteristic unchanged ...
  if (!is_euler_invariant(neighbors))
    return;
  // ... and must not disconnect the local neighbourhood.
  if (!is_simple_point(neighbors))
    return;

  img[idx] = 2; // mark for deletion
}
|
|
309
|
+
|
|
310
|
+
// Thrust predicate functor: returns true for voxels whose value is 2,
// i.e. voxels flagged "candidate for deletion" by the marking kernel.
// Used with thrust::copy_if over a counting iterator to compact the flat
// indices of all marked voxels into a dense array.
struct is_marked {
  // Raw device pointer into the image volume (not owned by this functor).
  unsigned char *img;
  __host__ __device__ is_marked(unsigned char *_img) : img(_img) {}
  // idx is a flat voxel index into img.
  __device__ bool operator()(const size_t &idx) const { return img[idx] == 2; }
};
|
|
315
|
+
|
|
316
|
+
// Re-validates marked voxels (value 2) for one parity "color" of the 2x2x2
// subgrid decomposition, and deletes those that are still simple points.
//
// Voxels of the same color are at least 2 apart along every axis, so no two
// voxels processed in the same launch share a 26-neighborhood — presumably
// this is what makes the parallel delete topologically safe; confirm against
// the subgrid-thinning literature.
//
//   img            - volume in global memory; 0 = background, 1 = foreground,
//                    2 = marked candidate (see mark_deletable_points_kernel)
//   d, h, w        - volume extents (z, y, x)
//   marked_indices - compacted flat indices of marked voxels
//   count          - number of entries in marked_indices
//   color          - parity class 0..7 handled by this launch
//   changed        - device counter, incremented once per deleted voxel
__global__ void subgrid_recheck_kernel(unsigned char *img, int d, int h, int w,
                                       unsigned int *marked_indices, int count,
                                       int color, int *changed) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= count)
    return;

  size_t idx = marked_indices[i];
  if (img[idx] != 2)
    return; // Already handled by previous color or untouched

  // Decode flat index into (x, y, z).
  int x = idx % w;
  int y = (idx / w) % h;
  int z = idx / (w * h);

  // Parity class of this voxel; only process the current color.
  int p = (x % 2) + (y % 2) * 2 + (z % 2) * 4;
  if (p != color)
    return;

  img[idx] = 0; // Temporarily delete

  // Gather the 3x3x3 binarized neighborhood (out-of-bounds reads as 0).
  // Note: the center entry reflects the temporary deletion above.
  int neighbors[27];
  for (int dz = -1; dz <= 1; ++dz) {
    for (int dy = -1; dy <= 1; ++dy) {
      for (int dx = -1; dx <= 1; ++dx) {
        int nx = x + dx;
        int ny = y + dy;
        int nz = z + dz;
        int n_idx = (dz + 1) * 9 + (dy + 1) * 3 + (dx + 1);

        int val = 0;
        if (nx >= 0 && nx < w && ny >= 0 && ny < h && nz >= 0 && nz < d) {
          size_t flat_n_idx = (size_t)nz * (h * w) + ny * w + nx;
          val = img[flat_n_idx];
        }
        neighbors[n_idx] = (val > 0) ? 1 : 0;
      }
    }
  }

  // Keep the deletion only if removing this voxel preserves topology.
  if (!is_simple_point(neighbors)) {
    img[idx] = 1; // Not simple anymore, restore
  } else {
    atomicAdd(changed, 1);
  }
}
|
|
362
|
+
|
|
363
|
+
// Scatter kernel: writes `val` into img at each of the `count` flat voxel
// indices in `indices`. Used by the CPU-sequential mode to push batched
// deletions (val = 0) and restorations (val = 1) back to the device.
__global__ void apply_updates_kernel(unsigned char *img, unsigned int *indices,
                                     int count, unsigned char val) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= count)
    return;
  img[indices[tid]] = val;
}
|
|
370
|
+
|
|
371
|
+
// In-place 3D binary thinning (skeletonization) of a uint8 CUDA volume.
//
//   image - 3D contiguous CUDA ByteTensor, modified in place
//           (nonzero = foreground)
//   mode  - 1: hybrid mode; candidates are marked on the GPU but
//              re-checked sequentially on the CPU (comments call this the
//              "Exact ITK" path — TODO confirm it matches ITK exactly)
//           0: fully-GPU mode; candidates are re-checked in parallel per
//              2x2x2 subgrid color via subgrid_recheck_kernel
//
// NOTE(review): cudaMalloc / cudaMemcpy return codes are not checked — an
// allocation failure would be silently ignored. Consider C10_CUDA_CHECK.
// NOTE(review): flat indices are stored as unsigned int, so volumes with
// more than 2^32 voxels would truncate; the kernels share this index type,
// so widening it touches their signatures too.
// NOTE(review): for a mode other than 0 or 1, marked voxels (value 2) are
// never resolved and would remain in the output.
void binary_thinning_cuda(torch::Tensor image, int mode) {
  TORCH_CHECK(image.is_cuda(), "image must be a CUDA tensor");
  TORCH_CHECK(image.is_contiguous(), "image must be contiguous");
  TORCH_CHECK(image.scalar_type() == torch::kByte,
              "image must be a ByteTensor (uint8)");
  TORCH_CHECK(image.dim() == 3, "image must be a 3D tensor");

  int d = image.size(0);
  int h = image.size(1);
  int w = image.size(2);
  size_t total_size = (size_t)d * h * w;

  unsigned char *d_img = image.data_ptr<unsigned char>();

  // Device counter of voxels deleted in the current border pass (mode 0).
  int *d_changed;
  cudaMalloc(&d_changed, sizeof(int));

  // Dense array of flat indices of marked voxels (compacted by copy_if).
  unsigned int *d_marked_indices = nullptr;
  unsigned char *h_img = nullptr;

  cudaMalloc(&d_marked_indices, total_size * sizeof(unsigned int));
  if (mode == 1) { // Mode 1: Exact ITK Hybrid
    // Host mirror of the volume; kept in sync manually below.
    h_img = new unsigned char[total_size];
    cudaMemcpy(h_img, d_img, total_size, cudaMemcpyDeviceToHost);
  }

  dim3 blockSize(8, 4, 4);
  dim3 gridSize((w + blockSize.x - 1) / blockSize.x,
                (h + blockSize.y - 1) / blockSize.y,
                (d + blockSize.z - 1) / blockSize.z);

  // Outer loop: repeat all six border directions until a full sweep
  // deletes nothing.
  int h_changed = 0;
  do {
    h_changed = 0;
    for (int border = 1; border <= 6; ++border) {
      // Mark candidate voxels (sets them to 2) for this border direction.
      mark_deletable_points_kernel<<<gridSize, blockSize>>>(d_img, d, h, w,
                                                            border);

      cudaMemset(d_changed, 0, sizeof(int));

      // Compact the flat indices of all marked voxels into
      // d_marked_indices using thrust::copy_if (implicit sync on return).
      thrust::counting_iterator<size_t> first(0);
      thrust::counting_iterator<size_t> last(total_size);
      thrust::device_ptr<unsigned int> dest(d_marked_indices);

      auto end_ptr =
          thrust::copy_if(thrust::device, first, last, dest, is_marked(d_img));
      int h_count = end_ptr - dest;

      if (h_count > 0) {
        if (mode == 1) {
          // Mode 1: CPU Sequential (Exact ITK Match)
          std::vector<unsigned int> h_marked(h_count);
          cudaMemcpy(h_marked.data(), d_marked_indices,
                     h_count * sizeof(unsigned int), cudaMemcpyDeviceToHost);

          // Classify each candidate sequentially on the host copy, so a
          // later candidate sees the effect of earlier deletions.
          std::vector<unsigned int> h_deleted;
          std::vector<unsigned int> h_restored;
          h_deleted.reserve(h_count);
          h_restored.reserve(h_count);

          for (int i = 0; i < h_count; ++i) {
            unsigned int idx = h_marked[i];

            int x = idx % w;
            int y = (idx / w) % h;
            int z = idx / (w * h);

            h_img[idx] = 0; // Temporarily delete

            // Gather the 3x3x3 binarized neighborhood from the host copy
            // (out-of-bounds reads as background).
            int neighbors[27];
            for (int dz = -1; dz <= 1; ++dz) {
              for (int dy = -1; dy <= 1; ++dy) {
                for (int dx = -1; dx <= 1; ++dx) {
                  int nx = x + dx;
                  int ny = y + dy;
                  int nz = z + dz;
                  int n_idx = (dz + 1) * 9 + (dy + 1) * 3 + (dx + 1);

                  int val = 0;
                  if (nx >= 0 && nx < w && ny >= 0 && ny < h && nz >= 0 &&
                      nz < d) {
                    size_t flat_n_idx = (size_t)nz * (h * w) + ny * w + nx;
                    val = h_img[flat_n_idx];
                  }
                  neighbors[n_idx] = (val > 0) ? 1 : 0;
                }
              }
            }

            if (!is_simple_point(neighbors)) {
              h_img[idx] = 1; // Not simple anymore, restore
              h_restored.push_back(idx);
            } else {
              h_deleted.push_back(idx);
            }
          }

          // Replay the host decisions on the device image. Every marked
          // voxel ends up in exactly one of the two lists, so no value-2
          // marks survive. d_marked_indices is reused as a staging buffer.
          if (!h_deleted.empty()) {
            cudaMemcpy(d_marked_indices, h_deleted.data(),
                       h_deleted.size() * sizeof(unsigned int),
                       cudaMemcpyHostToDevice);
            int threads = 256;
            int blocks = (h_deleted.size() + threads - 1) / threads;
            apply_updates_kernel<<<blocks, threads>>>(d_img, d_marked_indices,
                                                      h_deleted.size(), 0);
            h_changed += h_deleted.size();
          }

          if (!h_restored.empty()) {
            cudaMemcpy(d_marked_indices, h_restored.data(),
                       h_restored.size() * sizeof(unsigned int),
                       cudaMemcpyHostToDevice);
            int threads = 256;
            int blocks = (h_restored.size() + threads - 1) / threads;
            apply_updates_kernel<<<blocks, threads>>>(d_img, d_marked_indices,
                                                      h_restored.size(), 1);
          }
        } else if (mode == 0) {
          // Mode 0: GPU Subgrid (8-color parallel) - Topologically safe, purely
          // GPU
          int threads = 256;
          int blocks = (h_count + threads - 1) / threads;
          for (int color = 0; color < 8; ++color) {
            subgrid_recheck_kernel<<<blocks, threads>>>(
                d_img, d, h, w, d_marked_indices, h_count, color, d_changed);
          }
          // Implicit sync: this memcpy waits for the launches above.
          int changed_this_border = 0;
          cudaMemcpy(&changed_this_border, d_changed, sizeof(int),
                     cudaMemcpyDeviceToHost);
          h_changed += changed_this_border;
        }
      }
    }
  } while (h_changed > 0);

  cudaFree(d_marked_indices);
  if (mode == 1) {
    delete[] h_img;
  }
  cudaFree(d_changed);
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from setuptools import setup, Extension
|
|
3
|
+
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
|
|
4
|
+
|
|
5
|
+
# Read README.md for long_description
this_directory = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as f:
    long_description = f.read()

# Both branches build the same extension (same name, same sources); only the
# extension class differs depending on whether a CUDA toolchain is present.
_EXT_NAME = "binary_thinning_3d.cuda_thinning_ext"
_EXT_SOURCES = ["csrc/thinning.cpp", "csrc/thinning_kernel.cu"]

# Check if CUDA is available. If not, we are likely building a source
# distribution (sdist) or just gathering metadata, so we use a dummy extension.
if CUDA_HOME is None:
    # Use a dummy extension so metadata can be gathered without CUDA checks
    ext_modules = [Extension(_EXT_NAME, _EXT_SOURCES)]
else:
    # Use the real CUDAExtension for binary wheel builds
    ext_modules = [CUDAExtension(_EXT_NAME, _EXT_SOURCES)]

setup(
    name="binary_thinning_3d_cuda",
    version="1.1.8",
    author="Shiyang Chen",
    author_email="sychen52@gmail.com",
    description="A fast 3D binary thinning implementation using CUDA and PyTorch.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/sychen52/binary_thinning_3d_cuda",
    project_urls={
        "Bug Tracker": "https://github.com/sychen52/binary_thinning_3d_cuda/issues",
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: POSIX :: Linux",
        "Topic :: Scientific/Engineering :: Image Processing",
        "Intended Audience :: Science/Research",
    ],
    packages=["binary_thinning_3d"],
    install_requires=["torch", "numpy"],
    extras_require={"dev": ["SimpleITK", "itk-thickness3d"]},
    python_requires=">=3.8",
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension},
)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
from binary_thinning_3d import binary_thinning
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_thinning():
    """Thin a 3x3x3 cube and check the skeleton keeps its center voxel."""
    # Create a 5x5x5 block and put a 3x3x3 cube in the middle
    vol = torch.zeros((5, 5, 5), dtype=torch.uint8, device="cuda")
    vol[1:4, 1:4, 1:4] = 1

    print("Original sum:", vol.sum().item())

    # Run thinning
    binary_thinning(vol)

    print("Thinned sum:", vol.sum().item())

    print("Coordinates of remaining points:")
    for point in torch.nonzero(vol):
        print(point.tolist())

    print("Center pixel value:", vol[2, 2, 2].item())
    # The skeleton of the cube should be three voxels including the center.
    assert vol.sum().item() == 3
    assert vol[2, 2, 2].item() == 1
    print("Test passed!")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Allow running this test directly as a script, without pytest.
if __name__ == "__main__":
    test_thinning()
|