learning3d 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. learning3d/__init__.py +2 -0
  2. learning3d/data_utils/__init__.py +4 -0
  3. learning3d/data_utils/dataloaders.py +454 -0
  4. learning3d/data_utils/user_data.py +119 -0
  5. learning3d/examples/test_dcp.py +139 -0
  6. learning3d/examples/test_deepgmr.py +144 -0
  7. learning3d/examples/test_flownet.py +113 -0
  8. learning3d/examples/test_masknet.py +159 -0
  9. learning3d/examples/test_masknet2.py +162 -0
  10. learning3d/examples/test_pcn.py +118 -0
  11. learning3d/examples/test_pcrnet.py +120 -0
  12. learning3d/examples/test_pnlk.py +121 -0
  13. learning3d/examples/test_pointconv.py +126 -0
  14. learning3d/examples/test_pointnet.py +121 -0
  15. learning3d/examples/test_prnet.py +126 -0
  16. learning3d/examples/test_rpmnet.py +120 -0
  17. learning3d/examples/train_PointNetLK.py +240 -0
  18. learning3d/examples/train_dcp.py +249 -0
  19. learning3d/examples/train_deepgmr.py +244 -0
  20. learning3d/examples/train_flownet.py +259 -0
  21. learning3d/examples/train_masknet.py +239 -0
  22. learning3d/examples/train_pcn.py +216 -0
  23. learning3d/examples/train_pcrnet.py +228 -0
  24. learning3d/examples/train_pointconv.py +245 -0
  25. learning3d/examples/train_pointnet.py +244 -0
  26. learning3d/examples/train_prnet.py +229 -0
  27. learning3d/examples/train_rpmnet.py +228 -0
  28. learning3d/losses/__init__.py +12 -0
  29. learning3d/losses/chamfer_distance.py +51 -0
  30. learning3d/losses/classification.py +14 -0
  31. learning3d/losses/correspondence_loss.py +10 -0
  32. learning3d/losses/cuda/chamfer_distance/__init__.py +1 -0
  33. learning3d/losses/cuda/chamfer_distance/chamfer_distance.cpp +185 -0
  34. learning3d/losses/cuda/chamfer_distance/chamfer_distance.cu +209 -0
  35. learning3d/losses/cuda/chamfer_distance/chamfer_distance.py +66 -0
  36. learning3d/losses/cuda/emd_torch/pkg/emd_loss_layer.py +41 -0
  37. learning3d/losses/cuda/emd_torch/pkg/include/cuda/emd.cuh +347 -0
  38. learning3d/losses/cuda/emd_torch/pkg/include/cuda_helper.h +18 -0
  39. learning3d/losses/cuda/emd_torch/pkg/include/emd.h +54 -0
  40. learning3d/losses/cuda/emd_torch/pkg/layer/__init__.py +1 -0
  41. learning3d/losses/cuda/emd_torch/pkg/layer/emd_loss_layer.py +40 -0
  42. learning3d/losses/cuda/emd_torch/pkg/src/cuda/emd.cu +70 -0
  43. learning3d/losses/cuda/emd_torch/pkg/src/emd.cpp +1 -0
  44. learning3d/losses/cuda/emd_torch/setup.py +29 -0
  45. learning3d/losses/emd.py +16 -0
  46. learning3d/losses/frobenius_norm.py +21 -0
  47. learning3d/losses/rmse_features.py +16 -0
  48. learning3d/models/__init__.py +23 -0
  49. learning3d/models/classifier.py +41 -0
  50. learning3d/models/dcp.py +92 -0
  51. learning3d/models/deepgmr.py +165 -0
  52. learning3d/models/dgcnn.py +92 -0
  53. learning3d/models/flownet3d.py +446 -0
  54. learning3d/models/masknet.py +84 -0
  55. learning3d/models/masknet2.py +264 -0
  56. learning3d/models/pcn.py +164 -0
  57. learning3d/models/pcrnet.py +74 -0
  58. learning3d/models/pointconv.py +108 -0
  59. learning3d/models/pointnet.py +108 -0
  60. learning3d/models/pointnetlk.py +173 -0
  61. learning3d/models/pooling.py +15 -0
  62. learning3d/models/ppfnet.py +102 -0
  63. learning3d/models/prnet.py +431 -0
  64. learning3d/models/rpmnet.py +359 -0
  65. learning3d/models/segmentation.py +38 -0
  66. learning3d/ops/__init__.py +0 -0
  67. learning3d/ops/data_utils.py +45 -0
  68. learning3d/ops/invmat.py +134 -0
  69. learning3d/ops/quaternion.py +218 -0
  70. learning3d/ops/se3.py +157 -0
  71. learning3d/ops/sinc.py +229 -0
  72. learning3d/ops/so3.py +213 -0
  73. learning3d/ops/transform_functions.py +342 -0
  74. learning3d/utils/__init__.py +9 -0
  75. learning3d/utils/lib/build/lib.linux-x86_64-3.5/pointnet2_cuda.cpython-35m-x86_64-linux-gnu.so +0 -0
  76. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/ball_query.o +0 -0
  77. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/ball_query_gpu.o +0 -0
  78. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/group_points.o +0 -0
  79. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/group_points_gpu.o +0 -0
  80. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/interpolate.o +0 -0
  81. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/interpolate_gpu.o +0 -0
  82. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/pointnet2_api.o +0 -0
  83. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/sampling.o +0 -0
  84. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/sampling_gpu.o +0 -0
  85. learning3d/utils/lib/dist/pointnet2-0.0.0-py3.5-linux-x86_64.egg +0 -0
  86. learning3d/utils/lib/pointnet2.egg-info/SOURCES.txt +14 -0
  87. learning3d/utils/lib/pointnet2.egg-info/dependency_links.txt +1 -0
  88. learning3d/utils/lib/pointnet2.egg-info/top_level.txt +1 -0
  89. learning3d/utils/lib/pointnet2_modules.py +160 -0
  90. learning3d/utils/lib/pointnet2_utils.py +318 -0
  91. learning3d/utils/lib/pytorch_utils.py +236 -0
  92. learning3d/utils/lib/setup.py +23 -0
  93. learning3d/utils/lib/src/ball_query.cpp +25 -0
  94. learning3d/utils/lib/src/ball_query_gpu.cu +67 -0
  95. learning3d/utils/lib/src/ball_query_gpu.h +15 -0
  96. learning3d/utils/lib/src/cuda_utils.h +15 -0
  97. learning3d/utils/lib/src/group_points.cpp +36 -0
  98. learning3d/utils/lib/src/group_points_gpu.cu +86 -0
  99. learning3d/utils/lib/src/group_points_gpu.h +22 -0
  100. learning3d/utils/lib/src/interpolate.cpp +65 -0
  101. learning3d/utils/lib/src/interpolate_gpu.cu +233 -0
  102. learning3d/utils/lib/src/interpolate_gpu.h +36 -0
  103. learning3d/utils/lib/src/pointnet2_api.cpp +25 -0
  104. learning3d/utils/lib/src/sampling.cpp +46 -0
  105. learning3d/utils/lib/src/sampling_gpu.cu +253 -0
  106. learning3d/utils/lib/src/sampling_gpu.h +29 -0
  107. learning3d/utils/pointconv_util.py +382 -0
  108. learning3d/utils/ppfnet_util.py +244 -0
  109. learning3d/utils/svd.py +59 -0
  110. learning3d/utils/transformer.py +243 -0
  111. learning3d-0.0.1.dist-info/LICENSE +21 -0
  112. learning3d-0.0.1.dist-info/METADATA +271 -0
  113. learning3d-0.0.1.dist-info/RECORD +115 -0
  114. learning3d-0.0.1.dist-info/WHEEL +5 -0
  115. learning3d-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,23 @@
1
+ from setuptools import setup
2
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension
3
+
# Build configuration for the ``pointnet2_cuda`` extension module.
# Every C++ binding file and CUDA kernel file compiled into the extension.
_EXTENSION_SOURCES = [
    'src/pointnet2_api.cpp',

    'src/ball_query.cpp',
    'src/ball_query_gpu.cu',
    'src/group_points.cpp',
    'src/group_points_gpu.cu',
    'src/interpolate.cpp',
    'src/interpolate_gpu.cu',
    'src/sampling.cpp',
    'src/sampling_gpu.cu',
]

# Per-compiler flags: '-g' for the host C++ compiler, '-O2' for nvcc.
_COMPILE_ARGS = {
    'cxx': ['-g'],
    'nvcc': ['-O2'],
}

_pointnet2_extension = CUDAExtension(
    'pointnet2_cuda',
    _EXTENSION_SOURCES,
    extra_compile_args=_COMPILE_ARGS,
)

setup(
    name='pointnet2',
    ext_modules=[_pointnet2_extension],
    cmdclass={'build_ext': BuildExtension},
)
@@ -0,0 +1,25 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <vector>
3
+ #include <THC/THC.h>
4
+ #include <cuda.h>
5
+ #include <cuda_runtime_api.h>
6
+ #include "ball_query_gpu.h"
7
+
8
+ extern THCState *state;
9
+
10
+ #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
11
+ #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
12
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
13
+
14
+ int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
15
+ at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
16
+ CHECK_INPUT(new_xyz_tensor);
17
+ CHECK_INPUT(xyz_tensor);
18
+ const float *new_xyz = new_xyz_tensor.data<float>();
19
+ const float *xyz = xyz_tensor.data<float>();
20
+ int *idx = idx_tensor.data<int>();
21
+
22
+ cudaStream_t stream = THCState_getCurrentStream(state);
23
+ ball_query_kernel_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx, stream);
24
+ return 1;
25
+ }
@@ -0,0 +1,67 @@
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+
5
+ #include "ball_query_gpu.h"
6
+ #include "cuda_utils.h"
7
+
8
+
// For every query point, records the indices of the first `nsample` points of
// the same batch element whose squared distance to the query is below
// radius^2 (points are scanned in index order).
//
// new_xyz: (B, M, 3)
// xyz:     (B, N, 3)
// output:
//   idx:   (B, M, nsample)
//
// One thread handles one query point. When the first neighbour is found, every
// slot is pre-filled with its index so that unused slots repeat it. If no point
// lies inside the ball, the query's slots are left untouched.
__global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int nsample,
    const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) {
    const int batch = blockIdx.y;
    const int query = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || query >= m) return;

    // Advance the pointers to this thread's query, point cloud and output row.
    new_xyz += batch * m * 3 + query * 3;
    xyz += batch * n * 3;
    idx += batch * m * nsample + query * nsample;

    const float radius_sq = radius * radius;
    const float qx = new_xyz[0];
    const float qy = new_xyz[1];
    const float qz = new_xyz[2];

    int found = 0;
    for (int k = 0; k < n; ++k) {
        const float px = xyz[k * 3 + 0];
        const float py = xyz[k * 3 + 1];
        const float pz = xyz[k * 3 + 2];
        const float dist_sq = (qx - px) * (qx - px) + (qy - py) * (qy - py) + (qz - pz) * (qz - pz);

        // Written as a negated '<' so a NaN distance is skipped as well.
        if (!(dist_sq < radius_sq)) continue;

        if (found == 0) {
            // First hit: seed every slot with it.
            for (int slot = 0; slot < nsample; ++slot) {
                idx[slot] = k;
            }
        }
        idx[found] = k;
        ++found;
        if (found >= nsample) break;
    }
}
46
+
47
+
// Launches ball_query_kernel_fast on `stream`.
//
// new_xyz: (B, M, 3)
// xyz:     (B, N, 3)
// output:
//   idx:   (B, M, nsample)
//
// Grid: x covers the m query points in chunks of THREADS_PER_BLOCK, y is the
// batch. A launch error is printed to stderr and terminates the process.
void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample,
    const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) {
    const dim3 grid(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    const dim3 block(THREADS_PER_BLOCK);

    ball_query_kernel_fast<<<grid, block, 0, stream>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
@@ -0,0 +1,15 @@
1
+ #ifndef _BALL_QUERY_GPU_H
2
+ #define _BALL_QUERY_GPU_H
3
+
4
+ #include <torch/serialize/tensor.h>
5
+ #include <vector>
6
+ #include <cuda.h>
7
+ #include <cuda_runtime_api.h>
8
+
9
+ int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
10
+ at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
11
+
12
+ void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample,
13
+ const float *xyz, const float *new_xyz, int *idx, cudaStream_t stream);
14
+
15
+ #endif
@@ -0,0 +1,15 @@
1
+ #ifndef _CUDA_UTILS_H
2
+ #define _CUDA_UTILS_H
3
+
4
+ #include <cmath>
5
+
// Upper bound on the value returned by opt_n_threads().
#define TOTAL_THREADS 1024
// Block size used by the kernel launchers that include this header.
#define THREADS_PER_BLOCK 256
// Ceiling division for non-negative m and positive n.
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))

// Returns the largest power of two that does not exceed work_size, clamped to
// the range [1, TOTAL_THREADS].
//
// Uses integer doubling instead of floating-point logarithms, so the result is
// exact and well defined for every input: work_size <= 0 yields 1. It also
// avoids unqualified min/max, which <cmath> does not declare for host-only
// translation units.
inline int opt_n_threads(int work_size) {
    int threads = 1;
    // Comparing against work_size / 2 keeps the doubling free of overflow.
    while (threads < TOTAL_THREADS && threads <= work_size / 2) {
        threads *= 2;
    }
    return threads < TOTAL_THREADS ? threads : TOTAL_THREADS;
}
15
+ #endif
@@ -0,0 +1,36 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <cuda.h>
3
+ #include <cuda_runtime_api.h>
4
+ #include <vector>
5
+ #include <THC/THC.h>
6
+ #include "group_points_gpu.h"
7
+
8
+ extern THCState *state;
9
+
10
+
11
+ int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
12
+ at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
13
+
14
+ float *grad_points = grad_points_tensor.data<float>();
15
+ const int *idx = idx_tensor.data<int>();
16
+ const float *grad_out = grad_out_tensor.data<float>();
17
+
18
+ cudaStream_t stream = THCState_getCurrentStream(state);
19
+
20
+ group_points_grad_kernel_launcher_fast(b, c, n, npoints, nsample, grad_out, idx, grad_points, stream);
21
+ return 1;
22
+ }
23
+
24
+
25
+ int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
26
+ at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
27
+
28
+ const float *points = points_tensor.data<float>();
29
+ const int *idx = idx_tensor.data<int>();
30
+ float *out = out_tensor.data<float>();
31
+
32
+ cudaStream_t stream = THCState_getCurrentStream(state);
33
+
34
+ group_points_kernel_launcher_fast(b, c, n, npoints, nsample, points, idx, out, stream);
35
+ return 1;
36
+ }
@@ -0,0 +1,86 @@
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+
4
+ #include "cuda_utils.h"
5
+ #include "group_points_gpu.h"
6
+
7
+
// Backward pass of point grouping: scatters each output gradient back onto the
// source point it was gathered from.
//
// grad_out:      (B, C, npoints, nsample)
// idx:           (B, npoints, nsample)
// output:
//   grad_points: (B, C, N)
//
// One thread per (batch, channel, point, sample) element. Several samples may
// reference the same source point, hence the atomic accumulation.
__global__ void group_points_grad_kernel_fast(int b, int c, int n, int npoints, int nsample,
    const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
    const int batch = blockIdx.z;
    const int channel = blockIdx.y;
    const int flat = blockIdx.x * blockDim.x + threadIdx.x;
    const int point = flat / nsample;
    if (batch >= b || channel >= c || point >= npoints) return;

    const int sample = flat % nsample;
    const int grad_offset = batch * c * npoints * nsample + channel * npoints * nsample + point * nsample + sample;
    const int idx_offset = batch * npoints * nsample + point * nsample + sample;
    const int source = idx[idx_offset];

    atomicAdd(grad_points + batch * c * n + channel * n + source, grad_out[grad_offset]);
}
26
+
// Launches group_points_grad_kernel_fast on `stream`.
//
// grad_out:      (B, C, npoints, nsample)
// idx:           (B, npoints, nsample)
// output:
//   grad_points: (B, C, N)
//
// Grid: x covers npoints * nsample elements in chunks of THREADS_PER_BLOCK,
// y is the channel, z is the batch. A launch error is printed to stderr and
// terminates the process.
void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
    const dim3 grid(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);
    const dim3 block(THREADS_PER_BLOCK);

    group_points_grad_kernel_fast<<<grid, block, 0, stream>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
45
+
46
+
// Forward pass of point grouping: gathers, for every (point, sample) pair, the
// feature value of the source point selected by idx.
//
// points: (B, C, N)
// idx:    (B, npoints, nsample)
// output:
//   out:  (B, C, npoints, nsample)
//
// One thread per (batch, channel, point, sample) element.
__global__ void group_points_kernel_fast(int b, int c, int n, int npoints, int nsample,
    const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
    const int batch = blockIdx.z;
    const int channel = blockIdx.y;
    const int flat = blockIdx.x * blockDim.x + threadIdx.x;
    const int point = flat / nsample;
    if (batch >= b || channel >= c || point >= npoints) return;

    const int sample = flat % nsample;
    const int source = idx[batch * npoints * nsample + point * nsample + sample];

    const int read_at = batch * c * n + channel * n + source;
    const int write_at = batch * c * npoints * nsample + channel * npoints * nsample + point * nsample + sample;
    out[write_at] = points[read_at];
}
67
+
68
+
// Launches group_points_kernel_fast on `stream`.
//
// points: (B, C, N)
// idx:    (B, npoints, nsample)
// output:
//   out:  (B, C, npoints, nsample)
//
// Grid: x covers npoints * nsample elements in chunks of THREADS_PER_BLOCK,
// y is the channel, z is the batch. A launch error is printed to stderr and
// terminates the process.
void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
    const float *points, const int *idx, float *out, cudaStream_t stream) {
    const dim3 grid(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);
    const dim3 block(THREADS_PER_BLOCK);

    group_points_kernel_fast<<<grid, block, 0, stream>>>(b, c, n, npoints, nsample, points, idx, out);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
@@ -0,0 +1,22 @@
1
+ #ifndef _GROUP_POINTS_GPU_H
2
+ #define _GROUP_POINTS_GPU_H
3
+
4
+ #include <torch/serialize/tensor.h>
5
+ #include <cuda.h>
6
+ #include <cuda_runtime_api.h>
7
+ #include <vector>
8
+
9
+
10
+ int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
11
+ at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
12
+
13
+ void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
14
+ const float *points, const int *idx, float *out, cudaStream_t stream);
15
+
16
+ int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
17
+ at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
18
+
19
+ void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
20
+ const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream);
21
+
22
+ #endif
@@ -0,0 +1,65 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <vector>
3
+ #include <THC/THC.h>
4
+ #include <math.h>
5
+ #include <stdio.h>
6
+ #include <stdlib.h>
7
+ #include <cuda.h>
8
+ #include <cuda_runtime_api.h>
9
+ #include "interpolate_gpu.h"
10
+
11
+ extern THCState *state;
12
+
13
+
14
+ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
15
+ at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
16
+ const float *unknown = unknown_tensor.data<float>();
17
+ const float *known = known_tensor.data<float>();
18
+ float *dist2 = dist2_tensor.data<float>();
19
+ int *idx = idx_tensor.data<int>();
20
+
21
+ cudaStream_t stream = THCState_getCurrentStream(state);
22
+ three_nn_kernel_launcher_fast(b, n, m, unknown, known, dist2, idx, stream);
23
+ }
24
+
25
+ void knn_wrapper_fast(int b, int n, int m, int k, at::Tensor unknown_tensor,
26
+ at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
27
+ const float *unknown = unknown_tensor.data<float>();
28
+ const float *known = known_tensor.data<float>();
29
+ float *dist2 = dist2_tensor.data<float>();
30
+ int *idx = idx_tensor.data<int>();
31
+
32
+ cudaStream_t stream = THCState_getCurrentStream(state);
33
+ knn_kernel_launcher_fast(b, n, m, k, unknown, known, dist2, idx, stream);
34
+ }
35
+
36
+
37
+ void three_interpolate_wrapper_fast(int b, int c, int m, int n,
38
+ at::Tensor points_tensor,
39
+ at::Tensor idx_tensor,
40
+ at::Tensor weight_tensor,
41
+ at::Tensor out_tensor) {
42
+
43
+ const float *points = points_tensor.data<float>();
44
+ const float *weight = weight_tensor.data<float>();
45
+ float *out = out_tensor.data<float>();
46
+ const int *idx = idx_tensor.data<int>();
47
+
48
+ cudaStream_t stream = THCState_getCurrentStream(state);
49
+ three_interpolate_kernel_launcher_fast(b, c, m, n, points, idx, weight, out, stream);
50
+ }
51
+
52
+ void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m,
53
+ at::Tensor grad_out_tensor,
54
+ at::Tensor idx_tensor,
55
+ at::Tensor weight_tensor,
56
+ at::Tensor grad_points_tensor) {
57
+
58
+ const float *grad_out = grad_out_tensor.data<float>();
59
+ const float *weight = weight_tensor.data<float>();
60
+ float *grad_points = grad_points_tensor.data<float>();
61
+ const int *idx = idx_tensor.data<int>();
62
+
63
+ cudaStream_t stream = THCState_getCurrentStream(state);
64
+ three_interpolate_grad_kernel_launcher_fast(b, c, n, m, grad_out, idx, weight, grad_points, stream);
65
+ }
@@ -0,0 +1,233 @@
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+
5
+ #include "cuda_utils.h"
6
+ #include "interpolate_gpu.h"
7
+
8
+
// For every point in `unknown`, finds the k closest points in `known` of the
// same batch element and writes their squared distances (ascending) and indices.
//
// unknown: (B, N, 3)
// known:   (B, M, 3)
// output:
//   dist2: (B, N, k)
//   idx:   (B, N, k)
//
// One thread per unknown point. The running top-k list lives in fixed
// 200-entry per-thread arrays, so k must not exceed 200. Slots never filled
// (fewer than k known points) keep the sentinel distance 1e40 and index 0.
__global__ void knn_kernel_fast(int b, int n, int m, int k, const float *__restrict__ unknown,
    const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
    const int batch = blockIdx.y;
    const int query = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || query >= n) return;

    // Advance the pointers to this thread's query point, cloud and output rows.
    unknown += batch * n * 3 + query * 3;
    known += batch * m * 3;
    dist2 += batch * n * k + query * k;
    idx += batch * n * k + query * k;

    const float ux = unknown[0];
    const float uy = unknown[1];
    const float uz = unknown[2];

    double nearest_dist[200];
    int nearest_idx[200];
    for (int slot = 0; slot < k; ++slot) {
        nearest_dist[slot] = 1e40;
        nearest_idx[slot] = 0;
    }

    for (int cand = 0; cand < m; ++cand) {
        const float x = known[cand * 3 + 0];
        const float y = known[cand * 3 + 1];
        const float z = known[cand * 3 + 2];
        const float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);

        // First slot whose stored distance is strictly larger than d.
        int pos = 0;
        while (pos < k && !(d < nearest_dist[pos])) {
            ++pos;
        }
        if (pos >= k) continue;  // not among the k best seen so far

        // Shift the tail down by one to open the slot, dropping the last entry.
        for (int slot = k - 1; slot > pos; --slot) {
            nearest_dist[slot] = nearest_dist[slot - 1];
            nearest_idx[slot] = nearest_idx[slot - 1];
        }
        nearest_dist[pos] = d;
        nearest_idx[pos] = cand;
    }

    for (int slot = 0; slot < k; ++slot) {
        idx[slot] = nearest_idx[slot];
        dist2[slot] = nearest_dist[slot];
    }
}
58
+
59
+
// Launches knn_kernel_fast on `stream`.
//
// unknown: (B, N, 3)
// known:   (B, M, 3)
// output:
//   dist2: (B, N, k)
//   idx:   (B, N, k)
//
// Grid: x covers the n unknown points in chunks of THREADS_PER_BLOCK, y is the
// batch. An unsupported k or a launch error is printed to stderr and
// terminates the process, matching the other launchers' error handling.
void knn_kernel_launcher_fast(int b, int n, int m, int k, const float *unknown,
    const float *known, float *dist2, int *idx, cudaStream_t stream) {
    // knn_kernel_fast keeps its candidate list in fixed 200-entry per-thread
    // arrays; a larger k would make every thread write past them.
    const int max_k = 200;
    if (k > max_k) {
        fprintf(stderr, "knn_kernel_launcher_fast : k = %d exceeds the supported maximum of %d\n", k, max_k);
        exit(-1);
    }

    cudaError_t err;
    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    knn_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, k, unknown, known, dist2, idx);

    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
80
+
// For every point in `unknown`, finds the three closest points in `known` of
// the same batch element and writes their squared distances (ascending) and
// indices.
//
// unknown: (B, N, 3)
// known:   (B, M, 3)
// output:
//   dist2: (B, N, 3)
//   idx:   (B, N, 3)
//
// One thread per unknown point. Slots never filled (fewer than three known
// points) keep the sentinel distance 1e40 and index 0.
__global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restrict__ unknown,
    const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
    const int batch = blockIdx.y;
    const int query = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || query >= n) return;

    // Advance the pointers to this thread's query point, cloud and output rows.
    unknown += batch * n * 3 + query * 3;
    known += batch * m * 3;
    dist2 += batch * n * 3 + query * 3;
    idx += batch * n * 3 + query * 3;

    const float ux = unknown[0];
    const float uy = unknown[1];
    const float uz = unknown[2];

    // Sorted top-3: first <= second <= third.
    double first_d = 1e40, second_d = 1e40, third_d = 1e40;
    int first_i = 0, second_i = 0, third_i = 0;

    for (int cand = 0; cand < m; ++cand) {
        const float x = known[cand * 3 + 0];
        const float y = known[cand * 3 + 1];
        const float z = known[cand * 3 + 2];
        const float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);

        if (d < first_d) {
            // New closest: everything moves down one rank.
            third_d = second_d;  third_i = second_i;
            second_d = first_d;  second_i = first_i;
            first_d = d;         first_i = cand;
        } else if (d < second_d) {
            third_d = second_d;  third_i = second_i;
            second_d = d;        second_i = cand;
        } else if (d < third_d) {
            third_d = d;         third_i = cand;
        }
    }

    dist2[0] = first_d;
    dist2[1] = second_d;
    dist2[2] = third_d;
    idx[0] = first_i;
    idx[1] = second_i;
    idx[2] = third_i;
}
125
+
126
+
// Launches three_nn_kernel_fast on `stream`.
//
// unknown: (B, N, 3)
// known:   (B, M, 3)
// output:
//   dist2: (B, N, 3)
//   idx:   (B, N, 3)
//
// Grid: x covers the n unknown points in chunks of THREADS_PER_BLOCK, y is the
// batch. A launch error is printed to stderr and terminates the process.
void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
    const float *known, float *dist2, int *idx, cudaStream_t stream) {
    const dim3 grid(DIVUP(n, THREADS_PER_BLOCK), b);
    const dim3 block(THREADS_PER_BLOCK);

    three_nn_kernel_fast<<<grid, block, 0, stream>>>(b, n, m, unknown, known, dist2, idx);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
147
+
148
+
// Forward pass of three-point interpolation: each output value is the weighted
// sum of three source features selected by idx.
//
// points: (B, C, M)
// idx:    (B, N, 3)
// weight: (B, N, 3)
// output:
//   out:  (B, C, N)
//
// One thread per (batch, channel, output point).
__global__ void three_interpolate_kernel_fast(int b, int c, int m, int n, const float *__restrict__ points,
    const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
    const int batch = blockIdx.z;
    const int channel = blockIdx.y;
    const int target = blockIdx.x * blockDim.x + threadIdx.x;

    if (batch >= b || channel >= c || target >= n) return;

    // Advance to this point's weights/indices and this channel's feature rows.
    weight += batch * n * 3 + target * 3;
    points += batch * c * m + channel * m;
    idx += batch * n * 3 + target * 3;
    out += batch * c * n + channel * n;

    const float term0 = weight[0] * points[idx[0]];
    const float term1 = weight[1] * points[idx[1]];
    const float term2 = weight[2] * points[idx[2]];
    out[target] = term0 + term1 + term2;
}
170
+
// Launches three_interpolate_kernel_fast on `stream`.
//
// points: (B, C, M)
// idx:    (B, N, 3)
// weight: (B, N, 3)
// output:
//   out:  (B, C, N)
//
// Grid: x covers the n output points in chunks of THREADS_PER_BLOCK, y is the
// channel, z is the batch. A launch error is printed to stderr and terminates
// the process.
void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
    const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream) {
    const dim3 grid(DIVUP(n, THREADS_PER_BLOCK), c, b);
    const dim3 block(THREADS_PER_BLOCK);

    three_interpolate_kernel_fast<<<grid, block, 0, stream>>>(b, c, m, n, points, idx, weight, out);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
190
+
191
+
// Backward pass of three-point interpolation: distributes each output gradient
// onto its three source points, scaled by the interpolation weights.
//
// grad_out:      (B, C, N)
// weight:        (B, N, 3)
// output:
//   grad_points: (B, C, M)
//
// One thread per (batch, channel, output point). Different output points may
// share a source point, hence the atomic accumulation.
__global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
    const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) {
    const int batch = blockIdx.z;
    const int channel = blockIdx.y;
    const int target = blockIdx.x * blockDim.x + threadIdx.x;

    if (batch >= b || channel >= c || target >= n) return;

    // Advance to this element's gradient, weights/indices and feature row.
    grad_out += batch * c * n + channel * n + target;
    weight += batch * n * 3 + target * 3;
    grad_points += batch * c * m + channel * m;
    idx += batch * n * 3 + target * 3;

    for (int nb = 0; nb < 3; ++nb) {
        atomicAdd(grad_points + idx[nb], grad_out[0] * weight[nb]);
    }
}
215
+
// Launches three_interpolate_grad_kernel_fast on `stream`.
//
// grad_out:      (B, C, N)
// weight:        (B, N, 3)
// output:
//   grad_points: (B, C, M)
//
// Grid: x covers the n output points in chunks of THREADS_PER_BLOCK, y is the
// channel, z is the batch. A launch error is printed to stderr and terminates
// the process.
void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
    const int *idx, const float *weight, float *grad_points, cudaStream_t stream) {
    const dim3 grid(DIVUP(n, THREADS_PER_BLOCK), c, b);
    const dim3 block(THREADS_PER_BLOCK);

    three_interpolate_grad_kernel_fast<<<grid, block, 0, stream>>>(b, c, n, m, grad_out, idx, weight, grad_points);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
@@ -0,0 +1,36 @@
1
+ #ifndef _INTERPOLATE_GPU_H
2
+ #define _INTERPOLATE_GPU_H
3
+
4
+ #include <torch/serialize/tensor.h>
5
+ #include<vector>
6
+ #include <cuda.h>
7
+ #include <cuda_runtime_api.h>
8
+
9
+
10
+ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
11
+ at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
12
+
13
+ void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
14
+ const float *known, float *dist2, int *idx, cudaStream_t stream);
15
+
16
+ void knn_wrapper_fast(int b, int n, int m, int k, at::Tensor unknown_tensor,
17
+ at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
18
+
19
+ void knn_kernel_launcher_fast(int b, int n, int m, int k, const float *unknown,
20
+ const float *known, float *dist2, int *idx, cudaStream_t stream);
21
+
22
+
23
+ void three_interpolate_wrapper_fast(int b, int c, int m, int n, at::Tensor points_tensor,
24
+ at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
25
+
26
+ void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
27
+ const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream);
28
+
29
+
30
+ void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, at::Tensor grad_out_tensor,
31
+ at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
32
+
33
+ void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
34
+ const int *idx, const float *weight, float *grad_points, cudaStream_t stream);
35
+
36
+ #endif
@@ -0,0 +1,25 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <torch/extension.h>
3
+
4
+ #include "ball_query_gpu.h"
5
+ #include "group_points_gpu.h"
6
+ #include "sampling_gpu.h"
7
+ #include "interpolate_gpu.h"
8
+
9
+
10
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
11
+ m.def("ball_query_wrapper", &ball_query_wrapper_fast, "ball_query_wrapper_fast");
12
+
13
+ m.def("group_points_wrapper", &group_points_wrapper_fast, "group_points_wrapper_fast");
14
+ m.def("group_points_grad_wrapper", &group_points_grad_wrapper_fast, "group_points_grad_wrapper_fast");
15
+
16
+ m.def("gather_points_wrapper", &gather_points_wrapper_fast, "gather_points_wrapper_fast");
17
+ m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper_fast, "gather_points_grad_wrapper_fast");
18
+
19
+ m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper, "furthest_point_sampling_wrapper");
20
+
21
+ m.def("knn_wrapper", &knn_wrapper_fast, "knn_wrapper_fast");
22
+ m.def("three_nn_wrapper", &three_nn_wrapper_fast, "three_nn_wrapper_fast");
23
+ m.def("three_interpolate_wrapper", &three_interpolate_wrapper_fast, "three_interpolate_wrapper_fast");
24
+ m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper_fast, "three_interpolate_grad_wrapper_fast");
25
+ }