learning3d 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. learning3d/__init__.py +2 -0
  2. learning3d/data_utils/__init__.py +4 -0
  3. learning3d/data_utils/dataloaders.py +454 -0
  4. learning3d/data_utils/user_data.py +119 -0
  5. learning3d/examples/test_dcp.py +139 -0
  6. learning3d/examples/test_deepgmr.py +144 -0
  7. learning3d/examples/test_flownet.py +113 -0
  8. learning3d/examples/test_masknet.py +159 -0
  9. learning3d/examples/test_masknet2.py +162 -0
  10. learning3d/examples/test_pcn.py +118 -0
  11. learning3d/examples/test_pcrnet.py +120 -0
  12. learning3d/examples/test_pnlk.py +121 -0
  13. learning3d/examples/test_pointconv.py +126 -0
  14. learning3d/examples/test_pointnet.py +121 -0
  15. learning3d/examples/test_prnet.py +126 -0
  16. learning3d/examples/test_rpmnet.py +120 -0
  17. learning3d/examples/train_PointNetLK.py +240 -0
  18. learning3d/examples/train_dcp.py +249 -0
  19. learning3d/examples/train_deepgmr.py +244 -0
  20. learning3d/examples/train_flownet.py +259 -0
  21. learning3d/examples/train_masknet.py +239 -0
  22. learning3d/examples/train_pcn.py +216 -0
  23. learning3d/examples/train_pcrnet.py +228 -0
  24. learning3d/examples/train_pointconv.py +245 -0
  25. learning3d/examples/train_pointnet.py +244 -0
  26. learning3d/examples/train_prnet.py +229 -0
  27. learning3d/examples/train_rpmnet.py +228 -0
  28. learning3d/losses/__init__.py +12 -0
  29. learning3d/losses/chamfer_distance.py +51 -0
  30. learning3d/losses/classification.py +14 -0
  31. learning3d/losses/correspondence_loss.py +10 -0
  32. learning3d/losses/cuda/chamfer_distance/__init__.py +1 -0
  33. learning3d/losses/cuda/chamfer_distance/chamfer_distance.cpp +185 -0
  34. learning3d/losses/cuda/chamfer_distance/chamfer_distance.cu +209 -0
  35. learning3d/losses/cuda/chamfer_distance/chamfer_distance.py +66 -0
  36. learning3d/losses/cuda/emd_torch/pkg/emd_loss_layer.py +41 -0
  37. learning3d/losses/cuda/emd_torch/pkg/include/cuda/emd.cuh +347 -0
  38. learning3d/losses/cuda/emd_torch/pkg/include/cuda_helper.h +18 -0
  39. learning3d/losses/cuda/emd_torch/pkg/include/emd.h +54 -0
  40. learning3d/losses/cuda/emd_torch/pkg/layer/__init__.py +1 -0
  41. learning3d/losses/cuda/emd_torch/pkg/layer/emd_loss_layer.py +40 -0
  42. learning3d/losses/cuda/emd_torch/pkg/src/cuda/emd.cu +70 -0
  43. learning3d/losses/cuda/emd_torch/pkg/src/emd.cpp +1 -0
  44. learning3d/losses/cuda/emd_torch/setup.py +29 -0
  45. learning3d/losses/emd.py +16 -0
  46. learning3d/losses/frobenius_norm.py +21 -0
  47. learning3d/losses/rmse_features.py +16 -0
  48. learning3d/models/__init__.py +23 -0
  49. learning3d/models/classifier.py +41 -0
  50. learning3d/models/dcp.py +92 -0
  51. learning3d/models/deepgmr.py +165 -0
  52. learning3d/models/dgcnn.py +92 -0
  53. learning3d/models/flownet3d.py +446 -0
  54. learning3d/models/masknet.py +84 -0
  55. learning3d/models/masknet2.py +264 -0
  56. learning3d/models/pcn.py +164 -0
  57. learning3d/models/pcrnet.py +74 -0
  58. learning3d/models/pointconv.py +108 -0
  59. learning3d/models/pointnet.py +108 -0
  60. learning3d/models/pointnetlk.py +173 -0
  61. learning3d/models/pooling.py +15 -0
  62. learning3d/models/ppfnet.py +102 -0
  63. learning3d/models/prnet.py +431 -0
  64. learning3d/models/rpmnet.py +359 -0
  65. learning3d/models/segmentation.py +38 -0
  66. learning3d/ops/__init__.py +0 -0
  67. learning3d/ops/data_utils.py +45 -0
  68. learning3d/ops/invmat.py +134 -0
  69. learning3d/ops/quaternion.py +218 -0
  70. learning3d/ops/se3.py +157 -0
  71. learning3d/ops/sinc.py +229 -0
  72. learning3d/ops/so3.py +213 -0
  73. learning3d/ops/transform_functions.py +342 -0
  74. learning3d/utils/__init__.py +9 -0
  75. learning3d/utils/lib/build/lib.linux-x86_64-3.5/pointnet2_cuda.cpython-35m-x86_64-linux-gnu.so +0 -0
  76. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/ball_query.o +0 -0
  77. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/ball_query_gpu.o +0 -0
  78. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/group_points.o +0 -0
  79. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/group_points_gpu.o +0 -0
  80. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/interpolate.o +0 -0
  81. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/interpolate_gpu.o +0 -0
  82. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/pointnet2_api.o +0 -0
  83. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/sampling.o +0 -0
  84. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/sampling_gpu.o +0 -0
  85. learning3d/utils/lib/dist/pointnet2-0.0.0-py3.5-linux-x86_64.egg +0 -0
  86. learning3d/utils/lib/pointnet2.egg-info/SOURCES.txt +14 -0
  87. learning3d/utils/lib/pointnet2.egg-info/dependency_links.txt +1 -0
  88. learning3d/utils/lib/pointnet2.egg-info/top_level.txt +1 -0
  89. learning3d/utils/lib/pointnet2_modules.py +160 -0
  90. learning3d/utils/lib/pointnet2_utils.py +318 -0
  91. learning3d/utils/lib/pytorch_utils.py +236 -0
  92. learning3d/utils/lib/setup.py +23 -0
  93. learning3d/utils/lib/src/ball_query.cpp +25 -0
  94. learning3d/utils/lib/src/ball_query_gpu.cu +67 -0
  95. learning3d/utils/lib/src/ball_query_gpu.h +15 -0
  96. learning3d/utils/lib/src/cuda_utils.h +15 -0
  97. learning3d/utils/lib/src/group_points.cpp +36 -0
  98. learning3d/utils/lib/src/group_points_gpu.cu +86 -0
  99. learning3d/utils/lib/src/group_points_gpu.h +22 -0
  100. learning3d/utils/lib/src/interpolate.cpp +65 -0
  101. learning3d/utils/lib/src/interpolate_gpu.cu +233 -0
  102. learning3d/utils/lib/src/interpolate_gpu.h +36 -0
  103. learning3d/utils/lib/src/pointnet2_api.cpp +25 -0
  104. learning3d/utils/lib/src/sampling.cpp +46 -0
  105. learning3d/utils/lib/src/sampling_gpu.cu +253 -0
  106. learning3d/utils/lib/src/sampling_gpu.h +29 -0
  107. learning3d/utils/pointconv_util.py +382 -0
  108. learning3d/utils/ppfnet_util.py +244 -0
  109. learning3d/utils/svd.py +59 -0
  110. learning3d/utils/transformer.py +243 -0
  111. learning3d-0.0.1.dist-info/LICENSE +21 -0
  112. learning3d-0.0.1.dist-info/METADATA +271 -0
  113. learning3d-0.0.1.dist-info/RECORD +115 -0
  114. learning3d-0.0.1.dist-info/WHEEL +5 -0
  115. learning3d-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,209 @@
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <cuda.h>
4
+ #include <cuda_runtime.h>
5
+
6
// For every point of `xyz` (n points per cloud), find the squared distance to
// its nearest neighbour in `xyz2` (m points per cloud) together with that
// neighbour's index.  `xyz2` is streamed through shared memory in tiles of
// `batch` points, and the search over a full tile is manually unrolled by 4.
//
//   b        : number of point clouds in the batch
//   n        : points per cloud in xyz
//   xyz      : (b, n, 3) query points
//   m        : points per cloud in xyz2
//   xyz2     : (b, m, 3) reference points
//   result   : (b, n) out: squared distance to the nearest point of xyz2
//   result_i : (b, n) out: index (into xyz2) of that nearest point
__global__
void ChamferDistanceKernel(
    int b,
    int n,
    const float* xyz,
    int m,
    const float* xyz2,
    float* result,
    int* result_i)
{
    const int batch=512;              // tile size: xyz2 points staged in shared memory
    __shared__ float buf[batch*3];
    // Grid-stride loop over batch entries (blockIdx.x) and query points
    // (blockIdx.y * blockDim.x + threadIdx.x) below.
    for (int i=blockIdx.x;i<b;i+=gridDim.x){
        for (int k2=0;k2<m;k2+=batch){
            int end_k=min(m,k2+batch)-k2;    // points in this tile (last tile may be short)
            // Cooperatively copy the tile's coordinates into shared memory.
            for (int j=threadIdx.x;j<end_k*3;j+=blockDim.x){
                buf[j]=xyz2[(i*m+k2)*3+j];
            }
            __syncthreads();
            for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
                float x1=xyz[(i*n+j)*3+0];
                float y1=xyz[(i*n+j)*3+1];
                float z1=xyz[(i*n+j)*3+2];
                int best_i=0;
                float best=0;
                int end_ka=end_k-(end_k&3);  // largest multiple of 4 <= end_k
                if (end_ka==batch){
                    // Full tile: scan unrolled by 4.  Only the first candidate
                    // uses the `k==0` seed test; later ones just compare.
                    for (int k=0;k<batch;k+=4){
                        {
                            float x2=buf[k*3+0]-x1;
                            float y2=buf[k*3+1]-y1;
                            float z2=buf[k*3+2]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (k==0 || d<best){
                                best=d;
                                best_i=k+k2;
                            }
                        }
                        {
                            float x2=buf[k*3+3]-x1;
                            float y2=buf[k*3+4]-y1;
                            float z2=buf[k*3+5]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+1;
                            }
                        }
                        {
                            float x2=buf[k*3+6]-x1;
                            float y2=buf[k*3+7]-y1;
                            float z2=buf[k*3+8]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+2;
                            }
                        }
                        {
                            float x2=buf[k*3+9]-x1;
                            float y2=buf[k*3+10]-y1;
                            float z2=buf[k*3+11]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+3;
                            }
                        }
                    }
                }else{
                    // Short tile: same unrolled scan over the 4-aligned prefix.
                    for (int k=0;k<end_ka;k+=4){
                        {
                            float x2=buf[k*3+0]-x1;
                            float y2=buf[k*3+1]-y1;
                            float z2=buf[k*3+2]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (k==0 || d<best){
                                best=d;
                                best_i=k+k2;
                            }
                        }
                        {
                            float x2=buf[k*3+3]-x1;
                            float y2=buf[k*3+4]-y1;
                            float z2=buf[k*3+5]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+1;
                            }
                        }
                        {
                            float x2=buf[k*3+6]-x1;
                            float y2=buf[k*3+7]-y1;
                            float z2=buf[k*3+8]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+2;
                            }
                        }
                        {
                            float x2=buf[k*3+9]-x1;
                            float y2=buf[k*3+10]-y1;
                            float z2=buf[k*3+11]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+3;
                            }
                        }
                    }
                }
                // Remainder loop for the last (end_k & 3) candidates.
                for (int k=end_ka;k<end_k;k++){
                    float x2=buf[k*3+0]-x1;
                    float y2=buf[k*3+1]-y1;
                    float z2=buf[k*3+2]-z1;
                    float d=x2*x2+y2*y2+z2*z2;
                    if (k==0 || d<best){
                        best=d;
                        best_i=k+k2;
                    }
                }
                // Merge this tile's best into the global result; the first tile
                // (k2==0) initialises it, so result need not be pre-cleared.
                if (k2==0 || result[(i*n+j)]>best){
                    result[(i*n+j)]=best;
                    result_i[(i*n+j)]=best_i;
                }
            }
            __syncthreads();
        }
    }
}
138
+
139
// Host-side launcher: runs the nearest-neighbour kernel in both directions
// (xyz -> xyz2 fills result/result_i, xyz2 -> xyz fills result2/result2_i)
// and reports any launch error on stdout.
void ChamferDistanceKernelLauncher(
    const int b, const int n,
    const float* xyz,
    const int m,
    const float* xyz2,
    float* result,
    int* result_i,
    float* result2,
    int* result2_i)
{
    const dim3 grid(32, 16, 1);
    const int threads = 512;

    // Direction 1: for each point of xyz, nearest point in xyz2.
    ChamferDistanceKernel<<<grid, threads>>>(b, n, xyz, m, xyz2, result, result_i);
    // Direction 2: same kernel with the clouds swapped.
    ChamferDistanceKernel<<<grid, threads>>>(b, m, xyz2, n, xyz, result2, result2_i);

    const cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
        printf("error in chamfer distance updateOutput: %s\n", cudaGetErrorString(status));
}
156
+
157
+
158
// Backward pass for one direction of the chamfer distance.
// For each point j of xyz1, idx1[j] holds the index of its nearest neighbour
// in xyz2 found by the forward pass.  The derivative of the squared distance,
// 2*(p1 - p2) scaled by the upstream gradient, is added to grad_xyz1 and its
// negation to grad_xyz2.  atomicAdd is needed on grad_xyz2 because several
// points of xyz1 may share the same nearest neighbour.
// The launcher zeroes both gradient buffers before calling this kernel and
// invokes it twice with the arguments swapped to cover both directions.
__global__
void ChamferDistanceGradKernel(
    int b, int n,
    const float* xyz1,
    int m,
    const float* xyz2,
    const float* grad_dist1,
    const int* idx1,
    float* grad_xyz1,
    float* grad_xyz2)
{
    for (int i = blockIdx.x; i<b; i += gridDim.x) {
        for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n; j += blockDim.x*gridDim.y) {
            float x1=xyz1[(i*n+j)*3+0];
            float y1=xyz1[(i*n+j)*3+1];
            float z1=xyz1[(i*n+j)*3+2];
            int j2=idx1[i*n+j];              // nearest neighbour from the forward pass
            float x2=xyz2[(i*m+j2)*3+0];
            float y2=xyz2[(i*m+j2)*3+1];
            float z2=xyz2[(i*m+j2)*3+2];
            // d(|p1-p2|^2)/dp1 = 2*(p1-p2), scaled by the incoming gradient.
            float g=grad_dist1[i*n+j]*2;
            atomicAdd(&(grad_xyz1[(i*n+j)*3+0]),g*(x1-x2));
            atomicAdd(&(grad_xyz1[(i*n+j)*3+1]),g*(y1-y2));
            atomicAdd(&(grad_xyz1[(i*n+j)*3+2]),g*(z1-z2));
            atomicAdd(&(grad_xyz2[(i*m+j2)*3+0]),-(g*(x1-x2)));
            atomicAdd(&(grad_xyz2[(i*m+j2)*3+1]),-(g*(y1-y2)));
            atomicAdd(&(grad_xyz2[(i*m+j2)*3+2]),-(g*(z1-z2)));
        }
    }
}
188
+
189
// Host-side launcher for the chamfer-distance backward pass.
// Zeroes both gradient buffers, then runs the gradient kernel in each
// direction (note the swapped arguments on the second launch) and reports
// any launch error on stdout.
//
// Fix: the memset byte counts previously hard-coded `4` for the element
// size; use sizeof(float) so the intent is explicit and correct.
void ChamferDistanceGradKernelLauncher(
    const int b, const int n,
    const float* xyz1,
    const int m,
    const float* xyz2,
    const float* grad_dist1,
    const int* idx1,
    const float* grad_dist2,
    const int* idx2,
    float* grad_xyz1,
    float* grad_xyz2)
{
    cudaMemset(grad_xyz1, 0, b*n*3*sizeof(float));
    cudaMemset(grad_xyz2, 0, b*m*3*sizeof(float));
    // Direction 1: scatter gradients of dist1 (xyz1 -> xyz2).
    ChamferDistanceGradKernel<<<dim3(1,16,1), 256>>>(b, n, xyz1, m, xyz2, grad_dist1, idx1, grad_xyz1, grad_xyz2);
    // Direction 2: scatter gradients of dist2 (xyz2 -> xyz1).
    ChamferDistanceGradKernel<<<dim3(1,16,1), 256>>>(b, m, xyz2, n, xyz1, grad_dist2, idx2, grad_xyz2, grad_xyz1);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("error in chamfer distance get grad: %s\n", cudaGetErrorString(err));
}
@@ -0,0 +1,66 @@
1
import torch
from torch.utils.cpp_extension import load
import os

# JIT-compile the chamfer-distance C++/CUDA extension the first time this
# module is imported.  `load` caches the build, so subsequent imports reuse
# the compiled artifact.  The sources are resolved relative to this file so
# the import works regardless of the current working directory.
script_dir = os.path.dirname(__file__)
sources = [
    os.path.join(script_dir, "chamfer_distance.cpp"),
    os.path.join(script_dir, "chamfer_distance.cu"),
]

# Compiled extension exposing forward/forward_cuda and backward/backward_cuda.
cd = load(name="cd", sources=sources)
12
+
13
+
14
class ChamferDistanceFunction(torch.autograd.Function):
    """Autograd wrapper around the compiled chamfer-distance kernels.

    ``forward`` returns, for a pair of point clouds, the per-point squared
    distances in both directions; ``backward`` routes gradients through the
    nearest-neighbour indices saved by the forward pass.
    """

    @staticmethod
    def forward(ctx, xyz1, xyz2):
        """Compute both directed distance fields.

        Args:
            xyz1: (B, N, 3) point cloud.
            xyz2: (B, M, 3) point cloud.
        Returns:
            (dist1, dist2): (B, N) and (B, M) squared nearest-neighbour
            distances.
        """
        batchsize, n, _ = xyz1.size()
        _, m, _ = xyz2.size()
        xyz1 = xyz1.contiguous()
        xyz2 = xyz2.contiguous()

        # Allocate outputs directly on the inputs' device.  The previous
        # implementation allocated on CPU and called .cuda(), which targets
        # the *default* GPU and misbehaves when the inputs live on another
        # device; it also performed a needless host->device copy.
        device = xyz1.device
        dist1 = torch.zeros(batchsize, n, device=device)
        dist2 = torch.zeros(batchsize, m, device=device)
        idx1 = torch.zeros(batchsize, n, dtype=torch.int, device=device)
        idx2 = torch.zeros(batchsize, m, dtype=torch.int, device=device)

        if not xyz1.is_cuda:
            cd.forward(xyz1, xyz2, dist1, dist2, idx1, idx2)
        else:
            cd.forward_cuda(xyz1, xyz2, dist1, dist2, idx1, idx2)

        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
        return dist1, dist2

    @staticmethod
    def backward(ctx, graddist1, graddist2):
        """Scatter the incoming gradients back to the two point clouds."""
        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors

        graddist1 = graddist1.contiguous()
        graddist2 = graddist2.contiguous()

        # Same device-correctness fix as in forward: build the gradient
        # buffers where the saved tensors live.
        gradxyz1 = torch.zeros(xyz1.size(), device=xyz1.device)
        gradxyz2 = torch.zeros(xyz2.size(), device=xyz2.device)

        if not graddist1.is_cuda:
            cd.backward(
                xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2
            )
        else:
            cd.backward_cuda(
                xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2
            )

        return gradxyz1, gradxyz2
62
+
63
+
64
class ChamferDistance(torch.nn.Module):
    """nn.Module front-end for :class:`ChamferDistanceFunction`."""

    def forward(self, xyz1, xyz2):
        """Return the pair of directed distance fields for the two clouds."""
        distances = ChamferDistanceFunction.apply(xyz1, xyz2)
        return distances
@@ -0,0 +1,41 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ import _emd_ext._emd as emd
5
+
6
+
7
class EMDFunction(torch.autograd.Function):
    """Autograd wrapper around the compiled approximate-EMD kernels."""

    @staticmethod
    def forward(ctx, xyz1, xyz2):
        # cost: matching cost per batch element; match: the soft assignment
        # matrix, needed to compute gradients in backward.
        cost, match = emd.emd_forward(xyz1, xyz2)
        ctx.save_for_backward(xyz1, xyz2, match)
        return cost

    @staticmethod
    def backward(ctx, grad_output):
        xyz1, xyz2, match = ctx.saved_tensors
        grad_xyz1, grad_xyz2 = emd.emd_backward(xyz1, xyz2, match)
        # Chain rule: emd_backward returns d(cost)/d(xyz); each batch entry
        # must be scaled by the incoming gradient.  The original ignored
        # grad_output, which is only correct when the caller reduces the
        # cost with a plain unweighted sum.
        # NOTE(review): assumes cost is one scalar per batch element and
        # xyz gradients are (B, N, 3) — confirm against the extension.
        grad_output = grad_output.view(-1, 1, 1)
        return grad_xyz1 * grad_output, grad_xyz2 * grad_output
20
+
21
+
22
+
23
+
24
class EMDLoss(nn.Module):
    """Approximate Earth Mover's Distance between two point sets.

    Implementation limitations (the CUDA kernels stage data in shared
    memory, which the hardware caps at 48 kB):
      * double tensors must have <= 11 dimensions
      * float tensors must have <= 23 dimensions
    """

    def __init__(self):
        super(EMDLoss, self).__init__()

    def forward(self, xyz1, xyz2):
        """Validate the two clouds and dispatch to the autograd function."""
        dims_match = xyz1.shape[-1] == xyz2.shape[-1]
        assert dims_match, 'Both point sets must have the same dimensions!'
        counts_match = xyz1.shape[1] == xyz2.shape[1]
        assert counts_match, 'Both Point Clouds must have same number of points in it.'
        return EMDFunction.apply(xyz1, xyz2)
@@ -0,0 +1,347 @@
1
+ #ifndef EMD_CUH_
2
+ #define EMD_CUH_
3
+
4
+ #include "cuda_helper.h"
5
+
6
// Iteratively computes a soft (approximate) matching between two point clouds
// for the Earth Mover's Distance.
//
//   xyz1  : (b, n, 3), xyz2 : (b, m, 3)
//   match : (b, m, n) out — soft assignment weights (row index l over xyz2,
//           column index k over xyz1, laid out as match[i*n*m + l*n + k])
//   temp  : per-block scratch in global memory, (n+m)*2 entries per block,
//           partitioned below into remainL/remainR/ratioL/ratioR
//
// One block handles one batch entry at a time (grid-stride over i).  Each of
// the 10 outer iterations (j = 7 .. -2) runs a softmax-like assignment at a
// progressively flatter temperature `level = -4^j` (exactly 0 on the last
// iteration), transferring "mass" from the remaining capacity of xyz1 points
// (remainL) to that of xyz2 points (remainR).
//
// NOTE(review): multiL/multiR are computed with integer-style division
// (n/m, m/n) even though T is floating point at call time; if n is not a
// multiple of m (or vice versa) the capacities will not balance exactly —
// confirm intended behaviour.
template<typename T>
__global__ void approxmatch(const int b, const int n, const int m, const T * __restrict__ xyz1, const T * __restrict__ xyz2, T * __restrict__ match, T * temp){
    // Carve four arrays out of this block's slice of the scratch buffer.
    T * remainL=temp+blockIdx.x*(n+m)*2, * remainR=temp+blockIdx.x*(n+m)*2+n,*ratioL=temp+blockIdx.x*(n+m)*2+n+m,*ratioR=temp+blockIdx.x*(n+m)*2+n+m+n;
    T multiL,multiR;
    if (n>=m){
        multiL=1;
        multiR=n/m;
    }else{
        multiL=m/n;
        multiR=1;
    }
    const int Block=1024;            // shared-memory tile size (points)
    __shared__ T buf[Block*4];       // x, y, z, weight per staged point
    for (int i=blockIdx.x;i<b;i+=gridDim.x){
        // Reset the assignment and per-point capacities for this batch entry.
        for (int j=threadIdx.x;j<n*m;j+=blockDim.x)
            match[i*n*m+j]=0;
        for (int j=threadIdx.x;j<n;j+=blockDim.x)
            remainL[j]=multiL;
        for (int j=threadIdx.x;j<m;j+=blockDim.x)
            remainR[j]=multiR;
        __syncthreads();
        for (int j=7;j>=-2;j--){
            T level=-powf(4.0f,j);   // annealing temperature; 0 on final pass
            if (j==-2){
                level=0;
            }
            // Phase 1: for every xyz1 point k, compute the softmax
            // normaliser over xyz2 and store remainL[k]/suml in ratioL.
            for (int k0=0;k0<n;k0+=blockDim.x){
                int k=k0+threadIdx.x;
                T x1=0,y1=0,z1=0;
                if (k<n){
                    x1=xyz1[i*n*3+k*3+0];
                    y1=xyz1[i*n*3+k*3+1];
                    z1=xyz1[i*n*3+k*3+2];
                }
                T suml=T(1e-9f);     // epsilon guards the division below
                for (int l0=0;l0<m;l0+=Block){
                    int lend=min(m,l0+Block)-l0;
                    // Stage a tile of xyz2 (+ its remaining capacity).
                    for (int l=threadIdx.x;l<lend;l+=blockDim.x){
                        T x2=xyz2[i*m*3+l0*3+l*3+0];
                        T y2=xyz2[i*m*3+l0*3+l*3+1];
                        T z2=xyz2[i*m*3+l0*3+l*3+2];
                        buf[l*4+0]=x2;
                        buf[l*4+1]=y2;
                        buf[l*4+2]=z2;
                        buf[l*4+3]=remainR[l0+l];
                    }
                    __syncthreads();
                    for (int l=0;l<lend;l++){
                        T x2=buf[l*4+0];
                        T y2=buf[l*4+1];
                        T z2=buf[l*4+2];
                        T d=level*((x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1));
                        T w=__expf(d)*buf[l*4+3];
                        suml+=w;
                    }
                    __syncthreads();
                }
                if (k<n)
                    ratioL[k]=remainL[k]/suml;
            }
            // (A straightforward, untiled reference version of the loop
            // above was kept as dead code in the original; removed here.)
            __syncthreads();
            // Phase 2: for every xyz2 point l, compute how much mass would
            // flow in, clamp by its remaining capacity, and update ratioR
            // and remainR.
            for (int l0=0;l0<m;l0+=blockDim.x){
                int l=l0+threadIdx.x;
                T x2=0,y2=0,z2=0;
                if (l<m){
                    x2=xyz2[i*m*3+l*3+0];
                    y2=xyz2[i*m*3+l*3+1];
                    z2=xyz2[i*m*3+l*3+2];
                }
                T sumr=0;
                for (int k0=0;k0<n;k0+=Block){
                    int kend=min(n,k0+Block)-k0;
                    for (int k=threadIdx.x;k<kend;k+=blockDim.x){
                        buf[k*4+0]=xyz1[i*n*3+k0*3+k*3+0];
                        buf[k*4+1]=xyz1[i*n*3+k0*3+k*3+1];
                        buf[k*4+2]=xyz1[i*n*3+k0*3+k*3+2];
                        buf[k*4+3]=ratioL[k0+k];
                    }
                    __syncthreads();
                    for (int k=0;k<kend;k++){
                        T x1=buf[k*4+0];
                        T y1=buf[k*4+1];
                        T z1=buf[k*4+2];
                        T w=__expf(level*((x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1)))*buf[k*4+3];
                        sumr+=w;
                    }
                    __syncthreads();
                }
                if (l<m){
                    sumr*=remainR[l];
                    T consumption=fminf(remainR[l]/(sumr+1e-9f),1.0f);
                    ratioR[l]=consumption*remainR[l];
                    remainR[l]=fmaxf(0.0f,remainR[l]-sumr);
                }
            }
            __syncthreads();
            // Phase 3: accumulate the transferred mass into `match` and
            // reduce the remaining capacity of each xyz1 point.
            for (int k0=0;k0<n;k0+=blockDim.x){
                int k=k0+threadIdx.x;
                T x1=0,y1=0,z1=0;
                if (k<n){
                    x1=xyz1[i*n*3+k*3+0];
                    y1=xyz1[i*n*3+k*3+1];
                    z1=xyz1[i*n*3+k*3+2];
                }
                T suml=0;
                for (int l0=0;l0<m;l0+=Block){
                    int lend=min(m,l0+Block)-l0;
                    for (int l=threadIdx.x;l<lend;l+=blockDim.x){
                        buf[l*4+0]=xyz2[i*m*3+l0*3+l*3+0];
                        buf[l*4+1]=xyz2[i*m*3+l0*3+l*3+1];
                        buf[l*4+2]=xyz2[i*m*3+l0*3+l*3+2];
                        buf[l*4+3]=ratioR[l0+l];
                    }
                    __syncthreads();
                    // NOTE(review): this read is not guarded by `k<n`; for
                    // threads past the end it reads beyond ratioL's n valid
                    // entries (into adjacent scratch). The value is unused
                    // because the loop below is guarded, but confirm it
                    // cannot run past the temp allocation.
                    T rl=ratioL[k];
                    if (k<n){
                        for (int l=0;l<lend;l++){
                            T x2=buf[l*4+0];
                            T y2=buf[l*4+1];
                            T z2=buf[l*4+2];
                            T w=__expf(level*((x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1)))*rl*buf[l*4+3];
                            match[i*n*m+(l0+l)*n+k]+=w;
                            suml+=w;
                        }
                    }
                    __syncthreads();
                }
                if (k<n)
                    remainL[k]=fmaxf(0.0f,remainL[k]-suml);
            }
            __syncthreads();
        }
    }
}
186
+
187
// Host-side launcher for the approxmatch kernel: dispatches on the tensor's
// floating-point dtype, synchronizes, and aborts (via CUDA_CHECK) on error.
// NOTE(review): `.type()` and `.data<scalar_t>()` are deprecated ATen APIs
// (modern equivalents: `.scalar_type()` / `.data_ptr<scalar_t>()`); kept as-is
// for compatibility with the PyTorch version this package targets.
void approxmatchLauncher(const int b, const int n, const int m, const at::Tensor xyz1, const at::Tensor xyz2, at::Tensor match, at::Tensor temp){
    AT_DISPATCH_FLOATING_TYPES(match.type(), "approxmatch", ([&] {
        approxmatch
            <<<32, 512>>>(
                b, n, m,
                xyz1.data<scalar_t>(),
                xyz2.data<scalar_t>(),
                match.data<scalar_t>(),
                temp.data<scalar_t>());
    }));
    // The kernel writes scratch state consumed by later launches; wait for it.
    cudaDeviceSynchronize();
    CUDA_CHECK(cudaGetLastError())
}
200
+
201
// Given the soft assignment `match` produced by approxmatch, computes the
// total matching cost per batch entry:
//   out[i] = sum_{k,l} |xyz1[i,k] - xyz2[i,l]| * match[i, l, k]
// One block per batch entry (grid-stride over i); xyz2 is tiled through
// shared memory and per-thread partial sums are combined with a tree
// reduction in `allsum`.
template<typename T>
__global__ void matchcost(const int b, const int n, const int m, const T * __restrict__ xyz1, const T * __restrict__ xyz2, const T * __restrict__ match, T * __restrict__ out){
    __shared__ T allsum[512];        // one partial sum per thread (blockDim.x <= 512)
    const int Block=1024;            // xyz2 tile size (points)
    __shared__ T buf[Block*3];
    for (int i=blockIdx.x;i<b;i+=gridDim.x){
        T subsum=0;                  // this thread's share of the cost
        for (int k0=0;k0<n;k0+=blockDim.x){
            int k=k0+threadIdx.x;
            T x1=0,y1=0,z1=0;
            if (k<n){
                x1=xyz1[i*n*3+k*3+0];
                y1=xyz1[i*n*3+k*3+1];
                z1=xyz1[i*n*3+k*3+2];
            }
            for (int l0=0;l0<m;l0+=Block){
                int lend=min(m,l0+Block)-l0;
                // Stage a tile of xyz2 coordinates.
                for (int l=threadIdx.x;l<lend*3;l+=blockDim.x)
                    buf[l]=xyz2[i*m*3+l0*3+l];
                __syncthreads();
                if (k<n){
                    for (int l=0;l<lend;l++){
                        T x2=buf[l*3+0];
                        T y2=buf[l*3+1];
                        T z2=buf[l*3+2];
                        // Euclidean (not squared) distance, weighted by the match.
                        T d=sqrtf((x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1));
                        subsum+=d*match[i*n*m+(l0+l)*n+k];
                    }
                }
                __syncthreads();
            }
        }
        // Tree reduction of the per-thread partial sums.
        allsum[threadIdx.x]=subsum;
        for (int j=1;j<blockDim.x;j<<=1){
            __syncthreads();
            if ((threadIdx.x&j)==0 && threadIdx.x+j<blockDim.x){
                allsum[threadIdx.x]+=allsum[threadIdx.x+j];
            }
        }
        if (threadIdx.x==0)
            out[i]=allsum[0];
        __syncthreads();
    }
}
245
+
246
// Host-side launcher for the matchcost kernel (total EMD cost per batch
// entry).  Dispatches on the floating-point dtype of xyz1 and aborts (via
// CUDA_CHECK) if the launch failed.
// NOTE(review): `.type()` / `.data<scalar_t>()` are deprecated ATen APIs;
// kept for compatibility with the targeted PyTorch version.
void matchcostLauncher(const int b, const int n, const int m, const at::Tensor xyz1, const at::Tensor xyz2, const at::Tensor match, at::Tensor out){
    AT_DISPATCH_FLOATING_TYPES(xyz1.type(), "matchcost", ([&] {
        matchcost<<<32, 512>>>(
            b, n, m,
            xyz1.data<scalar_t>(),
            xyz2.data<scalar_t>(),
            match.data<scalar_t>(),
            out.data<scalar_t>());
    }));
    CUDA_CHECK(cudaGetLastError())
}
257
+
258
// Gradient of the matching cost with respect to xyz2.
// For each xyz2 point k:
//   grad2[i,k] = sum_j match[i,k,j] * (p2 - p1_j) / |p2 - p1_j|
// blockIdx.y partitions the m points of xyz2 between blocks; within a block,
// threads split the inner sum over xyz1 and combine partial (x,y,z) sums
// with a shared-memory tree reduction (blockDim.x must be <= 256 to fit
// `sum_grad`).
template <typename T>
__global__ void matchcostgrad2(const int b, const int n, const int m,const T * __restrict__ xyz1, const T * __restrict__ xyz2, const T * __restrict__ match, T * __restrict__ grad2){
    __shared__ T sum_grad[256*3];
    for (int i=blockIdx.x;i<b;i+=gridDim.x){
        // This block's slice [kbeg, kend) of the xyz2 points.
        int kbeg=m*blockIdx.y/gridDim.y;
        int kend=m*(blockIdx.y+1)/gridDim.y;
        for (int k=kbeg;k<kend;k++){
            T x2=xyz2[(i*m+k)*3+0];
            T y2=xyz2[(i*m+k)*3+1];
            T z2=xyz2[(i*m+k)*3+2];
            T subsumx=0,subsumy=0,subsumz=0;
            for (int j=threadIdx.x;j<n;j+=blockDim.x){
                // (p2 - p1); rsqrtf with a floor avoids division by zero
                // for coincident points.
                T x1=x2-xyz1[(i*n+j)*3+0];
                T y1=y2-xyz1[(i*n+j)*3+1];
                T z1=z2-xyz1[(i*n+j)*3+2];
                T d=match[i*n*m+k*n+j]*rsqrtf(fmaxf(x1*x1+y1*y1+z1*z1,1e-20f));
                subsumx+=x1*d;
                subsumy+=y1*d;
                subsumz+=z1*d;
            }
            sum_grad[threadIdx.x*3+0]=subsumx;
            sum_grad[threadIdx.x*3+1]=subsumy;
            sum_grad[threadIdx.x*3+2]=subsumz;
            // Tree reduction across the block's threads.
            for (int j=1;j<blockDim.x;j<<=1){
                __syncthreads();
                int j1=threadIdx.x;
                int j2=threadIdx.x+j;
                if ((j1&j)==0 && j2<blockDim.x){
                    sum_grad[j1*3+0]+=sum_grad[j2*3+0];
                    sum_grad[j1*3+1]+=sum_grad[j2*3+1];
                    sum_grad[j1*3+2]+=sum_grad[j2*3+2];
                }
            }
            if (threadIdx.x==0){
                grad2[(i*m+k)*3+0]=sum_grad[0];
                grad2[(i*m+k)*3+1]=sum_grad[1];
                grad2[(i*m+k)*3+2]=sum_grad[2];
            }
            __syncthreads();
        }
    }
}
300
+
301
// Gradient of the matching cost with respect to xyz1.
// For each xyz1 point l:
//   grad1[i,l] = sum_k match[i,k,l] * (p1 - p2_k) / |p1 - p2_k|
// One thread handles one xyz1 point (strided by blockDim.x) and loops over
// all m points of xyz2; no reduction is needed because each output is owned
// by exactly one thread.
template <typename T>
__global__ void matchcostgrad1(const int b, const int n, const int m, const T * __restrict__ xyz1, const T * __restrict__ xyz2, const T * __restrict__ match, T * __restrict__ grad1){
    for (int i=blockIdx.x;i<b;i+=gridDim.x){
        for (int l=threadIdx.x;l<n;l+=blockDim.x){
            T x1=xyz1[i*n*3+l*3+0];
            T y1=xyz1[i*n*3+l*3+1];
            T z1=xyz1[i*n*3+l*3+2];
            T dx=0,dy=0,dz=0;
            for (int k=0;k<m;k++){
                T x2=xyz2[i*m*3+k*3+0];
                T y2=xyz2[i*m*3+k*3+1];
                T z2=xyz2[i*m*3+k*3+2];
                // Weighted unit vector p1->p2; the 1e-20 floor in rsqrtf
                // guards against coincident points.
                T d=match[i*n*m+k*n+l]*rsqrtf(fmaxf((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2)+(z1-z2)*(z1-z2),1e-20f));
                dx+=(x1-x2)*d;
                dy+=(y1-y2)*d;
                dz+=(z1-z2)*d;
            }
            grad1[i*n*3+l*3+0]=dx;
            grad1[i*n*3+l*3+1]=dy;
            grad1[i*n*3+l*3+2]=dz;
        }
    }
}
324
+
325
// Host-side launcher for the EMD backward pass: fills grad1 (w.r.t. xyz1)
// and grad2 (w.r.t. xyz2) with separate kernel launches, checking for launch
// errors after each.
// NOTE(review): `.type()` / `.data<scalar_t>()` are deprecated ATen APIs;
// kept for compatibility with the targeted PyTorch version.
void matchcostgradLauncher(const int b, const int n, const int m, const at::Tensor xyz1, const at::Tensor xyz2, const at::Tensor match, at::Tensor grad1, at::Tensor grad2){
    AT_DISPATCH_FLOATING_TYPES(xyz1.type(), "matchcostgrad1", ([&] {
        matchcostgrad1<<<32,512>>>(
            b, n, m,
            xyz1.data<scalar_t>(),
            xyz2.data<scalar_t>(),
            match.data<scalar_t>(),
            grad1.data<scalar_t>());
    }));
    CUDA_CHECK(cudaGetLastError())

    AT_DISPATCH_FLOATING_TYPES(xyz1.type(), "matchcostgrad2", ([&] {
        matchcostgrad2<<<dim3(32,32),256>>>(
            b, n, m,
            xyz1.data<scalar_t>(),
            xyz2.data<scalar_t>(),
            match.data<scalar_t>(),
            grad2.data<scalar_t>());
    }));
    CUDA_CHECK(cudaGetLastError())
}
346
+
347
+ #endif
@@ -0,0 +1,18 @@
1
#ifndef CUDA_HELPER_H_
#define CUDA_HELPER_H_

#include <cstdio>   // fprintf
#include <cstdlib>  // std::exit

// Abort the process with a diagnostic if a CUDA call did not succeed.
//
// Fix: the argument is now evaluated exactly once.  The original macro
// expanded `err` twice, so `CUDA_CHECK(cudaGetLastError())` would compare
// the real status but then re-query the (now cleared) error and print
// "no error" instead of the actual failure.
// A bare compound statement (not do/while) is used deliberately: existing
// call sites in this package invoke CUDA_CHECK(...) without a trailing
// semicolon and must keep compiling.
#define CUDA_CHECK(err)                                           \
    {                                                             \
        const cudaError_t cuda_check_status_ = (err);             \
        if (cudaSuccess != cuda_check_status_)                    \
        {                                                         \
            fprintf(stderr, "CUDA kernel failed: %s (%s:%d)\n",   \
                    cudaGetErrorString(cuda_check_status_),       \
                    __FILE__, __LINE__);                          \
            std::exit(-1);                                        \
        }                                                         \
    }

// Argument-validation helpers for ATen tensor inputs.
// NOTE(review): AT_CHECK and .type().is_cuda() are deprecated ATen APIs;
// kept for compatibility with the PyTorch version this package targets.
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), \
        #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), \
        #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

#endif