learning3d 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. learning3d/__init__.py +2 -0
  2. learning3d/data_utils/__init__.py +4 -0
  3. learning3d/data_utils/dataloaders.py +454 -0
  4. learning3d/data_utils/user_data.py +119 -0
  5. learning3d/examples/test_dcp.py +139 -0
  6. learning3d/examples/test_deepgmr.py +144 -0
  7. learning3d/examples/test_flownet.py +113 -0
  8. learning3d/examples/test_masknet.py +159 -0
  9. learning3d/examples/test_masknet2.py +162 -0
  10. learning3d/examples/test_pcn.py +118 -0
  11. learning3d/examples/test_pcrnet.py +120 -0
  12. learning3d/examples/test_pnlk.py +121 -0
  13. learning3d/examples/test_pointconv.py +126 -0
  14. learning3d/examples/test_pointnet.py +121 -0
  15. learning3d/examples/test_prnet.py +126 -0
  16. learning3d/examples/test_rpmnet.py +120 -0
  17. learning3d/examples/train_PointNetLK.py +240 -0
  18. learning3d/examples/train_dcp.py +249 -0
  19. learning3d/examples/train_deepgmr.py +244 -0
  20. learning3d/examples/train_flownet.py +259 -0
  21. learning3d/examples/train_masknet.py +239 -0
  22. learning3d/examples/train_pcn.py +216 -0
  23. learning3d/examples/train_pcrnet.py +228 -0
  24. learning3d/examples/train_pointconv.py +245 -0
  25. learning3d/examples/train_pointnet.py +244 -0
  26. learning3d/examples/train_prnet.py +229 -0
  27. learning3d/examples/train_rpmnet.py +228 -0
  28. learning3d/losses/__init__.py +12 -0
  29. learning3d/losses/chamfer_distance.py +51 -0
  30. learning3d/losses/classification.py +14 -0
  31. learning3d/losses/correspondence_loss.py +10 -0
  32. learning3d/losses/cuda/chamfer_distance/__init__.py +1 -0
  33. learning3d/losses/cuda/chamfer_distance/chamfer_distance.cpp +185 -0
  34. learning3d/losses/cuda/chamfer_distance/chamfer_distance.cu +209 -0
  35. learning3d/losses/cuda/chamfer_distance/chamfer_distance.py +66 -0
  36. learning3d/losses/cuda/emd_torch/pkg/emd_loss_layer.py +41 -0
  37. learning3d/losses/cuda/emd_torch/pkg/include/cuda/emd.cuh +347 -0
  38. learning3d/losses/cuda/emd_torch/pkg/include/cuda_helper.h +18 -0
  39. learning3d/losses/cuda/emd_torch/pkg/include/emd.h +54 -0
  40. learning3d/losses/cuda/emd_torch/pkg/layer/__init__.py +1 -0
  41. learning3d/losses/cuda/emd_torch/pkg/layer/emd_loss_layer.py +40 -0
  42. learning3d/losses/cuda/emd_torch/pkg/src/cuda/emd.cu +70 -0
  43. learning3d/losses/cuda/emd_torch/pkg/src/emd.cpp +1 -0
  44. learning3d/losses/cuda/emd_torch/setup.py +29 -0
  45. learning3d/losses/emd.py +16 -0
  46. learning3d/losses/frobenius_norm.py +21 -0
  47. learning3d/losses/rmse_features.py +16 -0
  48. learning3d/models/__init__.py +23 -0
  49. learning3d/models/classifier.py +41 -0
  50. learning3d/models/dcp.py +92 -0
  51. learning3d/models/deepgmr.py +165 -0
  52. learning3d/models/dgcnn.py +92 -0
  53. learning3d/models/flownet3d.py +446 -0
  54. learning3d/models/masknet.py +84 -0
  55. learning3d/models/masknet2.py +264 -0
  56. learning3d/models/pcn.py +164 -0
  57. learning3d/models/pcrnet.py +74 -0
  58. learning3d/models/pointconv.py +108 -0
  59. learning3d/models/pointnet.py +108 -0
  60. learning3d/models/pointnetlk.py +173 -0
  61. learning3d/models/pooling.py +15 -0
  62. learning3d/models/ppfnet.py +102 -0
  63. learning3d/models/prnet.py +431 -0
  64. learning3d/models/rpmnet.py +359 -0
  65. learning3d/models/segmentation.py +38 -0
  66. learning3d/ops/__init__.py +0 -0
  67. learning3d/ops/data_utils.py +45 -0
  68. learning3d/ops/invmat.py +134 -0
  69. learning3d/ops/quaternion.py +218 -0
  70. learning3d/ops/se3.py +157 -0
  71. learning3d/ops/sinc.py +229 -0
  72. learning3d/ops/so3.py +213 -0
  73. learning3d/ops/transform_functions.py +342 -0
  74. learning3d/utils/__init__.py +9 -0
  75. learning3d/utils/lib/build/lib.linux-x86_64-3.5/pointnet2_cuda.cpython-35m-x86_64-linux-gnu.so +0 -0
  76. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/ball_query.o +0 -0
  77. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/ball_query_gpu.o +0 -0
  78. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/group_points.o +0 -0
  79. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/group_points_gpu.o +0 -0
  80. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/interpolate.o +0 -0
  81. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/interpolate_gpu.o +0 -0
  82. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/pointnet2_api.o +0 -0
  83. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/sampling.o +0 -0
  84. learning3d/utils/lib/build/temp.linux-x86_64-3.5/src/sampling_gpu.o +0 -0
  85. learning3d/utils/lib/dist/pointnet2-0.0.0-py3.5-linux-x86_64.egg +0 -0
  86. learning3d/utils/lib/pointnet2.egg-info/SOURCES.txt +14 -0
  87. learning3d/utils/lib/pointnet2.egg-info/dependency_links.txt +1 -0
  88. learning3d/utils/lib/pointnet2.egg-info/top_level.txt +1 -0
  89. learning3d/utils/lib/pointnet2_modules.py +160 -0
  90. learning3d/utils/lib/pointnet2_utils.py +318 -0
  91. learning3d/utils/lib/pytorch_utils.py +236 -0
  92. learning3d/utils/lib/setup.py +23 -0
  93. learning3d/utils/lib/src/ball_query.cpp +25 -0
  94. learning3d/utils/lib/src/ball_query_gpu.cu +67 -0
  95. learning3d/utils/lib/src/ball_query_gpu.h +15 -0
  96. learning3d/utils/lib/src/cuda_utils.h +15 -0
  97. learning3d/utils/lib/src/group_points.cpp +36 -0
  98. learning3d/utils/lib/src/group_points_gpu.cu +86 -0
  99. learning3d/utils/lib/src/group_points_gpu.h +22 -0
  100. learning3d/utils/lib/src/interpolate.cpp +65 -0
  101. learning3d/utils/lib/src/interpolate_gpu.cu +233 -0
  102. learning3d/utils/lib/src/interpolate_gpu.h +36 -0
  103. learning3d/utils/lib/src/pointnet2_api.cpp +25 -0
  104. learning3d/utils/lib/src/sampling.cpp +46 -0
  105. learning3d/utils/lib/src/sampling_gpu.cu +253 -0
  106. learning3d/utils/lib/src/sampling_gpu.h +29 -0
  107. learning3d/utils/pointconv_util.py +382 -0
  108. learning3d/utils/ppfnet_util.py +244 -0
  109. learning3d/utils/svd.py +59 -0
  110. learning3d/utils/transformer.py +243 -0
  111. learning3d-0.0.1.dist-info/LICENSE +21 -0
  112. learning3d-0.0.1.dist-info/METADATA +271 -0
  113. learning3d-0.0.1.dist-info/RECORD +115 -0
  114. learning3d-0.0.1.dist-info/WHEEL +5 -0
  115. learning3d-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,209 @@
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <cuda.h>
4
+ #include <cuda_runtime.h>
5
+
6
// For every point of `xyz` (n points per cloud), find the squared distance to
// its nearest neighbour in `xyz2` (m points per cloud) together with that
// neighbour's index.  `xyz2` is streamed through shared memory in tiles of
// `batch` points, and the search over a full tile is manually unrolled by 4.
//
//   b        : number of point clouds in the batch
//   n        : points per cloud in xyz
//   xyz      : (b, n, 3) query points
//   m        : points per cloud in xyz2
//   xyz2     : (b, m, 3) reference points
//   result   : (b, n) out: squared distance to the nearest point of xyz2
//   result_i : (b, n) out: index (into xyz2) of that nearest point
__global__
void ChamferDistanceKernel(
    int b,
    int n,
    const float* xyz,
    int m,
    const float* xyz2,
    float* result,
    int* result_i)
{
    const int batch=512;              // tile size: xyz2 points staged in shared memory
    __shared__ float buf[batch*3];
    // Grid-stride loop over batch entries (blockIdx.x) and query points
    // (blockIdx.y * blockDim.x + threadIdx.x) below.
    for (int i=blockIdx.x;i<b;i+=gridDim.x){
        for (int k2=0;k2<m;k2+=batch){
            int end_k=min(m,k2+batch)-k2;    // points in this tile (last tile may be short)
            // Cooperatively copy the tile's coordinates into shared memory.
            for (int j=threadIdx.x;j<end_k*3;j+=blockDim.x){
                buf[j]=xyz2[(i*m+k2)*3+j];
            }
            __syncthreads();
            for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
                float x1=xyz[(i*n+j)*3+0];
                float y1=xyz[(i*n+j)*3+1];
                float z1=xyz[(i*n+j)*3+2];
                int best_i=0;
                float best=0;
                int end_ka=end_k-(end_k&3);  // largest multiple of 4 <= end_k
                if (end_ka==batch){
                    // Full tile: scan unrolled by 4.  Only the first candidate
                    // uses the `k==0` seed test; later ones just compare.
                    for (int k=0;k<batch;k+=4){
                        {
                            float x2=buf[k*3+0]-x1;
                            float y2=buf[k*3+1]-y1;
                            float z2=buf[k*3+2]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (k==0 || d<best){
                                best=d;
                                best_i=k+k2;
                            }
                        }
                        {
                            float x2=buf[k*3+3]-x1;
                            float y2=buf[k*3+4]-y1;
                            float z2=buf[k*3+5]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+1;
                            }
                        }
                        {
                            float x2=buf[k*3+6]-x1;
                            float y2=buf[k*3+7]-y1;
                            float z2=buf[k*3+8]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+2;
                            }
                        }
                        {
                            float x2=buf[k*3+9]-x1;
                            float y2=buf[k*3+10]-y1;
                            float z2=buf[k*3+11]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+3;
                            }
                        }
                    }
                }else{
                    // Short tile: same unrolled scan over the 4-aligned prefix.
                    for (int k=0;k<end_ka;k+=4){
                        {
                            float x2=buf[k*3+0]-x1;
                            float y2=buf[k*3+1]-y1;
                            float z2=buf[k*3+2]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (k==0 || d<best){
                                best=d;
                                best_i=k+k2;
                            }
                        }
                        {
                            float x2=buf[k*3+3]-x1;
                            float y2=buf[k*3+4]-y1;
                            float z2=buf[k*3+5]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+1;
                            }
                        }
                        {
                            float x2=buf[k*3+6]-x1;
                            float y2=buf[k*3+7]-y1;
                            float z2=buf[k*3+8]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+2;
                            }
                        }
                        {
                            float x2=buf[k*3+9]-x1;
                            float y2=buf[k*3+10]-y1;
                            float z2=buf[k*3+11]-z1;
                            float d=x2*x2+y2*y2+z2*z2;
                            if (d<best){
                                best=d;
                                best_i=k+k2+3;
                            }
                        }
                    }
                }
                // Remainder loop for the last (end_k & 3) candidates.
                for (int k=end_ka;k<end_k;k++){
                    float x2=buf[k*3+0]-x1;
                    float y2=buf[k*3+1]-y1;
                    float z2=buf[k*3+2]-z1;
                    float d=x2*x2+y2*y2+z2*z2;
                    if (k==0 || d<best){
                        best=d;
                        best_i=k+k2;
                    }
                }
                // Merge this tile's best into the global result; the first tile
                // (k2==0) initialises it, so result need not be pre-cleared.
                if (k2==0 || result[(i*n+j)]>best){
                    result[(i*n+j)]=best;
                    result_i[(i*n+j)]=best_i;
                }
            }
            __syncthreads();
        }
    }
}
138
+
139
// Host-side launcher: runs the nearest-neighbour kernel in both directions
// (xyz -> xyz2 fills result/result_i, xyz2 -> xyz fills result2/result2_i)
// and reports any launch error on stdout.
void ChamferDistanceKernelLauncher(
    const int b, const int n,
    const float* xyz,
    const int m,
    const float* xyz2,
    float* result,
    int* result_i,
    float* result2,
    int* result2_i)
{
    const dim3 grid(32, 16, 1);
    const int threads = 512;

    // Direction 1: for each point of xyz, nearest point in xyz2.
    ChamferDistanceKernel<<<grid, threads>>>(b, n, xyz, m, xyz2, result, result_i);
    // Direction 2: same kernel with the clouds swapped.
    ChamferDistanceKernel<<<grid, threads>>>(b, m, xyz2, n, xyz, result2, result2_i);

    const cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
        printf("error in chamfer distance updateOutput: %s\n", cudaGetErrorString(status));
}
156
+
157
+
158
// Backward pass for one direction of the chamfer distance.
// For each point j of xyz1, idx1[j] holds the index of its nearest neighbour
// in xyz2 found by the forward pass.  The derivative of the squared distance,
// 2*(p1 - p2) scaled by the upstream gradient, is added to grad_xyz1 and its
// negation to grad_xyz2.  atomicAdd is needed on grad_xyz2 because several
// points of xyz1 may share the same nearest neighbour.
// The launcher zeroes both gradient buffers before calling this kernel and
// invokes it twice with the arguments swapped to cover both directions.
__global__
void ChamferDistanceGradKernel(
    int b, int n,
    const float* xyz1,
    int m,
    const float* xyz2,
    const float* grad_dist1,
    const int* idx1,
    float* grad_xyz1,
    float* grad_xyz2)
{
    for (int i = blockIdx.x; i<b; i += gridDim.x) {
        for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n; j += blockDim.x*gridDim.y) {
            float x1=xyz1[(i*n+j)*3+0];
            float y1=xyz1[(i*n+j)*3+1];
            float z1=xyz1[(i*n+j)*3+2];
            int j2=idx1[i*n+j];              // nearest neighbour from the forward pass
            float x2=xyz2[(i*m+j2)*3+0];
            float y2=xyz2[(i*m+j2)*3+1];
            float z2=xyz2[(i*m+j2)*3+2];
            // d(|p1-p2|^2)/dp1 = 2*(p1-p2), scaled by the incoming gradient.
            float g=grad_dist1[i*n+j]*2;
            atomicAdd(&(grad_xyz1[(i*n+j)*3+0]),g*(x1-x2));
            atomicAdd(&(grad_xyz1[(i*n+j)*3+1]),g*(y1-y2));
            atomicAdd(&(grad_xyz1[(i*n+j)*3+2]),g*(z1-z2));
            atomicAdd(&(grad_xyz2[(i*m+j2)*3+0]),-(g*(x1-x2)));
            atomicAdd(&(grad_xyz2[(i*m+j2)*3+1]),-(g*(y1-y2)));
            atomicAdd(&(grad_xyz2[(i*m+j2)*3+2]),-(g*(z1-z2)));
        }
    }
}
188
+
189
// Host-side launcher for the chamfer-distance backward pass.
// Zeroes both gradient buffers, then runs the gradient kernel in each
// direction (note the swapped arguments on the second launch) and reports
// any launch error on stdout.
//
// Fix: the memset byte counts previously hard-coded `4` for the element
// size; use sizeof(float) so the intent is explicit and correct.
void ChamferDistanceGradKernelLauncher(
    const int b, const int n,
    const float* xyz1,
    const int m,
    const float* xyz2,
    const float* grad_dist1,
    const int* idx1,
    const float* grad_dist2,
    const int* idx2,
    float* grad_xyz1,
    float* grad_xyz2)
{
    cudaMemset(grad_xyz1, 0, b*n*3*sizeof(float));
    cudaMemset(grad_xyz2, 0, b*m*3*sizeof(float));
    // Direction 1: scatter gradients of dist1 (xyz1 -> xyz2).
    ChamferDistanceGradKernel<<<dim3(1,16,1), 256>>>(b, n, xyz1, m, xyz2, grad_dist1, idx1, grad_xyz1, grad_xyz2);
    // Direction 2: scatter gradients of dist2 (xyz2 -> xyz1).
    ChamferDistanceGradKernel<<<dim3(1,16,1), 256>>>(b, m, xyz2, n, xyz1, grad_dist2, idx2, grad_xyz2, grad_xyz1);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("error in chamfer distance get grad: %s\n", cudaGetErrorString(err));
}
@@ -0,0 +1,66 @@
1
import torch
from torch.utils.cpp_extension import load
import os

# JIT-compile the chamfer-distance C++/CUDA extension the first time this
# module is imported.  `load` caches the build, so subsequent imports reuse
# the compiled artifact.  The sources are resolved relative to this file so
# the import works regardless of the current working directory.
script_dir = os.path.dirname(__file__)
sources = [
    os.path.join(script_dir, "chamfer_distance.cpp"),
    os.path.join(script_dir, "chamfer_distance.cu"),
]

# Compiled extension exposing forward/forward_cuda and backward/backward_cuda.
cd = load(name="cd", sources=sources)
12
+
13
+
14
class ChamferDistanceFunction(torch.autograd.Function):
    """Autograd wrapper around the compiled chamfer-distance kernels.

    ``forward`` returns, for a pair of point clouds, the per-point squared
    distances in both directions; ``backward`` routes gradients through the
    nearest-neighbour indices saved by the forward pass.
    """

    @staticmethod
    def forward(ctx, xyz1, xyz2):
        """Compute both directed distance fields.

        Args:
            xyz1: (B, N, 3) point cloud.
            xyz2: (B, M, 3) point cloud.
        Returns:
            (dist1, dist2): (B, N) and (B, M) squared nearest-neighbour
            distances.
        """
        batchsize, n, _ = xyz1.size()
        _, m, _ = xyz2.size()
        xyz1 = xyz1.contiguous()
        xyz2 = xyz2.contiguous()

        # Allocate outputs directly on the inputs' device.  The previous
        # implementation allocated on CPU and called .cuda(), which targets
        # the *default* GPU and misbehaves when the inputs live on another
        # device; it also performed a needless host->device copy.
        device = xyz1.device
        dist1 = torch.zeros(batchsize, n, device=device)
        dist2 = torch.zeros(batchsize, m, device=device)
        idx1 = torch.zeros(batchsize, n, dtype=torch.int, device=device)
        idx2 = torch.zeros(batchsize, m, dtype=torch.int, device=device)

        if not xyz1.is_cuda:
            cd.forward(xyz1, xyz2, dist1, dist2, idx1, idx2)
        else:
            cd.forward_cuda(xyz1, xyz2, dist1, dist2, idx1, idx2)

        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
        return dist1, dist2

    @staticmethod
    def backward(ctx, graddist1, graddist2):
        """Scatter the incoming gradients back to the two point clouds."""
        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors

        graddist1 = graddist1.contiguous()
        graddist2 = graddist2.contiguous()

        # Same device-correctness fix as in forward: build the gradient
        # buffers where the saved tensors live.
        gradxyz1 = torch.zeros(xyz1.size(), device=xyz1.device)
        gradxyz2 = torch.zeros(xyz2.size(), device=xyz2.device)

        if not graddist1.is_cuda:
            cd.backward(
                xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2
            )
        else:
            cd.backward_cuda(
                xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2
            )

        return gradxyz1, gradxyz2
62
+
63
+
64
class ChamferDistance(torch.nn.Module):
    """nn.Module front-end for :class:`ChamferDistanceFunction`."""

    def forward(self, xyz1, xyz2):
        """Return the pair of directed distance fields for the two clouds."""
        distances = ChamferDistanceFunction.apply(xyz1, xyz2)
        return distances
@@ -0,0 +1,41 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ import _emd_ext._emd as emd
5
+
6
+
7
class EMDFunction(torch.autograd.Function):
    """Autograd wrapper around the compiled approximate-EMD kernels."""

    @staticmethod
    def forward(ctx, xyz1, xyz2):
        # cost: matching cost per batch element; match: the soft assignment
        # matrix, needed to compute gradients in backward.
        cost, match = emd.emd_forward(xyz1, xyz2)
        ctx.save_for_backward(xyz1, xyz2, match)
        return cost

    @staticmethod
    def backward(ctx, grad_output):
        xyz1, xyz2, match = ctx.saved_tensors
        grad_xyz1, grad_xyz2 = emd.emd_backward(xyz1, xyz2, match)
        # Chain rule: emd_backward returns d(cost)/d(xyz); each batch entry
        # must be scaled by the incoming gradient.  The original ignored
        # grad_output, which is only correct when the caller reduces the
        # cost with a plain unweighted sum.
        # NOTE(review): assumes cost is one scalar per batch element and
        # xyz gradients are (B, N, 3) — confirm against the extension.
        grad_output = grad_output.view(-1, 1, 1)
        return grad_xyz1 * grad_output, grad_xyz2 * grad_output
20
+
21
+
22
+
23
+
24
class EMDLoss(nn.Module):
    """Approximate Earth Mover's Distance between two point sets.

    Implementation limitations (the CUDA kernels stage data in shared
    memory, which the hardware caps at 48 kB):
      * double tensors must have <= 11 dimensions
      * float tensors must have <= 23 dimensions
    """

    def __init__(self):
        super(EMDLoss, self).__init__()

    def forward(self, xyz1, xyz2):
        """Validate the two clouds and dispatch to the autograd function."""
        dims_match = xyz1.shape[-1] == xyz2.shape[-1]
        assert dims_match, 'Both point sets must have the same dimensions!'
        counts_match = xyz1.shape[1] == xyz2.shape[1]
        assert counts_match, 'Both Point Clouds must have same number of points in it.'
        return EMDFunction.apply(xyz1, xyz2)
@@ -0,0 +1,347 @@
1
+ #ifndef EMD_CUH_
2
+ #define EMD_CUH_
3
+
4
+ #include "cuda_helper.h"
5
+
6
// Iteratively computes a soft (approximate) matching between two point clouds
// for the Earth Mover's Distance.
//
//   xyz1  : (b, n, 3), xyz2 : (b, m, 3)
//   match : (b, m, n) out — soft assignment weights (row index l over xyz2,
//           column index k over xyz1, laid out as match[i*n*m + l*n + k])
//   temp  : per-block scratch in global memory, (n+m)*2 entries per block,
//           partitioned below into remainL/remainR/ratioL/ratioR
//
// One block handles one batch entry at a time (grid-stride over i).  Each of
// the 10 outer iterations (j = 7 .. -2) runs a softmax-like assignment at a
// progressively flatter temperature `level = -4^j` (exactly 0 on the last
// iteration), transferring "mass" from the remaining capacity of xyz1 points
// (remainL) to that of xyz2 points (remainR).
//
// NOTE(review): multiL/multiR are computed with integer-style division
// (n/m, m/n) even though T is floating point at call time; if n is not a
// multiple of m (or vice versa) the capacities will not balance exactly —
// confirm intended behaviour.
template<typename T>
__global__ void approxmatch(const int b, const int n, const int m, const T * __restrict__ xyz1, const T * __restrict__ xyz2, T * __restrict__ match, T * temp){
    // Carve four arrays out of this block's slice of the scratch buffer.
    T * remainL=temp+blockIdx.x*(n+m)*2, * remainR=temp+blockIdx.x*(n+m)*2+n,*ratioL=temp+blockIdx.x*(n+m)*2+n+m,*ratioR=temp+blockIdx.x*(n+m)*2+n+m+n;
    T multiL,multiR;
    if (n>=m){
        multiL=1;
        multiR=n/m;
    }else{
        multiL=m/n;
        multiR=1;
    }
    const int Block=1024;            // shared-memory tile size (points)
    __shared__ T buf[Block*4];       // x, y, z, weight per staged point
    for (int i=blockIdx.x;i<b;i+=gridDim.x){
        // Reset the assignment and per-point capacities for this batch entry.
        for (int j=threadIdx.x;j<n*m;j+=blockDim.x)
            match[i*n*m+j]=0;
        for (int j=threadIdx.x;j<n;j+=blockDim.x)
            remainL[j]=multiL;
        for (int j=threadIdx.x;j<m;j+=blockDim.x)
            remainR[j]=multiR;
        __syncthreads();
        for (int j=7;j>=-2;j--){
            T level=-powf(4.0f,j);   // annealing temperature; 0 on final pass
            if (j==-2){
                level=0;
            }
            // Phase 1: for every xyz1 point k, compute the softmax
            // normaliser over xyz2 and store remainL[k]/suml in ratioL.
            for (int k0=0;k0<n;k0+=blockDim.x){
                int k=k0+threadIdx.x;
                T x1=0,y1=0,z1=0;
                if (k<n){
                    x1=xyz1[i*n*3+k*3+0];
                    y1=xyz1[i*n*3+k*3+1];
                    z1=xyz1[i*n*3+k*3+2];
                }
                T suml=T(1e-9f);     // epsilon guards the division below
                for (int l0=0;l0<m;l0+=Block){
                    int lend=min(m,l0+Block)-l0;
                    // Stage a tile of xyz2 (+ its remaining capacity).
                    for (int l=threadIdx.x;l<lend;l+=blockDim.x){
                        T x2=xyz2[i*m*3+l0*3+l*3+0];
                        T y2=xyz2[i*m*3+l0*3+l*3+1];
                        T z2=xyz2[i*m*3+l0*3+l*3+2];
                        buf[l*4+0]=x2;
                        buf[l*4+1]=y2;
                        buf[l*4+2]=z2;
                        buf[l*4+3]=remainR[l0+l];
                    }
                    __syncthreads();
                    for (int l=0;l<lend;l++){
                        T x2=buf[l*4+0];
                        T y2=buf[l*4+1];
                        T z2=buf[l*4+2];
                        T d=level*((x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1));
                        T w=__expf(d)*buf[l*4+3];
                        suml+=w;
                    }
                    __syncthreads();
                }
                if (k<n)
                    ratioL[k]=remainL[k]/suml;
            }
            // (A straightforward, untiled reference version of the loop
            // above was kept as dead code in the original; removed here.)
            __syncthreads();
            // Phase 2: for every xyz2 point l, compute how much mass would
            // flow in, clamp by its remaining capacity, and update ratioR
            // and remainR.
            for (int l0=0;l0<m;l0+=blockDim.x){
                int l=l0+threadIdx.x;
                T x2=0,y2=0,z2=0;
                if (l<m){
                    x2=xyz2[i*m*3+l*3+0];
                    y2=xyz2[i*m*3+l*3+1];
                    z2=xyz2[i*m*3+l*3+2];
                }
                T sumr=0;
                for (int k0=0;k0<n;k0+=Block){
                    int kend=min(n,k0+Block)-k0;
                    for (int k=threadIdx.x;k<kend;k+=blockDim.x){
                        buf[k*4+0]=xyz1[i*n*3+k0*3+k*3+0];
                        buf[k*4+1]=xyz1[i*n*3+k0*3+k*3+1];
                        buf[k*4+2]=xyz1[i*n*3+k0*3+k*3+2];
                        buf[k*4+3]=ratioL[k0+k];
                    }
                    __syncthreads();
                    for (int k=0;k<kend;k++){
                        T x1=buf[k*4+0];
                        T y1=buf[k*4+1];
                        T z1=buf[k*4+2];
                        T w=__expf(level*((x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1)))*buf[k*4+3];
                        sumr+=w;
                    }
                    __syncthreads();
                }
                if (l<m){
                    sumr*=remainR[l];
                    T consumption=fminf(remainR[l]/(sumr+1e-9f),1.0f);
                    ratioR[l]=consumption*remainR[l];
                    remainR[l]=fmaxf(0.0f,remainR[l]-sumr);
                }
            }
            __syncthreads();
            // Phase 3: accumulate the transferred mass into `match` and
            // reduce the remaining capacity of each xyz1 point.
            for (int k0=0;k0<n;k0+=blockDim.x){
                int k=k0+threadIdx.x;
                T x1=0,y1=0,z1=0;
                if (k<n){
                    x1=xyz1[i*n*3+k*3+0];
                    y1=xyz1[i*n*3+k*3+1];
                    z1=xyz1[i*n*3+k*3+2];
                }
                T suml=0;
                for (int l0=0;l0<m;l0+=Block){
                    int lend=min(m,l0+Block)-l0;
                    for (int l=threadIdx.x;l<lend;l+=blockDim.x){
                        buf[l*4+0]=xyz2[i*m*3+l0*3+l*3+0];
                        buf[l*4+1]=xyz2[i*m*3+l0*3+l*3+1];
                        buf[l*4+2]=xyz2[i*m*3+l0*3+l*3+2];
                        buf[l*4+3]=ratioR[l0+l];
                    }
                    __syncthreads();
                    // NOTE(review): this read is not guarded by `k<n`; for
                    // threads past the end it reads beyond ratioL's n valid
                    // entries (into adjacent scratch). The value is unused
                    // because the loop below is guarded, but confirm it
                    // cannot run past the temp allocation.
                    T rl=ratioL[k];
                    if (k<n){
                        for (int l=0;l<lend;l++){
                            T x2=buf[l*4+0];
                            T y2=buf[l*4+1];
                            T z2=buf[l*4+2];
                            T w=__expf(level*((x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1)))*rl*buf[l*4+3];
                            match[i*n*m+(l0+l)*n+k]+=w;
                            suml+=w;
                        }
                    }
                    __syncthreads();
                }
                if (k<n)
                    remainL[k]=fmaxf(0.0f,remainL[k]-suml);
            }
            __syncthreads();
        }
    }
}
186
+
187
// Host-side launcher for the approxmatch kernel: dispatches on the tensor's
// floating-point dtype, synchronizes, and aborts (via CUDA_CHECK) on error.
// NOTE(review): `.type()` and `.data<scalar_t>()` are deprecated ATen APIs
// (modern equivalents: `.scalar_type()` / `.data_ptr<scalar_t>()`); kept as-is
// for compatibility with the PyTorch version this package targets.
void approxmatchLauncher(const int b, const int n, const int m, const at::Tensor xyz1, const at::Tensor xyz2, at::Tensor match, at::Tensor temp){
    AT_DISPATCH_FLOATING_TYPES(match.type(), "approxmatch", ([&] {
        approxmatch
            <<<32, 512>>>(
                b, n, m,
                xyz1.data<scalar_t>(),
                xyz2.data<scalar_t>(),
                match.data<scalar_t>(),
                temp.data<scalar_t>());
    }));
    // The kernel writes scratch state consumed by later launches; wait for it.
    cudaDeviceSynchronize();
    CUDA_CHECK(cudaGetLastError())
}
200
+
201
// Given the soft assignment `match` produced by approxmatch, computes the
// total matching cost per batch entry:
//   out[i] = sum_{k,l} |xyz1[i,k] - xyz2[i,l]| * match[i, l, k]
// One block per batch entry (grid-stride over i); xyz2 is tiled through
// shared memory and per-thread partial sums are combined with a tree
// reduction in `allsum`.
template<typename T>
__global__ void matchcost(const int b, const int n, const int m, const T * __restrict__ xyz1, const T * __restrict__ xyz2, const T * __restrict__ match, T * __restrict__ out){
    __shared__ T allsum[512];        // one partial sum per thread (blockDim.x <= 512)
    const int Block=1024;            // xyz2 tile size (points)
    __shared__ T buf[Block*3];
    for (int i=blockIdx.x;i<b;i+=gridDim.x){
        T subsum=0;                  // this thread's share of the cost
        for (int k0=0;k0<n;k0+=blockDim.x){
            int k=k0+threadIdx.x;
            T x1=0,y1=0,z1=0;
            if (k<n){
                x1=xyz1[i*n*3+k*3+0];
                y1=xyz1[i*n*3+k*3+1];
                z1=xyz1[i*n*3+k*3+2];
            }
            for (int l0=0;l0<m;l0+=Block){
                int lend=min(m,l0+Block)-l0;
                // Stage a tile of xyz2 coordinates.
                for (int l=threadIdx.x;l<lend*3;l+=blockDim.x)
                    buf[l]=xyz2[i*m*3+l0*3+l];
                __syncthreads();
                if (k<n){
                    for (int l=0;l<lend;l++){
                        T x2=buf[l*3+0];
                        T y2=buf[l*3+1];
                        T z2=buf[l*3+2];
                        // Euclidean (not squared) distance, weighted by the match.
                        T d=sqrtf((x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1));
                        subsum+=d*match[i*n*m+(l0+l)*n+k];
                    }
                }
                __syncthreads();
            }
        }
        // Tree reduction of the per-thread partial sums.
        allsum[threadIdx.x]=subsum;
        for (int j=1;j<blockDim.x;j<<=1){
            __syncthreads();
            if ((threadIdx.x&j)==0 && threadIdx.x+j<blockDim.x){
                allsum[threadIdx.x]+=allsum[threadIdx.x+j];
            }
        }
        if (threadIdx.x==0)
            out[i]=allsum[0];
        __syncthreads();
    }
}
245
+
246
// Host-side launcher for the matchcost kernel (total EMD cost per batch
// entry).  Dispatches on the floating-point dtype of xyz1 and aborts (via
// CUDA_CHECK) if the launch failed.
// NOTE(review): `.type()` / `.data<scalar_t>()` are deprecated ATen APIs;
// kept for compatibility with the targeted PyTorch version.
void matchcostLauncher(const int b, const int n, const int m, const at::Tensor xyz1, const at::Tensor xyz2, const at::Tensor match, at::Tensor out){
    AT_DISPATCH_FLOATING_TYPES(xyz1.type(), "matchcost", ([&] {
        matchcost<<<32, 512>>>(
            b, n, m,
            xyz1.data<scalar_t>(),
            xyz2.data<scalar_t>(),
            match.data<scalar_t>(),
            out.data<scalar_t>());
    }));
    CUDA_CHECK(cudaGetLastError())
}
257
+
258
// Gradient of the matching cost with respect to xyz2.
// For each xyz2 point k:
//   grad2[i,k] = sum_j match[i,k,j] * (p2 - p1_j) / |p2 - p1_j|
// blockIdx.y partitions the m points of xyz2 between blocks; within a block,
// threads split the inner sum over xyz1 and combine partial (x,y,z) sums
// with a shared-memory tree reduction (blockDim.x must be <= 256 to fit
// `sum_grad`).
template <typename T>
__global__ void matchcostgrad2(const int b, const int n, const int m,const T * __restrict__ xyz1, const T * __restrict__ xyz2, const T * __restrict__ match, T * __restrict__ grad2){
    __shared__ T sum_grad[256*3];
    for (int i=blockIdx.x;i<b;i+=gridDim.x){
        // This block's slice [kbeg, kend) of the xyz2 points.
        int kbeg=m*blockIdx.y/gridDim.y;
        int kend=m*(blockIdx.y+1)/gridDim.y;
        for (int k=kbeg;k<kend;k++){
            T x2=xyz2[(i*m+k)*3+0];
            T y2=xyz2[(i*m+k)*3+1];
            T z2=xyz2[(i*m+k)*3+2];
            T subsumx=0,subsumy=0,subsumz=0;
            for (int j=threadIdx.x;j<n;j+=blockDim.x){
                // (p2 - p1); rsqrtf with a floor avoids division by zero
                // for coincident points.
                T x1=x2-xyz1[(i*n+j)*3+0];
                T y1=y2-xyz1[(i*n+j)*3+1];
                T z1=z2-xyz1[(i*n+j)*3+2];
                T d=match[i*n*m+k*n+j]*rsqrtf(fmaxf(x1*x1+y1*y1+z1*z1,1e-20f));
                subsumx+=x1*d;
                subsumy+=y1*d;
                subsumz+=z1*d;
            }
            sum_grad[threadIdx.x*3+0]=subsumx;
            sum_grad[threadIdx.x*3+1]=subsumy;
            sum_grad[threadIdx.x*3+2]=subsumz;
            // Tree reduction across the block's threads.
            for (int j=1;j<blockDim.x;j<<=1){
                __syncthreads();
                int j1=threadIdx.x;
                int j2=threadIdx.x+j;
                if ((j1&j)==0 && j2<blockDim.x){
                    sum_grad[j1*3+0]+=sum_grad[j2*3+0];
                    sum_grad[j1*3+1]+=sum_grad[j2*3+1];
                    sum_grad[j1*3+2]+=sum_grad[j2*3+2];
                }
            }
            if (threadIdx.x==0){
                grad2[(i*m+k)*3+0]=sum_grad[0];
                grad2[(i*m+k)*3+1]=sum_grad[1];
                grad2[(i*m+k)*3+2]=sum_grad[2];
            }
            __syncthreads();
        }
    }
}
300
+
301
// Gradient of the matching cost with respect to xyz1.
// For each xyz1 point l:
//   grad1[i,l] = sum_k match[i,k,l] * (p1 - p2_k) / |p1 - p2_k|
// One thread handles one xyz1 point (strided by blockDim.x) and loops over
// all m points of xyz2; no reduction is needed because each output is owned
// by exactly one thread.
template <typename T>
__global__ void matchcostgrad1(const int b, const int n, const int m, const T * __restrict__ xyz1, const T * __restrict__ xyz2, const T * __restrict__ match, T * __restrict__ grad1){
    for (int i=blockIdx.x;i<b;i+=gridDim.x){
        for (int l=threadIdx.x;l<n;l+=blockDim.x){
            T x1=xyz1[i*n*3+l*3+0];
            T y1=xyz1[i*n*3+l*3+1];
            T z1=xyz1[i*n*3+l*3+2];
            T dx=0,dy=0,dz=0;
            for (int k=0;k<m;k++){
                T x2=xyz2[i*m*3+k*3+0];
                T y2=xyz2[i*m*3+k*3+1];
                T z2=xyz2[i*m*3+k*3+2];
                // Weighted unit vector p1->p2; the 1e-20 floor in rsqrtf
                // guards against coincident points.
                T d=match[i*n*m+k*n+l]*rsqrtf(fmaxf((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2)+(z1-z2)*(z1-z2),1e-20f));
                dx+=(x1-x2)*d;
                dy+=(y1-y2)*d;
                dz+=(z1-z2)*d;
            }
            grad1[i*n*3+l*3+0]=dx;
            grad1[i*n*3+l*3+1]=dy;
            grad1[i*n*3+l*3+2]=dz;
        }
    }
}
324
+
325
// Host-side launcher for the EMD backward pass: fills grad1 (w.r.t. xyz1)
// and grad2 (w.r.t. xyz2) with separate kernel launches, checking for launch
// errors after each.
// NOTE(review): `.type()` / `.data<scalar_t>()` are deprecated ATen APIs;
// kept for compatibility with the targeted PyTorch version.
void matchcostgradLauncher(const int b, const int n, const int m, const at::Tensor xyz1, const at::Tensor xyz2, const at::Tensor match, at::Tensor grad1, at::Tensor grad2){
    AT_DISPATCH_FLOATING_TYPES(xyz1.type(), "matchcostgrad1", ([&] {
        matchcostgrad1<<<32,512>>>(
            b, n, m,
            xyz1.data<scalar_t>(),
            xyz2.data<scalar_t>(),
            match.data<scalar_t>(),
            grad1.data<scalar_t>());
    }));
    CUDA_CHECK(cudaGetLastError())

    AT_DISPATCH_FLOATING_TYPES(xyz1.type(), "matchcostgrad2", ([&] {
        matchcostgrad2<<<dim3(32,32),256>>>(
            b, n, m,
            xyz1.data<scalar_t>(),
            xyz2.data<scalar_t>(),
            match.data<scalar_t>(),
            grad2.data<scalar_t>());
    }));
    CUDA_CHECK(cudaGetLastError())
}
346
+
347
+ #endif
@@ -0,0 +1,18 @@
1
#ifndef CUDA_HELPER_H_
#define CUDA_HELPER_H_

#include <cstdio>   // fprintf
#include <cstdlib>  // std::exit

// Abort the process with a diagnostic if a CUDA call did not succeed.
//
// Fix: the argument is now evaluated exactly once.  The original macro
// expanded `err` twice, so `CUDA_CHECK(cudaGetLastError())` would compare
// the real status but then re-query the (now cleared) error and print
// "no error" instead of the actual failure.
// A bare compound statement (not do/while) is used deliberately: existing
// call sites in this package invoke CUDA_CHECK(...) without a trailing
// semicolon and must keep compiling.
#define CUDA_CHECK(err)                                           \
    {                                                             \
        const cudaError_t cuda_check_status_ = (err);             \
        if (cudaSuccess != cuda_check_status_)                    \
        {                                                         \
            fprintf(stderr, "CUDA kernel failed: %s (%s:%d)\n",   \
                    cudaGetErrorString(cuda_check_status_),       \
                    __FILE__, __LINE__);                          \
            std::exit(-1);                                        \
        }                                                         \
    }

// Argument-validation helpers for ATen tensor inputs.
// NOTE(review): AT_CHECK and .type().is_cuda() are deprecated ATen APIs;
// kept for compatibility with the PyTorch version this package targets.
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), \
        #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), \
        #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

#endif