scs 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +84 -0
- data/ext/scs/Rakefile +11 -0
- data/lib/scs/ffi.rb +117 -0
- data/lib/scs/solver.rb +178 -0
- data/lib/scs/version.rb +3 -0
- data/lib/scs.rb +17 -0
- data/vendor/scs/LICENSE.txt +21 -0
- data/vendor/scs/Makefile +164 -0
- data/vendor/scs/README.md +220 -0
- data/vendor/scs/include/aa.h +56 -0
- data/vendor/scs/include/cones.h +46 -0
- data/vendor/scs/include/ctrlc.h +33 -0
- data/vendor/scs/include/glbopts.h +177 -0
- data/vendor/scs/include/linalg.h +26 -0
- data/vendor/scs/include/linsys.h +64 -0
- data/vendor/scs/include/normalize.h +18 -0
- data/vendor/scs/include/rw.h +17 -0
- data/vendor/scs/include/scs.h +161 -0
- data/vendor/scs/include/scs_blas.h +51 -0
- data/vendor/scs/include/util.h +65 -0
- data/vendor/scs/linsys/amatrix.c +305 -0
- data/vendor/scs/linsys/amatrix.h +36 -0
- data/vendor/scs/linsys/amatrix.o +0 -0
- data/vendor/scs/linsys/cpu/direct/private.c +366 -0
- data/vendor/scs/linsys/cpu/direct/private.h +26 -0
- data/vendor/scs/linsys/cpu/direct/private.o +0 -0
- data/vendor/scs/linsys/cpu/indirect/private.c +256 -0
- data/vendor/scs/linsys/cpu/indirect/private.h +31 -0
- data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
- data/vendor/scs/linsys/external/amd/LICENSE.txt +934 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +469 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +254 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
- data/vendor/scs/linsys/external/amd/amd.h +400 -0
- data/vendor/scs/linsys/external/amd/amd_1.c +180 -0
- data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_2.c +1842 -0
- data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_aat.c +184 -0
- data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_control.c +64 -0
- data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.c +37 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_dump.c +179 -0
- data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_global.c +16 -0
- data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_info.c +119 -0
- data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_internal.h +304 -0
- data/vendor/scs/linsys/external/amd/amd_order.c +199 -0
- data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.c +120 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.c +206 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.c +118 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_valid.c +92 -0
- data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
- data/vendor/scs/linsys/external/amd/changes +11 -0
- data/vendor/scs/linsys/external/qdldl/LICENSE +201 -0
- data/vendor/scs/linsys/external/qdldl/README.md +120 -0
- data/vendor/scs/linsys/external/qdldl/changes +4 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.c +298 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.h +177 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
- data/vendor/scs/linsys/external/qdldl/qdldl_types.h +21 -0
- data/vendor/scs/linsys/gpu/gpu.c +41 -0
- data/vendor/scs/linsys/gpu/gpu.h +85 -0
- data/vendor/scs/linsys/gpu/indirect/private.c +304 -0
- data/vendor/scs/linsys/gpu/indirect/private.h +36 -0
- data/vendor/scs/scs.mk +181 -0
- data/vendor/scs/src/aa.c +224 -0
- data/vendor/scs/src/aa.o +0 -0
- data/vendor/scs/src/cones.c +802 -0
- data/vendor/scs/src/cones.o +0 -0
- data/vendor/scs/src/ctrlc.c +77 -0
- data/vendor/scs/src/ctrlc.o +0 -0
- data/vendor/scs/src/linalg.c +84 -0
- data/vendor/scs/src/linalg.o +0 -0
- data/vendor/scs/src/normalize.c +93 -0
- data/vendor/scs/src/normalize.o +0 -0
- data/vendor/scs/src/rw.c +167 -0
- data/vendor/scs/src/rw.o +0 -0
- data/vendor/scs/src/scs.c +975 -0
- data/vendor/scs/src/scs.o +0 -0
- data/vendor/scs/src/scs_version.c +5 -0
- data/vendor/scs/src/scs_version.o +0 -0
- data/vendor/scs/src/util.c +196 -0
- data/vendor/scs/src/util.o +0 -0
- data/vendor/scs/test/data/small_random_socp +0 -0
- data/vendor/scs/test/minunit.h +13 -0
- data/vendor/scs/test/problem_utils.h +93 -0
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +85 -0
- data/vendor/scs/test/problems/small_lp.h +50 -0
- data/vendor/scs/test/problems/small_random_socp.h +33 -0
- data/vendor/scs/test/random_socp_prob.c +171 -0
- data/vendor/scs/test/run_from_file.c +69 -0
- data/vendor/scs/test/run_tests +2 -0
- data/vendor/scs/test/run_tests.c +32 -0
- metadata +203 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
#ifndef SCSGPU_H_GUARD
#define SCSGPU_H_GUARD

#ifdef __cplusplus
extern "C" {
#endif

#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cusparse.h>

#include "amatrix.h"
#include "glbopts.h"
#include "linalg.h"
#include "linsys.h"
#include "scs.h"
#include "util.h"

/* Print the most recent CUDA error (tagged with file/line/function) if one
 * is pending. Note: cudaGetLastError() also clears the sticky error state. */
#define CUDA_CHECK_ERR \
  do { \
    cudaError_t err = cudaGetLastError(); \
    if (err != cudaSuccess) { \
      printf("%s:%d:%s\n ERROR_CUDA: %s\n", __FILE__, __LINE__, __func__, \
             cudaGetErrorString(err)); \
    } \
  } while (0)

/* CUBLAS(x)/CUSPARSE(x) select the single- vs double-precision library entry
 * point based on SFLOAT. When EXTRA_VERBOSE is defined they additionally run
 * CUDA_CHECK_ERR before each call.
 * NOTE(review): the EXTRA_VERBOSE variants expand to TWO statements, which is
 * unsafe inside an unbraced if/else — callers must use braces. */
#ifndef EXTRA_VERBOSE
#ifndef SFLOAT
#define CUBLAS(x) cublasD##x
#define CUSPARSE(x) cusparseD##x
#else
#define CUBLAS(x) cublasS##x
#define CUSPARSE(x) cusparseS##x
#endif
#else
#ifndef SFLOAT
#define CUBLAS(x) \
  CUDA_CHECK_ERR; \
  cublasD##x
#define CUSPARSE(x) \
  CUDA_CHECK_ERR; \
  cusparseD##x
#else
#define CUBLAS(x) \
  CUDA_CHECK_ERR; \
  cublasS##x
#define CUSPARSE(x) \
  CUDA_CHECK_ERR; \
  cusparseS##x
#endif
#endif

/*
 CUDA matrix routines only for CSR, not CSC matrices:
    CSC             CSR             GPU     Mult
    A (m x n)       A' (n x m)      Ag      accum_by_a_trans_gpu
    A'(n x m)       A (m x n)       Agt     accum_by_a_gpu
*/

/* this struct defines the data matrix A on GPU */
typedef struct SCS_GPU_A_DATA_MATRIX {
  /* A is supplied in column compressed format */
  scs_float *x; /* A values, size: NNZ A */
  scs_int *i;   /* A row index, size: NNZ A */
  scs_int *p;   /* A column pointer, size: n+1 */
  scs_int m, n; /* m rows, n cols */
  scs_int Annz; /* num non-zeros in A matrix */
  /* CUDA */
  cusparseMatDescr_t descr;
} ScsGpuMatrix;

/* accumulate A' * x into y (per the "accum" naming); x, y are device
 * pointers — presumably length m and n respectively, matching the callers */
void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *A, const scs_float *x,
                               scs_float *y, cusparseHandle_t cusparse_handle);

/* accumulate A * x into y; x, y are device pointers */
void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *A, const scs_float *x,
                          scs_float *y, cusparseHandle_t cusparse_handle);

/* free the device arrays and cusparse descriptor owned by A
 * (does not free the ScsGpuMatrix struct itself) */
void SCS(free_gpu_matrix)(ScsGpuMatrix *A);

#ifdef __cplusplus
}
#endif
#endif
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
#include "private.h"
|
|
2
|
+
|
|
3
|
+
#define CG_BEST_TOL 1e-9
|
|
4
|
+
#define CG_MIN_TOL 1e-1
|
|
5
|
+
|
|
6
|
+
/* do not use within pcg, reuses memory */
|
|
7
|
+
void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
|
|
8
|
+
const scs_float *x, scs_float *y) {
|
|
9
|
+
scs_float *v_m = p->tmp_m;
|
|
10
|
+
scs_float *v_n = p->r;
|
|
11
|
+
cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
12
|
+
cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
13
|
+
SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
|
|
14
|
+
cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
/* do not use within pcg, reuses memory */
|
|
18
|
+
void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
|
|
19
|
+
scs_float *y) {
|
|
20
|
+
scs_float *v_m = p->tmp_m;
|
|
21
|
+
scs_float *v_n = p->r;
|
|
22
|
+
cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
23
|
+
cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
24
|
+
#if GPU_TRANSPOSE_MAT > 0
|
|
25
|
+
SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
|
|
26
|
+
#else
|
|
27
|
+
SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
|
|
28
|
+
#endif
|
|
29
|
+
cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
|
|
33
|
+
char *str = (char *)scs_malloc(sizeof(char) * 128);
|
|
34
|
+
sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
|
|
35
|
+
(long)A->p[A->n], stgs->cg_rate);
|
|
36
|
+
return str;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/* Return a heap-allocated summary of CG statistics since the last call and
 * reset the counters (caller frees the string). Returns SCS_NULL if
 * allocation fails — the original dereferenced an unchecked scs_malloc
 * result. Counters are intentionally NOT reset on the failure path. */
char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
  char *str = (char *)scs_malloc(sizeof(char) * 128);
  if (!str) {
    return SCS_NULL;
  }
  /* averages are per outer iteration; time converted from ms to s (/1e3) */
  sprintf(str,
          "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
          (scs_float)p->tot_cg_its / (info->iter + 1),
          p->total_solve_time / (info->iter + 1) / 1e3);
  /* reset running totals for the next reporting window */
  p->tot_cg_its = 0;
  p->total_solve_time = 0;
  return str;
}
|
|
49
|
+
|
|
50
|
+
/* Release every CPU/GPU resource owned by the workspace. Safe to call with
 * NULL, and with partially-initialized workspaces (zeroed pointers). */
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
  if (!p) {
    return;
  }
  /* device scratch vectors (cudaFree tolerates NULL) */
  cudaFree(p->p);
  cudaFree(p->r);
  cudaFree(p->Gp);
  cudaFree(p->bg);
  cudaFree(p->tmp_m);
  cudaFree(p->z);
  cudaFree(p->M);
  /* GPU copies of A and (optionally) A' */
  if (p->Ag) {
    SCS(free_gpu_matrix)(p->Ag);
    scs_free(p->Ag);
  }
  if (p->Agt) {
    SCS(free_gpu_matrix)(p->Agt);
    scs_free(p->Agt);
  }
  /* library handles */
  cusparseDestroy(p->cusparse_handle);
  cublasDestroy(p->cublas_handle);
  /* Don't reset because it interferes with other GPU programs. */
  /* cudaDeviceReset(); */
  scs_free(p);
}
|
|
74
|
+
|
|
75
|
+
/*y = (RHO_X * I + A'A)x */
|
|
76
|
+
static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
|
|
77
|
+
ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
|
78
|
+
/* x and y MUST already be loaded to GPU */
|
|
79
|
+
scs_float *tmp_m = p->tmp_m; /* temp memory */
|
|
80
|
+
cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
|
|
81
|
+
SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
|
|
82
|
+
cudaMemset(y, 0, A->n * sizeof(scs_float));
|
|
83
|
+
SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
|
|
84
|
+
CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/* M = inv ( diag ( RHO_X * I + A'A ) ) */
|
|
88
|
+
static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
|
|
89
|
+
ScsLinSysWork *p) {
|
|
90
|
+
scs_int i;
|
|
91
|
+
scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
|
|
92
|
+
|
|
93
|
+
#if EXTRA_VERBOSE > 0
|
|
94
|
+
scs_printf("getting pre-conditioner\n");
|
|
95
|
+
#endif
|
|
96
|
+
|
|
97
|
+
for (i = 0; i < A->n; ++i) {
|
|
98
|
+
M[i] = 1 / (stgs->rho_x +
|
|
99
|
+
SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
|
|
100
|
+
/* M[i] = 1; */
|
|
101
|
+
}
|
|
102
|
+
cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
103
|
+
scs_free(M);
|
|
104
|
+
|
|
105
|
+
#if EXTRA_VERBOSE > 0
|
|
106
|
+
scs_printf("finished getting pre-conditioner\n");
|
|
107
|
+
#endif
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
111
|
+
const ScsSettings *stgs) {
|
|
112
|
+
cudaError_t err;
|
|
113
|
+
ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
|
114
|
+
ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
|
115
|
+
|
|
116
|
+
p->cublas_handle = 0;
|
|
117
|
+
p->cusparse_handle = 0;
|
|
118
|
+
|
|
119
|
+
p->total_solve_time = 0;
|
|
120
|
+
p->tot_cg_its = 0;
|
|
121
|
+
|
|
122
|
+
/* Get handle to the CUBLAS context */
|
|
123
|
+
cublasCreate(&p->cublas_handle);
|
|
124
|
+
|
|
125
|
+
/* Get handle to the CUSPARSE context */
|
|
126
|
+
cusparseCreate(&p->cusparse_handle);
|
|
127
|
+
|
|
128
|
+
Ag->n = A->n;
|
|
129
|
+
Ag->m = A->m;
|
|
130
|
+
Ag->Annz = A->p[A->n];
|
|
131
|
+
Ag->descr = 0;
|
|
132
|
+
/* Matrix description */
|
|
133
|
+
cusparseCreateMatDescr(&Ag->descr);
|
|
134
|
+
cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
|
135
|
+
cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
|
|
136
|
+
p->Ag = Ag;
|
|
137
|
+
p->Agt = SCS_NULL;
|
|
138
|
+
|
|
139
|
+
cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
|
|
140
|
+
cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
|
|
141
|
+
cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
|
|
142
|
+
|
|
143
|
+
cudaMalloc((void **)&p->p, A->n * sizeof(scs_float));
|
|
144
|
+
cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
|
|
145
|
+
cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
|
|
146
|
+
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
|
|
147
|
+
cudaMalloc((void **)&p->tmp_m,
|
|
148
|
+
A->m * sizeof(scs_float)); /* intermediate result */
|
|
149
|
+
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
|
|
150
|
+
cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
|
|
151
|
+
|
|
152
|
+
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
|
|
153
|
+
cudaMemcpyHostToDevice);
|
|
154
|
+
cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
|
|
155
|
+
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
|
|
156
|
+
cudaMemcpyHostToDevice);
|
|
157
|
+
|
|
158
|
+
get_preconditioner(A, stgs, p);
|
|
159
|
+
|
|
160
|
+
#if GPU_TRANSPOSE_MAT > 0
|
|
161
|
+
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
|
162
|
+
p->Agt->n = A->m;
|
|
163
|
+
p->Agt->m = A->n;
|
|
164
|
+
p->Agt->Annz = A->p[A->n];
|
|
165
|
+
p->Agt->descr = 0;
|
|
166
|
+
/* Matrix description */
|
|
167
|
+
cusparseCreateMatDescr(&p->Agt->descr);
|
|
168
|
+
cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
|
169
|
+
cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
|
|
170
|
+
|
|
171
|
+
cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
|
|
172
|
+
cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
|
|
173
|
+
cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
|
|
174
|
+
/* transpose Ag into Agt for faster multiplies */
|
|
175
|
+
/* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
|
|
176
|
+
CUSPARSE(csr2csc)
|
|
177
|
+
(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
|
|
178
|
+
p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
|
|
179
|
+
#endif
|
|
180
|
+
|
|
181
|
+
err = cudaGetLastError();
|
|
182
|
+
if (err != cudaSuccess) {
|
|
183
|
+
printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
|
|
184
|
+
cudaGetErrorString(err));
|
|
185
|
+
SCS(free_lin_sys_work)(p);
|
|
186
|
+
return SCS_NULL;
|
|
187
|
+
}
|
|
188
|
+
return p;
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
/* z = M .* r element-wise on the GPU: a triangular banded multiply with
 * bandwidth 0 acts as a diagonal scaling by the entries stored in diag. */
static void apply_pre_conditioner(cublasHandle_t handle, scs_float *diag,
                                  scs_float *z, scs_float *r, scs_int n) {
  /* seed z with r, then scale z in place by the diagonal */
  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
  CUBLAS(tbmv)(handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
               CUBLAS_DIAG_NON_UNIT, n, 0, diag, 1, z, 1);
}
|
|
198
|
+
|
|
199
|
+
/* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
/* Preconditioned conjugate gradient. On entry bg holds b (device, length n);
 * on exit bg holds the solution x. s is an optional warm start on the HOST
 * (SCS_NULL for a cold start). Returns the number of CG iterations run.
 * Clobbers the workspace buffers p, r, z, Gp (and tmp_m via mat_vec). */
static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
                   ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
                   scs_int max_its, scs_float tol) {
  scs_int i, n = A->n;
  scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
  scs_float onef = 1.0, neg_onef = -1.0;
  scs_float *p = pr->p;   /* cg direction */
  scs_float *Gp = pr->Gp; /* updated CG direction */
  scs_float *r = pr->r;   /* cg residual */
  scs_float *z = pr->z;   /* preconditioned */
  scs_float *M = pr->M;   /* preconditioner */
  cublasHandle_t cublas_handle = pr->cublas_handle;

  if (s == SCS_NULL) {
    /* cold start: x0 = 0 so the initial residual is just b */
    cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
    cudaMemset(bg, 0, n * sizeof(scs_float));
  } else {
    /* p contains bg temporarily */
    cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
    /* bg contains s */
    cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
    /* r = b - G*s computed as r = -(G*s - b) */
    mat_vec(A, stgs, pr, bg, r);
    CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
    CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
  }

  /* for some reason nrm2 is VERY slow */
  /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
  CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
  nrm_r = SQRTF(nrm_r);
  /* check to see if we need to run CG at all */
  if (nrm_r < MIN(tol, 1e-18)) {
    return 0;
  }

  /* z = M r; ipzr = <r, z> drives the CG step/conjugation coefficients */
  apply_pre_conditioner(cublas_handle, M, z, r, n);
  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
  /* put z in p, replacing temp mem */
  cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);

  for (i = 0; i < max_its; ++i) {
    /* Gp = G p */
    mat_vec(A, stgs, pr, p, Gp);

    CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);

    /* step length along the search direction */
    alpha = ipzr / p_gp;
    neg_alpha = -alpha;

    /* x += alpha p ; r -= alpha G p */
    CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
    CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);

    /* for some reason nrm2 is VERY slow */
    /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
    CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
    nrm_r = SQRTF(nrm_r);
    if (nrm_r < tol) {
      /* count the iteration that achieved convergence */
      i++;
      break;
    }
    ipzr_old = ipzr;
    apply_pre_conditioner(cublas_handle, M, z, r, n);
    CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);

    /* p = z + beta p (new conjugate search direction) */
    beta = ipzr / ipzr_old;
    CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
    CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
  }
#if EXTRA_VERBOSE > 0
  scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
#endif
  return i;
}
|
|
272
|
+
|
|
273
|
+
scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
|
|
274
|
+
ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
275
|
+
scs_int iter) {
|
|
276
|
+
scs_int cg_its;
|
|
277
|
+
SCS(timer) linsys_timer;
|
|
278
|
+
scs_float *bg = p->bg;
|
|
279
|
+
scs_float neg_onef = -1.0;
|
|
280
|
+
ScsGpuMatrix *Ag = p->Ag;
|
|
281
|
+
scs_float cg_tol =
|
|
282
|
+
SCS(norm)(b, Ag->n) *
|
|
283
|
+
(iter < 0 ? CG_BEST_TOL
|
|
284
|
+
: CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
|
|
285
|
+
SCS(tic)(&linsys_timer);
|
|
286
|
+
/* all on GPU */
|
|
287
|
+
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
288
|
+
SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
|
|
289
|
+
/* solves (I+A'A)x = b, s warm start, solution stored in b */
|
|
290
|
+
cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
|
|
291
|
+
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
|
|
292
|
+
SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
|
|
293
|
+
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
294
|
+
|
|
295
|
+
if (iter >= 0) {
|
|
296
|
+
p->tot_cg_its += cg_its;
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
p->total_solve_time += SCS(tocq)(&linsys_timer);
|
|
300
|
+
#if EXTRAVERBOSE > 0
|
|
301
|
+
scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
|
|
302
|
+
#endif
|
|
303
|
+
return 0;
|
|
304
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
#ifndef PRIV_H_GUARD
#define PRIV_H_GUARD

#ifdef __cplusplus
extern "C" {
#endif

#include "gpu.h"
#include "glbopts.h"
#include "linalg.h"
#include "scs.h"

/* Workspace for the GPU indirect (conjugate-gradient) linear solver. */
struct SCS_LIN_SYS_WORK {
  /* reporting */
  scs_int tot_cg_its;         /* CG iterations accumulated since last report */
  scs_float total_solve_time; /* solve time accumulated since last report */
  /* ALL BELOW HOSTED ON THE GPU */
  scs_float *p;     /* cg iterate, n */
  scs_float *r;     /* cg residual, n */
  scs_float *Gp;    /* G * p, n */
  scs_float *bg;    /* b, size n + m (full RHS; pcg uses the first n) */
  scs_float *tmp_m; /* m, used in mat_vec */
  scs_float *z;     /* preconditioned */
  scs_float *M;     /* preconditioner */
  ScsGpuMatrix *Ag;  /* A matrix on GPU */
  ScsGpuMatrix *Agt; /* A trans matrix on GPU */
  /* CUDA */
  cublasHandle_t cublas_handle;
  cusparseHandle_t cusparse_handle;
};

#ifdef __cplusplus
}
#endif
#endif
|
data/vendor/scs/scs.mk
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# Shared build configuration for SCS (platform detection + optional flags).
# Fixes over original: stray `$` removed from the -DNOVALIDATE flag line
# (it expanded an empty one-character variable reference and left a
# dangling token on the compiler command line); comment typos corrected.
ifeq ($(OS),Windows_NT)
UNAME = CYGWINorMINGWorMSYS
else
UNAME = $(shell uname -s)
endif

#CC = gcc
# For cross-compiling with mingw use these.
#CC = i686-w64-mingw32-gcc -m32
#CC = x86_64-w64-mingw32-gcc-4.8
CUCC = $(CC) #Don't need to use nvcc, since using cuda blas APIs

# For GPU must add cuda libs to path, e.g.
# export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH

ifneq (, $(findstring CYGWIN, $(UNAME)))
ISWINDOWS := 1
else
ifneq (, $(findstring MINGW, $(UNAME)))
ISWINDOWS := 1
else
ifneq (, $(findstring MSYS, $(UNAME)))
ISWINDOWS := 1
else
ifneq (, $(findstring mingw, $(CC)))
ISWINDOWS := 1
else
ISWINDOWS := 0
endif
endif
endif
endif

ifeq ($(UNAME), Darwin)
# we're on apple, no need to link rt library
LDFLAGS += -lm
SHARED = dylib
SONAME = -install_name
else
ifeq ($(ISWINDOWS), 1)
# we're on windows (cygwin or msys)
LDFLAGS += -lm
SHARED = dll
SONAME = -soname
else
# we're on a linux system, use accurate timer provided by clock_gettime()
LDFLAGS += -lm -lrt
SHARED = so
SONAME = -soname
endif
endif

#TODO: check if this works for all platforms:
ifeq ($(CUDA_PATH), )
CUDA_PATH=/usr/local/cuda
endif
CULDFLAGS = -L$(CUDA_PATH)/lib -L$(CUDA_PATH)/lib64 -lcudart -lcublas -lcusparse
CUDAFLAGS = $(CFLAGS) -I$(CUDA_PATH)/include -Ilinsys/gpu -Wno-c++11-long-long # turn off annoying long-long warnings in cuda header files

# Add on default CFLAGS
OPT = -O3
override CFLAGS += -g -Wall -Wwrite-strings -pedantic -funroll-loops -Wstrict-prototypes -I. -Iinclude -Ilinsys $(OPT)
ifneq ($(ISWINDOWS), 1)
override CFLAGS += -fPIC
endif

LINSYS = linsys
DIRSRC = $(LINSYS)/cpu/direct
INDIRSRC = $(LINSYS)/cpu/indirect
GPUDIR = $(LINSYS)/gpu/direct
GPUINDIR = $(LINSYS)/gpu/indirect

EXTSRC = $(LINSYS)/external

OUT = out
AR = ar
ARFLAGS = rv
ARCHIVE = $(AR) $(ARFLAGS)
RANLIB = ranlib
INSTALL = install

ifeq ($(PREFIX),)
PREFIX = /usr/local
endif

OPT_FLAGS =
########### OPTIONAL FLAGS ##########
# these can all be overridden from the command line
# e.g. make DLONG=1 will override the setting below
DLONG = 0
ifneq ($(DLONG), 0)
OPT_FLAGS += -DDLONG=$(DLONG) # use longs rather than ints
endif
CTRLC = 1
ifneq ($(CTRLC), 0)
OPT_FLAGS += -DCTRLC=$(CTRLC) # graceful interrupts with ctrl-c
endif
SFLOAT = 0
ifneq ($(SFLOAT), 0)
OPT_FLAGS += -DSFLOAT=$(SFLOAT) # use floats rather than doubles
endif
NOVALIDATE = 0
ifneq ($(NOVALIDATE), 0)
OPT_FLAGS += -DNOVALIDATE=$(NOVALIDATE) # remove data validation step
endif
NOTIMER = 0
ifneq ($(NOTIMER), 0)
OPT_FLAGS += -DNOTIMER=$(NOTIMER) # no timing, times reported as nan
endif
COPYAMATRIX = 1
ifneq ($(COPYAMATRIX), 0)
OPT_FLAGS += -DCOPYAMATRIX=$(COPYAMATRIX) # if normalize, copy A
endif
GPU_TRANSPOSE_MAT = 1
ifneq ($(GPU_TRANSPOSE_MAT), 0)
OPT_FLAGS += -DGPU_TRANSPOSE_MAT=$(GPU_TRANSPOSE_MAT) # transpose A mat in GPU memory
endif

### VERBOSITY LEVELS: 0,1,2
EXTRA_VERBOSE = 0
ifneq ($(EXTRA_VERBOSE), 0)
OPT_FLAGS += -DEXTRA_VERBOSE=$(EXTRA_VERBOSE) # extra verbosity level
endif

############ OPENMP: ############
# set USE_OPENMP = 1 to allow openmp (multi-threaded matrix multiplies):
# set the number of threads to, for example, 4 by entering the command:
# export OMP_NUM_THREADS=4

USE_OPENMP = 0
ifneq ($(USE_OPENMP), 0)
override CFLAGS += -fopenmp
LDFLAGS += -lgomp
endif

############ SDPS: BLAS + LAPACK ############
# set USE_LAPACK = 1 below to enable solving SDPs
# NB: point the libraries to the locations where
# you have blas and lapack installed

USE_LAPACK = 1
ifneq ($(USE_LAPACK), 0)
# edit these for your setup:
BLASLDFLAGS = -lblas -llapack #-lgfortran
LDFLAGS += $(BLASLDFLAGS)
OPT_FLAGS += -DUSE_LAPACK

BLAS64 = 0
ifneq ($(BLAS64), 0)
OPT_FLAGS += -DBLAS64=$(BLAS64) # if blas/lapack lib uses 64 bit ints
endif

NOBLASSUFFIX = 0
ifneq ($(NOBLASSUFFIX), 0)
OPT_FLAGS += -DNOBLASSUFFIX=$(NOBLASSUFFIX) # hack to strip blas suffix
endif

BLASSUFFIX = "_"
ifneq ($(BLASSUFFIX), "_")
OPT_FLAGS += -DBLASSUFFIX=$(BLASSUFFIX) # blas suffix (underscore usually)
endif
endif

MATLAB_MEX_FILE = 0
ifneq ($(MATLAB_MEX_FILE), 0)
OPT_FLAGS += -DMATLAB_MEX_FILE=$(MATLAB_MEX_FILE) # matlab mex
endif
PYTHON = 0
ifneq ($(PYTHON), 0)
OPT_FLAGS += -DPYTHON=$(PYTHON) # python extension
endif
USING_R = 0
ifneq ($(USING_R), 0)
OPT_FLAGS += -DUSING_R=$(USING_R) # R extension
endif

# debug to see var values, e.g. 'make print-OBJECTS' shows OBJECTS value
print-%: ; @echo $*=$($*)

override CFLAGS += $(OPT_FLAGS)
CUDAFLAGS += $(OPT_FLAGS)
|