scs 0.2.0
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +84 -0
- data/ext/scs/Rakefile +11 -0
- data/lib/scs/ffi.rb +117 -0
- data/lib/scs/solver.rb +178 -0
- data/lib/scs/version.rb +3 -0
- data/lib/scs.rb +17 -0
- data/vendor/scs/LICENSE.txt +21 -0
- data/vendor/scs/Makefile +164 -0
- data/vendor/scs/README.md +220 -0
- data/vendor/scs/include/aa.h +56 -0
- data/vendor/scs/include/cones.h +46 -0
- data/vendor/scs/include/ctrlc.h +33 -0
- data/vendor/scs/include/glbopts.h +177 -0
- data/vendor/scs/include/linalg.h +26 -0
- data/vendor/scs/include/linsys.h +64 -0
- data/vendor/scs/include/normalize.h +18 -0
- data/vendor/scs/include/rw.h +17 -0
- data/vendor/scs/include/scs.h +161 -0
- data/vendor/scs/include/scs_blas.h +51 -0
- data/vendor/scs/include/util.h +65 -0
- data/vendor/scs/linsys/amatrix.c +305 -0
- data/vendor/scs/linsys/amatrix.h +36 -0
- data/vendor/scs/linsys/amatrix.o +0 -0
- data/vendor/scs/linsys/cpu/direct/private.c +366 -0
- data/vendor/scs/linsys/cpu/direct/private.h +26 -0
- data/vendor/scs/linsys/cpu/direct/private.o +0 -0
- data/vendor/scs/linsys/cpu/indirect/private.c +256 -0
- data/vendor/scs/linsys/cpu/indirect/private.h +31 -0
- data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
- data/vendor/scs/linsys/external/amd/LICENSE.txt +934 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +469 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +254 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
- data/vendor/scs/linsys/external/amd/amd.h +400 -0
- data/vendor/scs/linsys/external/amd/amd_1.c +180 -0
- data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_2.c +1842 -0
- data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_aat.c +184 -0
- data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_control.c +64 -0
- data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.c +37 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_dump.c +179 -0
- data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_global.c +16 -0
- data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_info.c +119 -0
- data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_internal.h +304 -0
- data/vendor/scs/linsys/external/amd/amd_order.c +199 -0
- data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.c +120 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.c +206 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.c +118 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_valid.c +92 -0
- data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
- data/vendor/scs/linsys/external/amd/changes +11 -0
- data/vendor/scs/linsys/external/qdldl/LICENSE +201 -0
- data/vendor/scs/linsys/external/qdldl/README.md +120 -0
- data/vendor/scs/linsys/external/qdldl/changes +4 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.c +298 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.h +177 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
- data/vendor/scs/linsys/external/qdldl/qdldl_types.h +21 -0
- data/vendor/scs/linsys/gpu/gpu.c +41 -0
- data/vendor/scs/linsys/gpu/gpu.h +85 -0
- data/vendor/scs/linsys/gpu/indirect/private.c +304 -0
- data/vendor/scs/linsys/gpu/indirect/private.h +36 -0
- data/vendor/scs/scs.mk +181 -0
- data/vendor/scs/src/aa.c +224 -0
- data/vendor/scs/src/aa.o +0 -0
- data/vendor/scs/src/cones.c +802 -0
- data/vendor/scs/src/cones.o +0 -0
- data/vendor/scs/src/ctrlc.c +77 -0
- data/vendor/scs/src/ctrlc.o +0 -0
- data/vendor/scs/src/linalg.c +84 -0
- data/vendor/scs/src/linalg.o +0 -0
- data/vendor/scs/src/normalize.c +93 -0
- data/vendor/scs/src/normalize.o +0 -0
- data/vendor/scs/src/rw.c +167 -0
- data/vendor/scs/src/rw.o +0 -0
- data/vendor/scs/src/scs.c +975 -0
- data/vendor/scs/src/scs.o +0 -0
- data/vendor/scs/src/scs_version.c +5 -0
- data/vendor/scs/src/scs_version.o +0 -0
- data/vendor/scs/src/util.c +196 -0
- data/vendor/scs/src/util.o +0 -0
- data/vendor/scs/test/data/small_random_socp +0 -0
- data/vendor/scs/test/minunit.h +13 -0
- data/vendor/scs/test/problem_utils.h +93 -0
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +85 -0
- data/vendor/scs/test/problems/small_lp.h +50 -0
- data/vendor/scs/test/problems/small_random_socp.h +33 -0
- data/vendor/scs/test/random_socp_prob.c +171 -0
- data/vendor/scs/test/run_from_file.c +69 -0
- data/vendor/scs/test/run_tests +2 -0
- data/vendor/scs/test/run_tests.c +32 -0
- metadata +203 -0
data/vendor/scs/linsys/gpu/gpu.h
ADDED
@@ -0,0 +1,85 @@
#ifndef SCSGPU_H_GUARD
#define SCSGPU_H_GUARD

#ifdef __cplusplus
extern "C" {
#endif

#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cusparse.h>

#include "amatrix.h"
#include "glbopts.h"
#include "linalg.h"
#include "linsys.h"
#include "scs.h"
#include "util.h"

#define CUDA_CHECK_ERR                                                      \
  do {                                                                      \
    cudaError_t err = cudaGetLastError();                                   \
    if (err != cudaSuccess) {                                               \
      printf("%s:%d:%s\n ERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,   \
             cudaGetErrorString(err));                                      \
    }                                                                       \
  } while (0)

#ifndef EXTRA_VERBOSE
#ifndef SFLOAT
#define CUBLAS(x) cublasD##x
#define CUSPARSE(x) cusparseD##x
#else
#define CUBLAS(x) cublasS##x
#define CUSPARSE(x) cusparseS##x
#endif
#else
#ifndef SFLOAT
#define CUBLAS(x)  \
  CUDA_CHECK_ERR;  \
  cublasD##x
#define CUSPARSE(x) \
  CUDA_CHECK_ERR;   \
  cusparseD##x
#else
#define CUBLAS(x)  \
  CUDA_CHECK_ERR;  \
  cublasS##x
#define CUSPARSE(x) \
  CUDA_CHECK_ERR;   \
  cusparseS##x
#endif
#endif

/*
  CUDA matrix routines only for CSR, not CSC matrices:
    CSC         CSR         GPU   Mult
    A (m x n)   A' (n x m)  Ag    accum_by_a_trans_gpu
    A'(n x m)   A (m x n)   Agt   accum_by_a_gpu
*/

/* this struct defines the data matrix A on GPU */
typedef struct SCS_GPU_A_DATA_MATRIX {
  /* A is supplied in column compressed format */
  scs_float *x; /* A values, size: NNZ A */
  scs_int *i;   /* A row index, size: NNZ A */
  scs_int *p;   /* A column pointer, size: n+1 */
  scs_int m, n; /* m rows, n cols */
  scs_int Annz; /* num non-zeros in A matrix */
  /* CUDA */
  cusparseMatDescr_t descr;
} ScsGpuMatrix;

void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *A, const scs_float *x,
                               scs_float *y, cusparseHandle_t cusparse_handle);

void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *A, const scs_float *x,
                          scs_float *y, cusparseHandle_t cusparse_handle);

void SCS(free_gpu_matrix)(ScsGpuMatrix *A);

#ifdef __cplusplus
}
#endif
#endif
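The comment block in this header records the trick the GPU backend relies on: the cuSPARSE routines used here expect CSR storage, but the arrays (x, i, p) that SCS supplies describe A in CSC format, and the same three arrays, read as CSR, describe the transpose A'. The sketch below illustrates that duality on a tiny made-up 2x3 matrix (the matrix, values, and loop are invented for this example and are not part of the gem); multiplying with the arrays interpreted as CSR therefore accumulates A' * v, which is what the accum_by_atrans path computes.

/* illustration only: CSC arrays of A double as CSR arrays of A' */
#include <stdio.h>

int main(void) {
  /* A = [1 0 2; 0 3 4] stored column-compressed (CSC) */
  double x[] = {1, 3, 2, 4}; /* nonzero values            */
  int    i[] = {0, 1, 0, 1}; /* row index of each nonzero */
  int    p[] = {0, 1, 2, 4}; /* column pointers, size n+1 */
  int m = 2, n = 3;

  double v[] = {1, 1};    /* length m */
  double y[] = {0, 0, 0}; /* length n, accumulates A' * v */

  /* treat (x, i, p) as CSR of the n x m matrix A': row r of A' is column r of A */
  for (int r = 0; r < n; ++r)
    for (int k = p[r]; k < p[r + 1]; ++k)
      y[r] += x[k] * v[i[k]];

  (void)m;
  printf("A'v = [%g %g %g]\n", y[0], y[1], y[2]); /* expect [1 3 6] */
  return 0;
}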
data/vendor/scs/linsys/gpu/indirect/private.c
ADDED
@@ -0,0 +1,304 @@
#include "private.h"

#define CG_BEST_TOL 1e-9
#define CG_MIN_TOL 1e-1

/* do not use within pcg, reuses memory */
void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
                          const scs_float *x, scs_float *y) {
  scs_float *v_m = p->tmp_m;
  scs_float *v_n = p->r;
  cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
  cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
  SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
  cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
}

/* do not use within pcg, reuses memory */
void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
                     scs_float *y) {
  scs_float *v_m = p->tmp_m;
  scs_float *v_n = p->r;
  cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
  cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
#if GPU_TRANSPOSE_MAT > 0
  SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
#else
  SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
#endif
  cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
}

char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
  char *str = (char *)scs_malloc(sizeof(char) * 128);
  sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
          (long)A->p[A->n], stgs->cg_rate);
  return str;
}

char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
  char *str = (char *)scs_malloc(sizeof(char) * 128);
  sprintf(str,
          "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
          (scs_float)p->tot_cg_its / (info->iter + 1),
          p->total_solve_time / (info->iter + 1) / 1e3);
  p->tot_cg_its = 0;
  p->total_solve_time = 0;
  return str;
}

void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
  if (p) {
    cudaFree(p->p);
    cudaFree(p->r);
    cudaFree(p->Gp);
    cudaFree(p->bg);
    cudaFree(p->tmp_m);
    cudaFree(p->z);
    cudaFree(p->M);
    if (p->Ag) {
      SCS(free_gpu_matrix)(p->Ag);
      scs_free(p->Ag);
    }
    if (p->Agt) {
      SCS(free_gpu_matrix)(p->Agt);
      scs_free(p->Agt);
    }
    cusparseDestroy(p->cusparse_handle);
    cublasDestroy(p->cublas_handle);
    /* Don't reset because it interferes with other GPU programs. */
    /* cudaDeviceReset(); */
    scs_free(p);
  }
}

/* y = (RHO_X * I + A'A) x */
static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
                    ScsLinSysWork *p, const scs_float *x, scs_float *y) {
  /* x and y MUST already be loaded to GPU */
  scs_float *tmp_m = p->tmp_m; /* temp memory */
  cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
  SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
  cudaMemset(y, 0, A->n * sizeof(scs_float));
  SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
  CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
}

/* M = inv ( diag ( RHO_X * I + A'A ) ) */
static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
                               ScsLinSysWork *p) {
  scs_int i;
  scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));

#if EXTRA_VERBOSE > 0
  scs_printf("getting pre-conditioner\n");
#endif

  for (i = 0; i < A->n; ++i) {
    M[i] = 1 / (stgs->rho_x +
                SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
    /* M[i] = 1; */
  }
  cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
  scs_free(M);

#if EXTRA_VERBOSE > 0
  scs_printf("finished getting pre-conditioner\n");
#endif
}

ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
                                      const ScsSettings *stgs) {
  cudaError_t err;
  ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
  ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));

  p->cublas_handle = 0;
  p->cusparse_handle = 0;

  p->total_solve_time = 0;
  p->tot_cg_its = 0;

  /* Get handle to the CUBLAS context */
  cublasCreate(&p->cublas_handle);

  /* Get handle to the CUSPARSE context */
  cusparseCreate(&p->cusparse_handle);

  Ag->n = A->n;
  Ag->m = A->m;
  Ag->Annz = A->p[A->n];
  Ag->descr = 0;
  /* Matrix description */
  cusparseCreateMatDescr(&Ag->descr);
  cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
  p->Ag = Ag;
  p->Agt = SCS_NULL;

  cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
  cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
  cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));

  cudaMalloc((void **)&p->p, A->n * sizeof(scs_float));
  cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
  cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
  cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
  cudaMalloc((void **)&p->tmp_m,
             A->m * sizeof(scs_float)); /* intermediate result */
  cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
  cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));

  cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
             cudaMemcpyHostToDevice);
  cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
  cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
             cudaMemcpyHostToDevice);

  get_preconditioner(A, stgs, p);

#if GPU_TRANSPOSE_MAT > 0
  p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
  p->Agt->n = A->m;
  p->Agt->m = A->n;
  p->Agt->Annz = A->p[A->n];
  p->Agt->descr = 0;
  /* Matrix description */
  cusparseCreateMatDescr(&p->Agt->descr);
  cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);

  cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
  cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
  cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
  /* transpose Ag into Agt for faster multiplies */
  /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
  CUSPARSE(csr2csc)
  (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
   p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
#endif

  err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
           cudaGetErrorString(err));
    SCS(free_lin_sys_work)(p);
    return SCS_NULL;
  }
  return p;
}

static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
                                  scs_float *z, scs_float *r, scs_int n) {
  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
  CUBLAS(tbmv)
  (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
   0, M, 1, z, 1);
}

/* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
                   ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
                   scs_int max_its, scs_float tol) {
  scs_int i, n = A->n;
  scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
  scs_float onef = 1.0, neg_onef = -1.0;
  scs_float *p = pr->p;   /* cg direction */
  scs_float *Gp = pr->Gp; /* updated CG direction */
  scs_float *r = pr->r;   /* cg residual */
  scs_float *z = pr->z;   /* preconditioned */
  scs_float *M = pr->M;   /* preconditioner */
  cublasHandle_t cublas_handle = pr->cublas_handle;

  if (s == SCS_NULL) {
    cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
    cudaMemset(bg, 0, n * sizeof(scs_float));
  } else {
    /* p contains bg temporarily */
    cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
    /* bg contains s */
    cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
    mat_vec(A, stgs, pr, bg, r);
    CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
    CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
  }

  /* for some reason nrm2 is VERY slow */
  /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
  CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
  nrm_r = SQRTF(nrm_r);
  /* check to see if we need to run CG at all */
  if (nrm_r < MIN(tol, 1e-18)) {
    return 0;
  }

  apply_pre_conditioner(cublas_handle, M, z, r, n);
  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
  /* put z in p, replacing temp mem */
  cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);

  for (i = 0; i < max_its; ++i) {
    mat_vec(A, stgs, pr, p, Gp);

    CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);

    alpha = ipzr / p_gp;
    neg_alpha = -alpha;

    CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
    CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);

    /* for some reason nrm2 is VERY slow */
    /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
    CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
    nrm_r = SQRTF(nrm_r);
    if (nrm_r < tol) {
      i++;
      break;
    }
    ipzr_old = ipzr;
    apply_pre_conditioner(cublas_handle, M, z, r, n);
    CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);

    beta = ipzr / ipzr_old;
    CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
    CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
  }
#if EXTRA_VERBOSE > 0
  scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
#endif
  return i;
}

scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
                           ScsLinSysWork *p, scs_float *b, const scs_float *s,
                           scs_int iter) {
  scs_int cg_its;
  SCS(timer) linsys_timer;
  scs_float *bg = p->bg;
  scs_float neg_onef = -1.0;
  ScsGpuMatrix *Ag = p->Ag;
  scs_float cg_tol =
      SCS(norm)(b, Ag->n) *
      (iter < 0 ? CG_BEST_TOL
                : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
  SCS(tic)(&linsys_timer);
  /* all on GPU */
  cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
  SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
  /* solves (I+A'A)x = b, s warm start, solution stored in b */
  cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
  CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
  SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
  cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);

  if (iter >= 0) {
    p->tot_cg_its += cg_its;
  }

  p->total_solve_time += SCS(tocq)(&linsys_timer);
#if EXTRA_VERBOSE > 0
  scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
#endif
  return 0;
}
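The pcg routine above is a standard preconditioned conjugate gradient applied to G = RHO_X * I + A'A, using the diagonal preconditioner built in get_preconditioner; all vectors stay on the GPU and the BLAS-1 operations (dot, axpy, scal) are delegated to cuBLAS, with apply_pre_conditioner performing the element-wise scaling by M via a banded tbmv with zero super-diagonals. For reference, here is a minimal CPU-only sketch of the same recurrences (alpha = (z,r)/(p,Gp), beta = (z_new,r_new)/(z,r)) on a small dense SPD system; the 3x3 matrix and right-hand side are made up for illustration and none of this code is part of the gem:

/* illustration only: Jacobi-preconditioned CG on a tiny dense SPD system */
#include <math.h>
#include <stdio.h>

#define N 3

static void mv(const double G[N][N], const double *x, double *y) {
  for (int i = 0; i < N; ++i) {
    y[i] = 0.0;
    for (int j = 0; j < N; ++j) y[i] += G[i][j] * x[j];
  }
}

static double dot(const double *a, const double *b) {
  double s = 0.0;
  for (int i = 0; i < N; ++i) s += a[i] * b[i];
  return s;
}

int main(void) {
  double G[N][N] = {{4, 1, 0}, {1, 3, 1}, {0, 1, 2}}; /* SPD */
  double b[N] = {1, 2, 3};
  double x[N] = {0, 0, 0}; /* start from zero, so r = b */
  double r[N] = {1, 2, 3};
  double M[N], z[N], p[N], Gp[N];
  for (int i = 0; i < N; ++i) M[i] = 1.0 / G[i][i]; /* diagonal preconditioner */
  for (int i = 0; i < N; ++i) p[i] = z[i] = M[i] * r[i];
  double ipzr = dot(z, r);
  for (int k = 0; k < 100; ++k) {
    mv(G, p, Gp);
    double alpha = ipzr / dot(p, Gp);       /* step length */
    for (int i = 0; i < N; ++i) x[i] += alpha * p[i];
    for (int i = 0; i < N; ++i) r[i] -= alpha * Gp[i];
    if (sqrt(dot(r, r)) < 1e-10) break;     /* residual small enough */
    for (int i = 0; i < N; ++i) z[i] = M[i] * r[i];
    double ipzr_new = dot(z, r);
    double beta = ipzr_new / ipzr;          /* direction update */
    ipzr = ipzr_new;
    for (int i = 0; i < N; ++i) p[i] = z[i] + beta * p[i];
  }
  printf("x = %f %f %f\n", x[0], x[1], x[2]);
  return 0;
}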
data/vendor/scs/linsys/gpu/indirect/private.h
ADDED
@@ -0,0 +1,36 @@
#ifndef PRIV_H_GUARD
#define PRIV_H_GUARD

#ifdef __cplusplus
extern "C" {
#endif

#include "gpu.h"
#include "glbopts.h"
#include "linalg.h"
#include "scs.h"


struct SCS_LIN_SYS_WORK {
  /* reporting */
  scs_int tot_cg_its;
  scs_float total_solve_time;
  /* ALL BELOW HOSTED ON THE GPU */
  scs_float *p;      /* cg iterate, n */
  scs_float *r;      /* cg residual, n */
  scs_float *Gp;     /* G * p, n */
  scs_float *bg;     /* b, n */
  scs_float *tmp_m;  /* m, used in mat_vec */
  scs_float *z;      /* preconditioned */
  scs_float *M;      /* preconditioner */
  ScsGpuMatrix *Ag;  /* A matrix on GPU */
  ScsGpuMatrix *Agt; /* A trans matrix on GPU */
  /* CUDA */
  cublasHandle_t cublas_handle;
  cusparseHandle_t cusparse_handle;
};

#ifdef __cplusplus
}
#endif
#endif
data/vendor/scs/scs.mk
ADDED
@@ -0,0 +1,181 @@
ifeq ($(OS),Windows_NT)
UNAME = CYGWINorMINGWorMSYS
else
UNAME = $(shell uname -s)
endif

#CC = gcc
# For cross-compiling with mingw use these.
#CC = i686-w64-mingw32-gcc -m32
#CC = x86_64-w64-mingw32-gcc-4.8
CUCC = $(CC) #Don't need to use nvcc, since using cuda blas APIs

# For GPU must add cuda libs to path, e.g.
# export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH

ifneq (, $(findstring CYGWIN, $(UNAME)))
ISWINDOWS := 1
else
ifneq (, $(findstring MINGW, $(UNAME)))
ISWINDOWS := 1
else
ifneq (, $(findstring MSYS, $(UNAME)))
ISWINDOWS := 1
else
ifneq (, $(findstring mingw, $(CC)))
ISWINDOWS := 1
else
ISWINDOWS := 0
endif
endif
endif
endif

ifeq ($(UNAME), Darwin)
# we're on apple, no need to link rt library
LDFLAGS += -lm
SHARED = dylib
SONAME = -install_name
else
ifeq ($(ISWINDOWS), 1)
# we're on windows (cygwin or msys)
LDFLAGS += -lm
SHARED = dll
SONAME = -soname
else
# we're on a linux system, use accurate timer provided by clock_gettime()
LDFLAGS += -lm -lrt
SHARED = so
SONAME = -soname
endif
endif

#TODO: check if this works for all platforms:
ifeq ($(CUDA_PATH), )
CUDA_PATH=/usr/local/cuda
endif
CULDFLAGS = -L$(CUDA_PATH)/lib -L$(CUDA_PATH)/lib64 -lcudart -lcublas -lcusparse
CUDAFLAGS = $(CFLAGS) -I$(CUDA_PATH)/include -Ilinsys/gpu -Wno-c++11-long-long # turn off annoying long-long warnings in cuda header files

# Add on default CFLAGS
OPT = -O3
override CFLAGS += -g -Wall -Wwrite-strings -pedantic -funroll-loops -Wstrict-prototypes -I. -Iinclude -Ilinsys $(OPT)
ifneq ($(ISWINDOWS), 1)
override CFLAGS += -fPIC
endif

LINSYS = linsys
DIRSRC = $(LINSYS)/cpu/direct
INDIRSRC = $(LINSYS)/cpu/indirect
GPUDIR = $(LINSYS)/gpu/direct
GPUINDIR = $(LINSYS)/gpu/indirect

EXTSRC = $(LINSYS)/external

OUT = out
AR = ar
ARFLAGS = rv
ARCHIVE = $(AR) $(ARFLAGS)
RANLIB = ranlib
INSTALL = install

ifeq ($(PREFIX),)
PREFIX = /usr/local
endif

OPT_FLAGS =
########### OPTIONAL FLAGS ##########
# these can all be overridden from the command line
# e.g. make DLONG=1 will override the setting below
DLONG = 0
ifneq ($(DLONG), 0)
OPT_FLAGS += -DDLONG=$(DLONG) # use longs rather than ints
endif
CTRLC = 1
ifneq ($(CTRLC), 0)
OPT_FLAGS += -DCTRLC=$(CTRLC) # graceful interrupts with ctrl-c
endif
SFLOAT = 0
ifneq ($(SFLOAT), 0)
OPT_FLAGS += -DSFLOAT=$(SFLOAT) # use floats rather than doubles
endif
NOVALIDATE = 0
ifneq ($(NOVALIDATE), 0)
OPT_FLAGS += -DNOVALIDATE=$(NOVALIDATE) # remove data validation step
endif
NOTIMER = 0
ifneq ($(NOTIMER), 0)
OPT_FLAGS += -DNOTIMER=$(NOTIMER) # no timing, times reported as nan
endif
COPYAMATRIX = 1
ifneq ($(COPYAMATRIX), 0)
OPT_FLAGS += -DCOPYAMATRIX=$(COPYAMATRIX) # if normalize, copy A
endif
GPU_TRANSPOSE_MAT = 1
ifneq ($(GPU_TRANSPOSE_MAT), 0)
OPT_FLAGS += -DGPU_TRANSPOSE_MAT=$(GPU_TRANSPOSE_MAT) # transpose A mat in GPU memory
endif

### VERBOSITY LEVELS: 0,1,2
EXTRA_VERBOSE = 0
ifneq ($(EXTRA_VERBOSE), 0)
OPT_FLAGS += -DEXTRA_VERBOSE=$(EXTRA_VERBOSE) # extra verbosity level
endif

############ OPENMP: ############
# set USE_OPENMP = 1 to allow openmp (multi-threaded matrix multiplies):
# set the number of threads to, for example, 4 by entering the command:
# export OMP_NUM_THREADS=4

USE_OPENMP = 0
ifneq ($(USE_OPENMP), 0)
override CFLAGS += -fopenmp
LDFLAGS += -lgomp
endif

############ SDPS: BLAS + LAPACK ############
# set USE_LAPACK = 1 below to enable solving SDPs
# NB: point the libraries to the locations where
# you have blas and lapack installed

USE_LAPACK = 1
ifneq ($(USE_LAPACK), 0)
# edit these for your setup:
BLASLDFLAGS = -lblas -llapack #-lgfortran
LDFLAGS += $(BLASLDFLAGS)
OPT_FLAGS += -DUSE_LAPACK

BLAS64 = 0
ifneq ($(BLAS64), 0)
OPT_FLAGS += -DBLAS64=$(BLAS64) # if blas/lapack lib uses 64 bit ints
endif

NOBLASSUFFIX = 0
ifneq ($(NOBLASSUFFIX), 0)
OPT_FLAGS += -DNOBLASSUFFIX=$(NOBLASSUFFIX) # hack to strip blas suffix
endif

BLASSUFFIX = "_"
ifneq ($(BLASSUFFIX), "_")
OPT_FLAGS += -DBLASSUFFIX=$(BLASSUFFIX) # blas suffix (underscore usually)
endif
endif

MATLAB_MEX_FILE = 0
ifneq ($(MATLAB_MEX_FILE), 0)
OPT_FLAGS += -DMATLAB_MEX_FILE=$(MATLAB_MEX_FILE) # matlab mex
endif
PYTHON = 0
ifneq ($(PYTHON), 0)
OPT_FLAGS += -DPYTHON=$(PYTHON) # python extension
endif
USING_R = 0
ifneq ($(USING_R), 0)
OPT_FLAGS += -DUSING_R=$(USING_R) # R extension
endif

# debug to see var values, e.g. 'make print-OBJECTS' shows OBJECTS value
print-%: ; @echo $*=$($*)

override CFLAGS += $(OPT_FLAGS)
CUDAFLAGS += $(OPT_FLAGS)