scs 0.2.2 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE.txt +18 -18
- data/README.md +19 -14
- data/lib/scs/ffi.rb +31 -20
- data/lib/scs/solver.rb +32 -9
- data/lib/scs/version.rb +1 -1
- data/vendor/scs/CITATION.cff +39 -0
- data/vendor/scs/CMakeLists.txt +320 -0
- data/vendor/scs/Makefile +32 -23
- data/vendor/scs/README.md +9 -218
- data/vendor/scs/include/aa.h +67 -23
- data/vendor/scs/include/cones.h +22 -19
- data/vendor/scs/include/glbopts.h +107 -79
- data/vendor/scs/include/linalg.h +3 -4
- data/vendor/scs/include/linsys.h +58 -44
- data/vendor/scs/include/normalize.h +6 -5
- data/vendor/scs/include/rw.h +8 -2
- data/vendor/scs/include/scs.h +257 -141
- data/vendor/scs/include/scs_types.h +34 -0
- data/vendor/scs/include/scs_work.h +83 -0
- data/vendor/scs/include/util.h +3 -15
- data/vendor/scs/linsys/cpu/direct/private.c +241 -232
- data/vendor/scs/linsys/cpu/direct/private.h +13 -7
- data/vendor/scs/linsys/cpu/indirect/private.c +194 -118
- data/vendor/scs/linsys/cpu/indirect/private.h +7 -4
- data/vendor/scs/linsys/csparse.c +87 -0
- data/vendor/scs/linsys/csparse.h +34 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +6 -6
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +6 -1
- data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
- data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
- data/vendor/scs/linsys/external/qdldl/changes +2 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
- data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
- data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
- data/vendor/scs/linsys/gpu/gpu.c +58 -21
- data/vendor/scs/linsys/gpu/gpu.h +70 -35
- data/vendor/scs/linsys/gpu/indirect/private.c +394 -157
- data/vendor/scs/linsys/gpu/indirect/private.h +27 -12
- data/vendor/scs/linsys/scs_matrix.c +478 -0
- data/vendor/scs/linsys/scs_matrix.h +70 -0
- data/vendor/scs/scs.mk +14 -10
- data/vendor/scs/src/aa.c +394 -110
- data/vendor/scs/src/cones.c +497 -359
- data/vendor/scs/src/ctrlc.c +15 -5
- data/vendor/scs/src/linalg.c +107 -26
- data/vendor/scs/src/normalize.c +30 -72
- data/vendor/scs/src/rw.c +202 -27
- data/vendor/scs/src/scs.c +769 -571
- data/vendor/scs/src/scs_version.c +11 -3
- data/vendor/scs/src/util.c +37 -106
- data/vendor/scs/test/minunit.h +22 -8
- data/vendor/scs/test/problem_utils.h +180 -25
- data/vendor/scs/test/problems/degenerate.h +130 -0
- data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
- data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
- data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
- data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
- data/vendor/scs/test/problems/random_prob +0 -0
- data/vendor/scs/test/problems/random_prob.h +45 -0
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
- data/vendor/scs/test/problems/small_lp.h +14 -13
- data/vendor/scs/test/problems/small_qp.h +352 -0
- data/vendor/scs/test/problems/test_validation.h +43 -0
- data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
- data/vendor/scs/test/random_socp_prob.c +54 -53
- data/vendor/scs/test/rng.h +109 -0
- data/vendor/scs/test/run_from_file.c +20 -11
- data/vendor/scs/test/run_tests.c +35 -2
- metadata +29 -98
- data/vendor/scs/linsys/amatrix.c +0 -305
- data/vendor/scs/linsys/amatrix.h +0 -36
- data/vendor/scs/linsys/amatrix.o +0 -0
- data/vendor/scs/linsys/cpu/direct/private.o +0 -0
- data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
- data/vendor/scs/src/aa.o +0 -0
- data/vendor/scs/src/cones.o +0 -0
- data/vendor/scs/src/ctrlc.o +0 -0
- data/vendor/scs/src/linalg.o +0 -0
- data/vendor/scs/src/normalize.o +0 -0
- data/vendor/scs/src/rw.o +0 -0
- data/vendor/scs/src/scs.o +0 -0
- data/vendor/scs/src/scs_version.o +0 -0
- data/vendor/scs/src/util.o +0 -0
- data/vendor/scs/test/data/small_random_socp +0 -0
- data/vendor/scs/test/problems/small_random_socp.h +0 -33
- data/vendor/scs/test/run_tests +0 -2
@@ -1,61 +1,115 @@
|
|
1
1
|
#include "private.h"
|
2
|
+
#include "linsys.h"
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
/* do not use within pcg, reuses memory */
|
7
|
-
void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
|
8
|
-
const scs_float *x, scs_float *y) {
|
9
|
-
scs_float *v_m = p->tmp_m;
|
10
|
-
scs_float *v_n = p->r;
|
11
|
-
cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
12
|
-
cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
13
|
-
SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
|
14
|
-
cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
15
|
-
}
|
4
|
+
/* norm to use when deciding convergence */
|
5
|
+
/* should be consistent with CG_NORM in glbopts.h */
|
6
|
+
#define USE_L2_NORM (0)
|
16
7
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
scs_float
|
21
|
-
|
22
|
-
cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
23
|
-
cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
24
|
-
#if GPU_TRANSPOSE_MAT > 0
|
25
|
-
SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
|
8
|
+
static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
|
9
|
+
scs_int n) {
|
10
|
+
#if USE_L2_NORM > 0
|
11
|
+
scs_float nrm;
|
12
|
+
CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
|
26
13
|
#else
|
27
|
-
|
14
|
+
scs_int idx;
|
15
|
+
scs_float nrm;
|
16
|
+
CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
|
17
|
+
/* NOTE: we take idx -1 here since the routine above returns Fortran idxs */
|
18
|
+
cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
|
19
|
+
nrm = ABS(nrm);
|
28
20
|
#endif
|
29
|
-
|
21
|
+
return nrm;
|
30
22
|
}
|
31
23
|
|
32
|
-
char *SCS(get_lin_sys_method)(
|
33
|
-
|
34
|
-
sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
|
35
|
-
(long)A->p[A->n], stgs->cg_rate);
|
36
|
-
return str;
|
24
|
+
const char *SCS(get_lin_sys_method)() {
|
25
|
+
return "sparse-indirect GPU";
|
37
26
|
}
|
38
27
|
|
28
|
+
/*
|
39
29
|
char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
|
40
30
|
char *str = (char *)scs_malloc(sizeof(char) * 128);
|
41
|
-
sprintf(str,
|
42
|
-
|
43
|
-
(scs_float)p->tot_cg_its / (info->iter + 1),
|
44
|
-
p->total_solve_time / (info->iter + 1) / 1e3);
|
31
|
+
sprintf(str, "lin-sys: avg cg its: %2.2f\n",
|
32
|
+
(scs_float)p->tot_cg_its / (info->iter + 1));
|
45
33
|
p->tot_cg_its = 0;
|
46
|
-
p->total_solve_time = 0;
|
47
34
|
return str;
|
48
35
|
}
|
36
|
+
*/
|
37
|
+
|
38
|
+
/* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
|
39
|
+
/* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
|
40
|
+
static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
|
41
|
+
scs_int i, k;
|
42
|
+
const ScsMatrix *A = p->A;
|
43
|
+
const ScsMatrix *P = p->P;
|
44
|
+
scs_float *M = p->M;
|
45
|
+
|
46
|
+
#if VERBOSITY > 0
|
47
|
+
scs_printf("getting pre-conditioner\n");
|
48
|
+
#endif
|
49
|
+
|
50
|
+
/* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
|
51
|
+
for (i = 0; i < A->n; ++i) { /* cols */
|
52
|
+
/* M_ii = (R_x)_i */
|
53
|
+
M[i] = diag_r[i];
|
54
|
+
/* M_ii += a_i' (R_y)^-1 a_i */
|
55
|
+
for (k = A->p[i]; k < A->p[i + 1]; ++k) {
|
56
|
+
/* A->i[k] is row of entry k with value A->x[k] */
|
57
|
+
M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
|
58
|
+
}
|
59
|
+
if (P) {
|
60
|
+
for (k = P->p[i]; k < P->p[i + 1]; k++) {
|
61
|
+
/* diagonal element only */
|
62
|
+
if (P->i[k] == i) { /* row == col */
|
63
|
+
/* M_ii += P_ii */
|
64
|
+
M[i] += P->x[k];
|
65
|
+
break;
|
66
|
+
}
|
67
|
+
}
|
68
|
+
}
|
69
|
+
/* finally invert for pre-conditioner */
|
70
|
+
M[i] = 1. / M[i];
|
71
|
+
}
|
72
|
+
cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
73
|
+
#if VERBOSITY > 0
|
74
|
+
scs_printf("finished getting pre-conditioner\n");
|
75
|
+
#endif
|
76
|
+
}
|
77
|
+
|
78
|
+
/* no need to update anything in this case */
|
79
|
+
void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
|
80
|
+
scs_int i;
|
81
|
+
|
82
|
+
/* R_x to gpu */
|
83
|
+
cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
|
84
|
+
cudaMemcpyHostToDevice);
|
85
|
+
|
86
|
+
/* 1/R_y to gpu */
|
87
|
+
for (i = 0; i < p->m; ++i)
|
88
|
+
p->inv_r_y[i] = 1. / diag_r[p->n + i];
|
89
|
+
cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
|
90
|
+
cudaMemcpyHostToDevice);
|
91
|
+
|
92
|
+
/* set preconditioner M on gpu */
|
93
|
+
set_preconditioner(p, diag_r);
|
94
|
+
}
|
49
95
|
|
50
96
|
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
51
97
|
if (p) {
|
98
|
+
scs_free(p->M);
|
99
|
+
scs_free(p->inv_r_y);
|
52
100
|
cudaFree(p->p);
|
53
101
|
cudaFree(p->r);
|
54
102
|
cudaFree(p->Gp);
|
55
103
|
cudaFree(p->bg);
|
56
104
|
cudaFree(p->tmp_m);
|
57
105
|
cudaFree(p->z);
|
58
|
-
cudaFree(p->
|
106
|
+
cudaFree(p->M_gpu);
|
107
|
+
cudaFree(p->r_x_gpu);
|
108
|
+
cudaFree(p->inv_r_y_gpu);
|
109
|
+
if (p->Pg) {
|
110
|
+
SCS(free_gpu_matrix)(p->Pg);
|
111
|
+
scs_free(p->Pg);
|
112
|
+
}
|
59
113
|
if (p->Ag) {
|
60
114
|
SCS(free_gpu_matrix)(p->Ag);
|
61
115
|
scs_free(p->Ag);
|
@@ -64,6 +118,12 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
64
118
|
SCS(free_gpu_matrix)(p->Agt);
|
65
119
|
scs_free(p->Agt);
|
66
120
|
}
|
121
|
+
if (p->buffer != SCS_NULL) {
|
122
|
+
cudaFree(p->buffer);
|
123
|
+
}
|
124
|
+
cusparseDestroyDnVec(p->dn_vec_m);
|
125
|
+
cusparseDestroyDnVec(p->dn_vec_n);
|
126
|
+
cusparseDestroyDnVec(p->dn_vec_n_p);
|
67
127
|
cusparseDestroy(p->cusparse_handle);
|
68
128
|
cublasDestroy(p->cublas_handle);
|
69
129
|
/* Don't reset because it interferes with other GPU programs. */
|
@@ -72,53 +132,127 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
72
132
|
}
|
73
133
|
}
|
74
134
|
|
75
|
-
/*
|
76
|
-
static void
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
|
82
|
-
cudaMemset(y, 0, A->n * sizeof(scs_float));
|
83
|
-
SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
|
84
|
-
CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
|
135
|
+
/* z = M * z elementwise in place, assumes M, z on GPU */
|
136
|
+
static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
|
137
|
+
scs_float *z, scs_int n) {
|
138
|
+
CUBLAS(tbmv)
|
139
|
+
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
|
140
|
+
0, M, 1, z, 1);
|
85
141
|
}
|
86
142
|
|
87
|
-
/*
|
88
|
-
static void
|
89
|
-
|
90
|
-
|
91
|
-
|
143
|
+
/* y = (R_x + P + A' R_y^{-1} A) x */
|
144
|
+
static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
145
|
+
/* x and y MUST already be loaded to GPU */
|
146
|
+
scs_float *z = p->tmp_m; /* temp memory */
|
147
|
+
cudaMemset(z, 0, p->m * sizeof(scs_float));
|
148
|
+
|
149
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
|
150
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
|
151
|
+
cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
|
152
|
+
|
153
|
+
/* y = x */
|
154
|
+
cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
155
|
+
/* y = R_x * x */
|
156
|
+
scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
|
157
|
+
|
158
|
+
if (p->Pg) {
|
159
|
+
/* y = R_x * x + P x */
|
160
|
+
SCS(accum_by_p_gpu)
|
161
|
+
(p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
162
|
+
&p->buffer);
|
163
|
+
}
|
92
164
|
|
93
|
-
|
94
|
-
|
165
|
+
/* z = Ax */
|
166
|
+
#if GPU_TRANSPOSE_MAT > 0
|
167
|
+
SCS(accum_by_atrans_gpu)
|
168
|
+
(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
169
|
+
&p->buffer);
|
170
|
+
#else
|
171
|
+
SCS(accum_by_a_gpu)
|
172
|
+
(p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
173
|
+
&p->buffer);
|
95
174
|
#endif
|
175
|
+
/* z = R_y^{-1} A x */
|
176
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);
|
96
177
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
103
|
-
scs_free(M);
|
178
|
+
/* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
|
179
|
+
SCS(accum_by_atrans_gpu)
|
180
|
+
(p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
181
|
+
&p->buffer);
|
182
|
+
}
|
104
183
|
|
105
|
-
|
106
|
-
|
107
|
-
|
184
|
+
/* P comes in upper triangular, expand to full
|
185
|
+
* First compute triplet version of full matrix, then compress to csc
|
186
|
+
* */
|
187
|
+
static csc *fill_p_matrix(const ScsMatrix *P) {
|
188
|
+
scs_int i, j, k, kk;
|
189
|
+
scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
|
190
|
+
csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
|
191
|
+
csc *P_full;
|
192
|
+
kk = 0;
|
193
|
+
for (j = 0; j < P->n; j++) { /* cols */
|
194
|
+
for (k = P->p[j]; k < P->p[j + 1]; k++) {
|
195
|
+
i = P->i[k]; /* row */
|
196
|
+
if (i > j) { /* only upper triangular needed */
|
197
|
+
break;
|
198
|
+
}
|
199
|
+
P_tmp->i[kk] = i;
|
200
|
+
P_tmp->p[kk] = j;
|
201
|
+
P_tmp->x[kk] = P->x[k];
|
202
|
+
kk++;
|
203
|
+
if (i == j) { /* diagonal */
|
204
|
+
continue;
|
205
|
+
}
|
206
|
+
P_tmp->i[kk] = j;
|
207
|
+
P_tmp->p[kk] = i;
|
208
|
+
P_tmp->x[kk] = P->x[k];
|
209
|
+
kk++;
|
210
|
+
}
|
211
|
+
}
|
212
|
+
P_tmp->nz = kk; /* set number of nonzeros */
|
213
|
+
P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
|
214
|
+
SCS(cs_spfree)(P_tmp);
|
215
|
+
return P_full;
|
108
216
|
}
|
109
217
|
|
110
|
-
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
111
|
-
const
|
218
|
+
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
219
|
+
const scs_float *diag_r) {
|
112
220
|
cudaError_t err;
|
113
|
-
|
114
|
-
|
221
|
+
csc *P_full;
|
222
|
+
ScsLinSysWork *p = SCS_NULL;
|
223
|
+
ScsGpuMatrix *Ag = SCS_NULL;
|
224
|
+
ScsGpuMatrix *Pg = SCS_NULL;
|
225
|
+
int device_count;
|
226
|
+
|
227
|
+
err = cudaGetDeviceCount(&device_count);
|
228
|
+
if (err > 0) {
|
229
|
+
scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
|
230
|
+
return SCS_NULL;
|
231
|
+
}
|
232
|
+
|
233
|
+
p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
234
|
+
Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
235
|
+
|
236
|
+
p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
|
237
|
+
p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
|
238
|
+
|
239
|
+
p->A = A;
|
240
|
+
p->P = P;
|
241
|
+
p->m = A->m;
|
242
|
+
p->n = A->n;
|
243
|
+
|
244
|
+
#if GPU_TRANSPOSE_MAT > 0
|
245
|
+
size_t new_buffer_size = 0;
|
246
|
+
#endif
|
115
247
|
|
116
248
|
p->cublas_handle = 0;
|
117
249
|
p->cusparse_handle = 0;
|
118
250
|
|
119
|
-
p->total_solve_time = 0;
|
120
251
|
p->tot_cg_its = 0;
|
121
252
|
|
253
|
+
p->buffer_size = 0;
|
254
|
+
p->buffer = SCS_NULL;
|
255
|
+
|
122
256
|
/* Get handle to the CUBLAS context */
|
123
257
|
cublasCreate(&p->cublas_handle);
|
124
258
|
|
@@ -127,15 +261,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
127
261
|
|
128
262
|
Ag->n = A->n;
|
129
263
|
Ag->m = A->m;
|
130
|
-
Ag->
|
264
|
+
Ag->nnz = A->p[A->n];
|
131
265
|
Ag->descr = 0;
|
132
|
-
/* Matrix description */
|
133
|
-
cusparseCreateMatDescr(&Ag->descr);
|
134
|
-
cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
135
|
-
cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
|
136
|
-
p->Ag = Ag;
|
137
|
-
p->Agt = SCS_NULL;
|
138
|
-
|
139
266
|
cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
|
140
267
|
cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
|
141
268
|
cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
|
@@ -144,10 +271,11 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
144
271
|
cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
|
145
272
|
cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
|
146
273
|
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
|
147
|
-
cudaMalloc((void **)&p->tmp_m,
|
148
|
-
A->m * sizeof(scs_float)); /* intermediate result */
|
274
|
+
cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
|
149
275
|
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
|
150
|
-
cudaMalloc((void **)&p->
|
276
|
+
cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
|
277
|
+
cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
|
278
|
+
cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));
|
151
279
|
|
152
280
|
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
|
153
281
|
cudaMemcpyHostToDevice);
|
@@ -155,32 +283,89 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
155
283
|
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
|
156
284
|
cudaMemcpyHostToDevice);
|
157
285
|
|
158
|
-
|
286
|
+
cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
|
287
|
+
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
288
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
289
|
+
|
290
|
+
if (P) {
|
291
|
+
Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
292
|
+
P_full = fill_p_matrix(P);
|
293
|
+
Pg->n = P_full->n;
|
294
|
+
Pg->m = P_full->m;
|
295
|
+
Pg->nnz = P_full->p[P_full->n];
|
296
|
+
Pg->descr = 0;
|
297
|
+
cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
|
298
|
+
cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
|
299
|
+
cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
|
300
|
+
|
301
|
+
cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
|
302
|
+
cudaMemcpyHostToDevice);
|
303
|
+
cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
|
304
|
+
cudaMemcpyHostToDevice);
|
305
|
+
cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
|
306
|
+
cudaMemcpyHostToDevice);
|
307
|
+
|
308
|
+
cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
|
309
|
+
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
310
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
311
|
+
|
312
|
+
SCS(cs_spfree)(P_full);
|
313
|
+
} else {
|
314
|
+
Pg = SCS_NULL;
|
315
|
+
}
|
316
|
+
|
317
|
+
p->Ag = Ag;
|
318
|
+
p->Pg = Pg;
|
319
|
+
p->Agt = SCS_NULL;
|
320
|
+
|
321
|
+
/* we initialize with tmp_m but always overwrite it so it doesn't matter */
|
322
|
+
cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
323
|
+
cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
324
|
+
cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
|
325
|
+
|
326
|
+
/* Form preconditioner and copy R_x, 1/R_y to gpu */
|
327
|
+
SCS(update_lin_sys_diag_r)(p, diag_r);
|
159
328
|
|
160
329
|
#if GPU_TRANSPOSE_MAT > 0
|
161
330
|
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
162
331
|
p->Agt->n = A->m;
|
163
332
|
p->Agt->m = A->n;
|
164
|
-
p->Agt->
|
333
|
+
p->Agt->nnz = A->p[A->n];
|
165
334
|
p->Agt->descr = 0;
|
166
335
|
/* Matrix description */
|
167
|
-
cusparseCreateMatDescr(&p->Agt->descr);
|
168
|
-
cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
169
|
-
cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
|
170
336
|
|
171
337
|
cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
|
172
338
|
cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
|
173
339
|
cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
|
174
340
|
/* transpose Ag into Agt for faster multiplies */
|
175
341
|
/* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
|
176
|
-
|
177
|
-
|
178
|
-
|
342
|
+
cusparseCsr2cscEx2_bufferSize(
|
343
|
+
p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
|
344
|
+
p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
|
345
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);
|
346
|
+
|
347
|
+
if (new_buffer_size > p->buffer_size) {
|
348
|
+
if (p->buffer != SCS_NULL) {
|
349
|
+
cudaFree(p->buffer);
|
350
|
+
}
|
351
|
+
cudaMalloc(&p->buffer, new_buffer_size);
|
352
|
+
p->buffer_size = new_buffer_size;
|
353
|
+
}
|
354
|
+
|
355
|
+
cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
|
356
|
+
Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
|
357
|
+
CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
|
358
|
+
SCS_CSR2CSC_ALG, p->buffer);
|
359
|
+
|
360
|
+
cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
|
361
|
+
p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
|
362
|
+
SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
|
363
|
+
SCS_CUDA_FLOAT);
|
179
364
|
#endif
|
180
365
|
|
181
366
|
err = cudaGetLastError();
|
182
367
|
if (err != cudaSuccess) {
|
183
|
-
printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
|
368
|
+
printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
|
184
369
|
cudaGetErrorString(err));
|
185
370
|
SCS(free_lin_sys_work)(p);
|
186
371
|
return SCS_NULL;
|
@@ -188,117 +373,169 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
188
373
|
return p;
|
189
374
|
}
|
190
375
|
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
CUBLAS(tbmv)
|
195
|
-
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
|
196
|
-
0, M, 1, z, 1);
|
197
|
-
}
|
198
|
-
|
199
|
-
/* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
|
200
|
-
static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
|
201
|
-
ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
376
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
|
377
|
+
* b, on GPU */
|
378
|
+
static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
202
379
|
scs_int max_its, scs_float tol) {
|
203
|
-
scs_int i, n =
|
204
|
-
scs_float
|
380
|
+
scs_int i, n = pr->n;
|
381
|
+
scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
|
205
382
|
scs_float onef = 1.0, neg_onef = -1.0;
|
206
383
|
scs_float *p = pr->p; /* cg direction */
|
207
384
|
scs_float *Gp = pr->Gp; /* updated CG direction */
|
208
385
|
scs_float *r = pr->r; /* cg residual */
|
209
386
|
scs_float *z = pr->z; /* preconditioned */
|
210
|
-
scs_float *M = pr->M; /* preconditioner */
|
211
387
|
cublasHandle_t cublas_handle = pr->cublas_handle;
|
212
388
|
|
213
|
-
if (s
|
389
|
+
if (!s) {
|
390
|
+
/* take s = 0 */
|
391
|
+
/* r = b */
|
214
392
|
cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
393
|
+
/* b = 0 */
|
215
394
|
cudaMemset(bg, 0, n * sizeof(scs_float));
|
216
395
|
} else {
|
217
396
|
/* p contains bg temporarily */
|
218
397
|
cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
219
|
-
/* bg
|
398
|
+
/* bg = s */
|
220
399
|
cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
221
|
-
|
400
|
+
/* r = Mat * s */
|
401
|
+
mat_vec(pr, bg, r);
|
402
|
+
/* r = Mat * s - b */
|
222
403
|
CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
|
404
|
+
/* r = b - Mat * s */
|
223
405
|
CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
|
224
406
|
}
|
225
407
|
|
226
|
-
/* for some reason nrm2 is VERY slow */
|
227
|
-
/* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
|
228
|
-
CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
|
229
|
-
nrm_r = SQRTF(nrm_r);
|
230
408
|
/* check to see if we need to run CG at all */
|
231
|
-
if (
|
409
|
+
if (cg_gpu_norm(cublas_handle, r, n) < tol) {
|
232
410
|
return 0;
|
233
411
|
}
|
234
412
|
|
235
|
-
|
236
|
-
|
237
|
-
|
413
|
+
/* z = M r */
|
414
|
+
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
415
|
+
scale_by_diag(cublas_handle, pr->M_gpu, z, n);
|
416
|
+
/* ztr = z'r */
|
417
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
418
|
+
/* p = z */
|
238
419
|
cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
239
420
|
|
240
421
|
for (i = 0; i < max_its; ++i) {
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
alpha =
|
422
|
+
/* Gp = Mat * p */
|
423
|
+
mat_vec(pr, p, Gp);
|
424
|
+
/* ptGp = p'Gp */
|
425
|
+
CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
|
426
|
+
/* alpha = z'r / p'G p */
|
427
|
+
alpha = ztr / ptGp;
|
246
428
|
neg_alpha = -alpha;
|
247
|
-
|
429
|
+
/* b += alpha * p */
|
248
430
|
CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
|
431
|
+
/* r -= alpha * G p */
|
249
432
|
CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
|
250
433
|
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
if (nrm_r < tol) {
|
256
|
-
i++;
|
257
|
-
break;
|
258
|
-
}
|
259
|
-
ipzr_old = ipzr;
|
260
|
-
apply_pre_conditioner(cublas_handle, M, z, r, n);
|
261
|
-
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
|
434
|
+
#if VERBOSITY > 3
|
435
|
+
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
|
436
|
+
cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
|
437
|
+
#endif
|
262
438
|
|
263
|
-
|
439
|
+
if (cg_gpu_norm(cublas_handle, r, n) < tol) {
|
440
|
+
return i + 1;
|
441
|
+
}
|
442
|
+
/* z = M r */
|
443
|
+
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
444
|
+
scale_by_diag(cublas_handle, pr->M_gpu, z, n);
|
445
|
+
ztr_prev = ztr;
|
446
|
+
/* ztr = z'r */
|
447
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
448
|
+
beta = ztr / ztr_prev;
|
449
|
+
/* p = beta * p, where beta = ztr / ztr_prev */
|
264
450
|
CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
|
451
|
+
/* p = z + beta * p */
|
265
452
|
CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
|
266
453
|
}
|
267
|
-
#if EXTRA_VERBOSE > 0
|
268
|
-
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
|
269
|
-
#endif
|
270
454
|
return i;
|
271
455
|
}
|
272
456
|
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
457
|
+
/* solves Mx = b, for x but stores result in b */
|
458
|
+
/* s contains warm-start (if available) */
|
459
|
+
/*
|
460
|
+
* [x] = [R_x + P A' ]^{-1} [rx]
|
461
|
+
* [y] [ A -R_y ] [ry]
|
462
|
+
*
|
463
|
+
* becomes:
|
464
|
+
*
|
465
|
+
* x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
|
466
|
+
* y = R_y^{-1} (Ax - ry)
|
467
|
+
*
|
468
|
+
*/
|
469
|
+
scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
470
|
+
scs_float tol) {
|
471
|
+
scs_int cg_its, max_iters;
|
279
472
|
scs_float neg_onef = -1.0;
|
473
|
+
|
474
|
+
/* these are on GPU */
|
475
|
+
scs_float *bg = p->bg;
|
476
|
+
scs_float *tmp_m = p->tmp_m;
|
280
477
|
ScsGpuMatrix *Ag = p->Ag;
|
281
|
-
scs_float cg_tol =
|
282
|
-
SCS(norm)(b, Ag->n) *
|
283
|
-
(iter < 0 ? CG_BEST_TOL
|
284
|
-
: CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
|
285
|
-
SCS(tic)(&linsys_timer);
|
286
|
-
/* all on GPU */
|
287
|
-
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
|
288
|
-
SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
|
289
|
-
/* solves (I+A'A)x = b, s warm start, solution stored in b */
|
290
|
-
cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
|
291
|
-
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
|
292
|
-
SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
|
293
|
-
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
294
478
|
|
295
|
-
if (
|
296
|
-
p->
|
479
|
+
if (CG_NORM(b, p->n + p->m) <= 1e-12) {
|
480
|
+
memset(b, 0, (p->n + p->m) * sizeof(scs_float));
|
481
|
+
return 0;
|
482
|
+
}
|
483
|
+
|
484
|
+
if (tol <= 0.) {
|
485
|
+
scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
|
486
|
+
"INDIRECT flag.\n",
|
487
|
+
tol);
|
297
488
|
}
|
298
489
|
|
299
|
-
|
300
|
-
|
301
|
-
|
490
|
+
/* bg = b = [rx; ry] */
|
491
|
+
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
|
492
|
+
cudaMemcpyHostToDevice);
|
493
|
+
/* tmp = ry */
|
494
|
+
cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
|
495
|
+
cudaMemcpyDeviceToDevice);
|
496
|
+
/* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
|
497
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
|
498
|
+
|
499
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
|
500
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
|
501
|
+
/* bg[:n] = rx + A' R ry */
|
502
|
+
SCS(accum_by_atrans_gpu)
|
503
|
+
(Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
|
504
|
+
&p->buffer);
|
505
|
+
|
506
|
+
/* set max_iters to 10 * n (though in theory n is enough for any tol) */
|
507
|
+
max_iters = 10 * Ag->n;
|
508
|
+
|
509
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
|
510
|
+
* in bg */
|
511
|
+
cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
|
512
|
+
|
513
|
+
/* bg[n:] = -ry */
|
514
|
+
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
|
515
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
|
516
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* x */
|
517
|
+
|
518
|
+
/* b[n:] = Ax - ry */
|
519
|
+
#if GPU_TRANSPOSE_MAT > 0
|
520
|
+
SCS(accum_by_atrans_gpu)
|
521
|
+
(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
522
|
+
&p->buffer);
|
523
|
+
#else
|
524
|
+
SCS(accum_by_a_gpu)
|
525
|
+
(Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
526
|
+
&p->buffer);
|
527
|
+
#endif
|
528
|
+
|
529
|
+
/* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
|
530
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
|
531
|
+
|
532
|
+
/* copy bg = [x; y] back to b */
|
533
|
+
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
|
534
|
+
cudaMemcpyDeviceToHost);
|
535
|
+
p->tot_cg_its += cg_its;
|
536
|
+
#if VERBOSITY > 1
|
537
|
+
scs_printf("tol %.3e\n", tol);
|
538
|
+
scs_printf("cg_its %i\n", (int)cg_its);
|
302
539
|
#endif
|
303
540
|
return 0;
|
304
541
|
}
|