scs 0.2.3 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +11 -6
- data/lib/scs/ffi.rb +30 -13
- data/lib/scs/solver.rb +32 -9
- data/lib/scs/version.rb +1 -1
- data/vendor/scs/CITATION.cff +39 -0
- data/vendor/scs/CMakeLists.txt +7 -8
- data/vendor/scs/Makefile +24 -15
- data/vendor/scs/README.md +5 -263
- data/vendor/scs/include/aa.h +67 -23
- data/vendor/scs/include/cones.h +17 -17
- data/vendor/scs/include/glbopts.h +98 -32
- data/vendor/scs/include/linalg.h +2 -4
- data/vendor/scs/include/linsys.h +58 -44
- data/vendor/scs/include/normalize.h +3 -3
- data/vendor/scs/include/rw.h +8 -2
- data/vendor/scs/include/scs.h +293 -133
- data/vendor/scs/include/util.h +3 -15
- data/vendor/scs/linsys/cpu/direct/private.c +220 -224
- data/vendor/scs/linsys/cpu/direct/private.h +13 -7
- data/vendor/scs/linsys/cpu/direct/private.o +0 -0
- data/vendor/scs/linsys/cpu/indirect/private.c +177 -110
- data/vendor/scs/linsys/cpu/indirect/private.h +8 -4
- data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
- data/vendor/scs/linsys/csparse.c +87 -0
- data/vendor/scs/linsys/csparse.h +34 -0
- data/vendor/scs/linsys/csparse.o +0 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +1 -1
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
- data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
- data/vendor/scs/linsys/external/qdldl/changes +2 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
- data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
- data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
- data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
- data/vendor/scs/linsys/gpu/gpu.c +31 -33
- data/vendor/scs/linsys/gpu/gpu.h +48 -31
- data/vendor/scs/linsys/gpu/indirect/private.c +338 -232
- data/vendor/scs/linsys/gpu/indirect/private.h +23 -14
- data/vendor/scs/linsys/scs_matrix.c +498 -0
- data/vendor/scs/linsys/scs_matrix.h +70 -0
- data/vendor/scs/linsys/scs_matrix.o +0 -0
- data/vendor/scs/scs.mk +13 -9
- data/vendor/scs/src/aa.c +384 -109
- data/vendor/scs/src/aa.o +0 -0
- data/vendor/scs/src/cones.c +440 -353
- data/vendor/scs/src/cones.o +0 -0
- data/vendor/scs/src/ctrlc.c +15 -5
- data/vendor/scs/src/ctrlc.o +0 -0
- data/vendor/scs/src/linalg.c +84 -28
- data/vendor/scs/src/linalg.o +0 -0
- data/vendor/scs/src/normalize.c +22 -64
- data/vendor/scs/src/normalize.o +0 -0
- data/vendor/scs/src/rw.c +160 -21
- data/vendor/scs/src/rw.o +0 -0
- data/vendor/scs/src/scs.c +767 -563
- data/vendor/scs/src/scs.o +0 -0
- data/vendor/scs/src/scs_indir.o +0 -0
- data/vendor/scs/src/scs_version.c +9 -3
- data/vendor/scs/src/scs_version.o +0 -0
- data/vendor/scs/src/util.c +37 -106
- data/vendor/scs/src/util.o +0 -0
- data/vendor/scs/test/minunit.h +17 -8
- data/vendor/scs/test/problem_utils.h +176 -14
- data/vendor/scs/test/problems/degenerate.h +130 -0
- data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
- data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
- data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
- data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
- data/vendor/scs/test/problems/random_prob +0 -0
- data/vendor/scs/test/problems/random_prob.h +45 -0
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
- data/vendor/scs/test/problems/small_lp.h +13 -14
- data/vendor/scs/test/problems/test_fails.h +43 -0
- data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
- data/vendor/scs/test/random_socp_prob.c +54 -53
- data/vendor/scs/test/rng.h +109 -0
- data/vendor/scs/test/run_from_file.c +19 -10
- data/vendor/scs/test/run_tests.c +27 -3
- metadata +20 -8
- data/vendor/scs/linsys/amatrix.c +0 -305
- data/vendor/scs/linsys/amatrix.h +0 -36
- data/vendor/scs/linsys/amatrix.o +0 -0
- data/vendor/scs/test/data/small_random_socp +0 -0
- data/vendor/scs/test/problems/small_random_socp.h +0 -33
- data/vendor/scs/test/run_tests +0 -2
@@ -1,71 +1,89 @@
|
|
1
1
|
#include "private.h"
|
2
|
+
#include "linsys.h"
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
/* do not use within pcg, reuses memory */
|
7
|
-
void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
|
8
|
-
const scs_float *x, scs_float *y) {
|
9
|
-
scs_float *v_m = p->tmp_m;
|
10
|
-
scs_float *v_n = p->r;
|
11
|
-
cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
12
|
-
cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
13
|
-
|
14
|
-
cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
|
15
|
-
cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
|
16
|
-
SCS(_accum_by_atrans_gpu)(
|
17
|
-
p->Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
|
18
|
-
&p->buffer_size, &p->buffer
|
19
|
-
);
|
20
|
-
|
21
|
-
cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
22
|
-
}
|
23
|
-
|
24
|
-
/* do not use within pcg, reuses memory */
|
25
|
-
void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
|
26
|
-
scs_float *y) {
|
27
|
-
scs_float *v_m = p->tmp_m;
|
28
|
-
scs_float *v_n = p->r;
|
29
|
-
cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
30
|
-
cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
4
|
+
/* norm to use when deciding convergence */
|
5
|
+
/* should be consistent with CG_NORM in glbopts.h */
|
6
|
+
#define USE_L2_NORM (0)
|
31
7
|
|
32
|
-
|
33
|
-
|
34
|
-
#if
|
35
|
-
|
36
|
-
|
37
|
-
&p->buffer_size, &p->buffer
|
38
|
-
);
|
8
|
+
static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
|
9
|
+
scs_int n) {
|
10
|
+
#if USE_L2_NORM > 0
|
11
|
+
scs_float nrm;
|
12
|
+
CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
|
39
13
|
#else
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
14
|
+
scs_int idx;
|
15
|
+
scs_float nrm;
|
16
|
+
CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
|
17
|
+
/* NOTE: we take idx -1 here since the routine above returns Fortran idxs */
|
18
|
+
cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
|
19
|
+
nrm = ABS(nrm);
|
44
20
|
#endif
|
45
|
-
|
46
|
-
cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
21
|
+
return nrm;
|
47
22
|
}
|
48
23
|
|
49
|
-
char *SCS(get_lin_sys_method)(
|
50
|
-
|
51
|
-
sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
|
52
|
-
(long)A->p[A->n], stgs->cg_rate);
|
53
|
-
return str;
|
24
|
+
const char *SCS(get_lin_sys_method)() {
|
25
|
+
return "sparse-indirect GPU";
|
54
26
|
}
|
55
27
|
|
28
|
+
/*
|
56
29
|
char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
|
57
30
|
char *str = (char *)scs_malloc(sizeof(char) * 128);
|
58
|
-
sprintf(str,
|
59
|
-
|
60
|
-
(scs_float)p->tot_cg_its / (info->iter + 1),
|
61
|
-
p->total_solve_time / (info->iter + 1) / 1e3);
|
31
|
+
sprintf(str, "lin-sys: avg cg its: %2.2f\n",
|
32
|
+
(scs_float)p->tot_cg_its / (info->iter + 1));
|
62
33
|
p->tot_cg_its = 0;
|
63
|
-
p->total_solve_time = 0;
|
64
34
|
return str;
|
65
35
|
}
|
36
|
+
*/
|
37
|
+
|
38
|
+
/* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
|
39
|
+
static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
|
40
|
+
scs_int i, k;
|
41
|
+
const ScsMatrix *A = p->A;
|
42
|
+
const ScsMatrix *P = p->P;
|
43
|
+
scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
|
44
|
+
|
45
|
+
#if VERBOSITY > 0
|
46
|
+
scs_printf("getting pre-conditioner\n");
|
47
|
+
#endif
|
48
|
+
|
49
|
+
for (i = 0; i < A->n; ++i) { /* cols */
|
50
|
+
M[i] = p->rho_x;
|
51
|
+
/* diag(A' R_y^{-1} A) */
|
52
|
+
for (k = A->p[i]; k < A->p[i + 1]; ++k) {
|
53
|
+
/* A->i[k] is row of entry k with value A->x[k] */
|
54
|
+
M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
|
55
|
+
}
|
56
|
+
if (P) {
|
57
|
+
for (k = P->p[i]; k < P->p[i + 1]; k++) {
|
58
|
+
/* diagonal element only */
|
59
|
+
if (P->i[k] == i) { /* row == col */
|
60
|
+
M[i] += P->x[k];
|
61
|
+
break;
|
62
|
+
}
|
63
|
+
}
|
64
|
+
}
|
65
|
+
M[i] = 1. / M[i];
|
66
|
+
}
|
67
|
+
cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
68
|
+
scs_free(M);
|
69
|
+
#if VERBOSITY > 0
|
70
|
+
scs_printf("finished getting pre-conditioner\n");
|
71
|
+
#endif
|
72
|
+
}
|
73
|
+
|
74
|
+
/* recompute R_y^{-1} (host and GPU copies) and rebuild the preconditioner */
|
75
|
+
void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
|
76
|
+
scs_int i;
|
77
|
+
for (i = 0; i < p->m; ++i)
|
78
|
+
p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
|
79
|
+
cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
|
80
|
+
cudaMemcpyHostToDevice);
|
81
|
+
set_preconditioner(p, rho_y_vec);
|
82
|
+
}
|
66
83
|
|
67
84
|
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
68
85
|
if (p) {
|
86
|
+
scs_free(p->inv_rho_y_vec);
|
69
87
|
cudaFree(p->p);
|
70
88
|
cudaFree(p->r);
|
71
89
|
cudaFree(p->Gp);
|
@@ -73,6 +91,11 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
73
91
|
cudaFree(p->tmp_m);
|
74
92
|
cudaFree(p->z);
|
75
93
|
cudaFree(p->M);
|
94
|
+
cudaFree(p->inv_rho_y_vec_gpu);
|
95
|
+
if (p->Pg) {
|
96
|
+
SCS(free_gpu_matrix)(p->Pg);
|
97
|
+
scs_free(p->Pg);
|
98
|
+
}
|
76
99
|
if (p->Ag) {
|
77
100
|
SCS(free_gpu_matrix)(p->Ag);
|
78
101
|
scs_free(p->Ag);
|
@@ -86,6 +109,7 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
86
109
|
}
|
87
110
|
cusparseDestroyDnVec(p->dn_vec_m);
|
88
111
|
cusparseDestroyDnVec(p->dn_vec_n);
|
112
|
+
cusparseDestroyDnVec(p->dn_vec_n_p);
|
89
113
|
cusparseDestroy(p->cusparse_handle);
|
90
114
|
cublasDestroy(p->cublas_handle);
|
91
115
|
/* Don't reset because it interferes with other GPU programs. */
|
@@ -94,80 +118,105 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
94
118
|
}
|
95
119
|
}
|
96
120
|
|
97
|
-
/*
|
98
|
-
static void
|
99
|
-
|
121
|
+
/* z = M * z elementwise in place, assumes M, z on GPU */
|
122
|
+
static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
|
123
|
+
scs_float *z, scs_int n) {
|
124
|
+
CUBLAS(tbmv)
|
125
|
+
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
|
126
|
+
0, M, 1, z, 1);
|
127
|
+
}
|
128
|
+
|
129
|
+
/* y = (rho_x * I + P + A' R_y^{-1} A) x */
|
130
|
+
static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
100
131
|
/* x and y MUST already be loaded to GPU */
|
101
|
-
scs_float *
|
102
|
-
cudaMemset(
|
132
|
+
scs_float *z = p->tmp_m; /* temp memory */
|
133
|
+
cudaMemset(y, 0, p->n * sizeof(scs_float));
|
134
|
+
cudaMemset(z, 0, p->m * sizeof(scs_float));
|
135
|
+
|
136
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
|
137
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
|
138
|
+
cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
|
139
|
+
|
140
|
+
/* y = rho_x * x */
|
141
|
+
CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
|
142
|
+
|
143
|
+
if (p->Pg) {
|
144
|
+
/* y = rho_x * x + Px */
|
145
|
+
SCS(accum_by_p_gpu)
|
146
|
+
(p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
147
|
+
&p->buffer);
|
148
|
+
}
|
103
149
|
|
104
|
-
|
105
|
-
cusparseDnVecSetValues(p->dn_vec_n, (void *) x);
|
150
|
+
/* z = Ax */
|
106
151
|
#if GPU_TRANSPOSE_MAT > 0
|
107
|
-
SCS(
|
108
|
-
|
109
|
-
|
110
|
-
);
|
152
|
+
SCS(accum_by_atrans_gpu)
|
153
|
+
(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
154
|
+
&p->buffer);
|
111
155
|
#else
|
112
|
-
SCS(
|
113
|
-
|
114
|
-
|
115
|
-
);
|
156
|
+
SCS(accum_by_a_gpu)
|
157
|
+
(p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
158
|
+
&p->buffer);
|
116
159
|
#endif
|
160
|
+
/* z = R_y^{-1} A x */
|
161
|
+
scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
|
117
162
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
SCS(_accum_by_atrans_gpu)(
|
123
|
-
A, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
|
124
|
-
&p->buffer_size, &p->buffer
|
125
|
-
);
|
126
|
-
|
127
|
-
CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
|
163
|
+
/* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
|
164
|
+
SCS(accum_by_atrans_gpu)
|
165
|
+
(p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
166
|
+
&p->buffer);
|
128
167
|
}
|
129
168
|
|
130
|
-
/*
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
169
|
+
/* P comes in upper triangular, expand to full
|
170
|
+
* First compute triplet version of full matrix, then compress to csc
|
171
|
+
* */
|
172
|
+
static csc *fill_p_matrix(const ScsMatrix *P) {
|
173
|
+
scs_int i, j, k, kk;
|
174
|
+
scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
|
175
|
+
csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
|
176
|
+
csc *P_full;
|
177
|
+
kk = 0;
|
178
|
+
for (j = 0; j < P->n; j++) { /* cols */
|
179
|
+
for (k = P->p[j]; k < P->p[j + 1]; k++) {
|
180
|
+
i = P->i[k]; /* row */
|
181
|
+
if (i > j) { /* only upper triangular needed */
|
182
|
+
break;
|
183
|
+
}
|
184
|
+
P_tmp->i[kk] = i;
|
185
|
+
P_tmp->p[kk] = j;
|
186
|
+
P_tmp->x[kk] = P->x[k];
|
187
|
+
kk++;
|
188
|
+
if (i == j) { /* diagonal */
|
189
|
+
continue;
|
190
|
+
}
|
191
|
+
P_tmp->i[kk] = j;
|
192
|
+
P_tmp->p[kk] = i;
|
193
|
+
P_tmp->x[kk] = P->x[k];
|
194
|
+
kk++;
|
195
|
+
}
|
144
196
|
}
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
scs_printf("finished getting pre-conditioner\n");
|
150
|
-
#endif
|
197
|
+
P_tmp->nz = kk; /* set number of nonzeros */
|
198
|
+
P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
|
199
|
+
SCS(cs_spfree)(P_tmp);
|
200
|
+
return P_full;
|
151
201
|
}
|
152
202
|
|
153
|
-
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
154
|
-
|
203
|
+
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
204
|
+
scs_float *rho_y_vec, scs_float rho_x) {
|
155
205
|
cudaError_t err;
|
206
|
+
scs_int i;
|
207
|
+
csc *P_full;
|
156
208
|
ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
157
|
-
ScsGpuMatrix *Ag = (ScsGpuMatrix *)
|
158
|
-
|
159
|
-
/* Used for initializing dense vectors */
|
160
|
-
scs_float *tmp_null_n = SCS_NULL;
|
161
|
-
scs_float *tmp_null_m = SCS_NULL;
|
209
|
+
ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
210
|
+
ScsGpuMatrix *Pg = SCS_NULL;
|
162
211
|
|
163
212
|
#if GPU_TRANSPOSE_MAT > 0
|
164
213
|
size_t new_buffer_size = 0;
|
165
214
|
#endif
|
166
215
|
|
216
|
+
p->rho_x = rho_x;
|
167
217
|
p->cublas_handle = 0;
|
168
218
|
p->cusparse_handle = 0;
|
169
219
|
|
170
|
-
p->total_solve_time = 0;
|
171
220
|
p->tot_cg_its = 0;
|
172
221
|
|
173
222
|
p->buffer_size = 0;
|
@@ -181,13 +230,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
181
230
|
|
182
231
|
Ag->n = A->n;
|
183
232
|
Ag->m = A->m;
|
184
|
-
Ag->
|
233
|
+
Ag->nnz = A->p[A->n];
|
185
234
|
Ag->descr = 0;
|
186
|
-
/* Matrix description */
|
187
|
-
|
188
|
-
p->Ag = Ag;
|
189
|
-
p->Agt = SCS_NULL;
|
190
|
-
|
191
235
|
cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
|
192
236
|
cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
|
193
237
|
cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
|
@@ -196,10 +240,10 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
196
240
|
cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
|
197
241
|
cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
|
198
242
|
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
|
199
|
-
cudaMalloc((void **)&p->tmp_m,
|
200
|
-
A->m * sizeof(scs_float)); /* intermediate result */
|
243
|
+
cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
|
201
244
|
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
|
202
245
|
cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
|
246
|
+
cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
|
203
247
|
|
204
248
|
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
|
205
249
|
cudaMemcpyHostToDevice);
|
@@ -207,25 +251,59 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
207
251
|
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
|
208
252
|
cudaMemcpyHostToDevice);
|
209
253
|
|
210
|
-
|
211
|
-
(
|
212
|
-
|
213
|
-
|
254
|
+
p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
|
255
|
+
for (i = 0; i < A->m; ++i)
|
256
|
+
p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
|
257
|
+
cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
|
258
|
+
cudaMemcpyHostToDevice);
|
214
259
|
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
260
|
+
cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
|
261
|
+
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
262
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
263
|
+
|
264
|
+
if (P) {
|
265
|
+
Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
266
|
+
P_full = fill_p_matrix(P);
|
267
|
+
Pg->n = P_full->n;
|
268
|
+
Pg->m = P_full->m;
|
269
|
+
Pg->nnz = P_full->p[P_full->n];
|
270
|
+
Pg->descr = 0;
|
271
|
+
cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
|
272
|
+
cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
|
273
|
+
cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
|
274
|
+
|
275
|
+
cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
|
276
|
+
cudaMemcpyHostToDevice);
|
277
|
+
cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
|
278
|
+
cudaMemcpyHostToDevice);
|
279
|
+
cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
|
280
|
+
cudaMemcpyHostToDevice);
|
281
|
+
|
282
|
+
cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
|
283
|
+
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
284
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
285
|
+
|
286
|
+
SCS(cs_spfree)(P_full);
|
287
|
+
} else {
|
288
|
+
Pg = SCS_NULL;
|
289
|
+
}
|
221
290
|
|
222
|
-
|
291
|
+
p->Ag = Ag;
|
292
|
+
p->Pg = Pg;
|
293
|
+
p->Agt = SCS_NULL;
|
294
|
+
|
295
|
+
/* we initialize with tmp_m but always overwrite it so it doesn't matter */
|
296
|
+
cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
297
|
+
cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
298
|
+
cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
|
299
|
+
|
300
|
+
set_preconditioner(p, rho_y_vec);
|
223
301
|
|
224
302
|
#if GPU_TRANSPOSE_MAT > 0
|
225
303
|
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
226
304
|
p->Agt->n = A->m;
|
227
305
|
p->Agt->m = A->n;
|
228
|
-
p->Agt->
|
306
|
+
p->Agt->nnz = A->p[A->n];
|
229
307
|
p->Agt->descr = 0;
|
230
308
|
/* Matrix description */
|
231
309
|
|
@@ -234,13 +312,10 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
234
312
|
cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
|
235
313
|
/* transpose Ag into Agt for faster multiplies */
|
236
314
|
/* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
|
237
|
-
cusparseCsr2cscEx2_bufferSize
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
|
242
|
-
CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
|
243
|
-
&new_buffer_size);
|
315
|
+
cusparseCsr2cscEx2_bufferSize(
|
316
|
+
p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
|
317
|
+
p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
|
318
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);
|
244
319
|
|
245
320
|
if (new_buffer_size > p->buffer_size) {
|
246
321
|
if (p->buffer != SCS_NULL) {
|
@@ -250,24 +325,20 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
250
325
|
p->buffer_size = new_buffer_size;
|
251
326
|
}
|
252
327
|
|
253
|
-
cusparseCsr2cscEx2
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->Annz,
|
263
|
-
p->Agt->p, p->Agt->i, p->Agt->x,
|
264
|
-
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
265
|
-
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
328
|
+
cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
|
329
|
+
Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
|
330
|
+
CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
|
331
|
+
SCS_CSR2CSC_ALG, p->buffer);
|
332
|
+
|
333
|
+
cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
|
334
|
+
p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
|
335
|
+
SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
|
336
|
+
SCS_CUDA_FLOAT);
|
266
337
|
#endif
|
267
338
|
|
268
339
|
err = cudaGetLastError();
|
269
340
|
if (err != cudaSuccess) {
|
270
|
-
printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
|
341
|
+
printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
|
271
342
|
cudaGetErrorString(err));
|
272
343
|
SCS(free_lin_sys_work)(p);
|
273
344
|
return SCS_NULL;
|
@@ -275,138 +346,173 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
275
346
|
return p;
|
276
347
|
}
|
277
348
|
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
|
283
|
-
0, M, 1, z, 1);
|
284
|
-
}
|
285
|
-
|
286
|
-
/* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
|
287
|
-
static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
|
288
|
-
ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
349
|
+
/* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
|
350
|
+
* b */
|
351
|
+
/* on GPU */
|
352
|
+
static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
289
353
|
scs_int max_its, scs_float tol) {
|
290
|
-
scs_int i, n =
|
291
|
-
scs_float
|
354
|
+
scs_int i, n = pr->n;
|
355
|
+
scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
|
292
356
|
scs_float onef = 1.0, neg_onef = -1.0;
|
293
357
|
scs_float *p = pr->p; /* cg direction */
|
294
358
|
scs_float *Gp = pr->Gp; /* updated CG direction */
|
295
359
|
scs_float *r = pr->r; /* cg residual */
|
296
360
|
scs_float *z = pr->z; /* preconditioned */
|
297
|
-
scs_float *M = pr->M; /* preconditioner */
|
298
361
|
cublasHandle_t cublas_handle = pr->cublas_handle;
|
299
362
|
|
300
|
-
if (s
|
363
|
+
if (!s) {
|
364
|
+
/* take s = 0 */
|
365
|
+
/* r = b */
|
301
366
|
cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
367
|
+
/* b = 0 */
|
302
368
|
cudaMemset(bg, 0, n * sizeof(scs_float));
|
303
369
|
} else {
|
304
370
|
/* p contains bg temporarily */
|
305
371
|
cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
306
|
-
/* bg
|
372
|
+
/* bg = s */
|
307
373
|
cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
308
|
-
|
374
|
+
/* r = Mat * s */
|
375
|
+
mat_vec(pr, bg, r);
|
376
|
+
/* r = Mat * s - b */
|
309
377
|
CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
|
378
|
+
/* r = b - Mat * s */
|
310
379
|
CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
|
311
380
|
}
|
312
381
|
|
313
|
-
/* for some reason nrm2 is VERY slow */
|
314
|
-
/* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
|
315
|
-
CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
|
316
|
-
nrm_r = SQRTF(nrm_r);
|
317
382
|
/* check to see if we need to run CG at all */
|
318
|
-
if (
|
383
|
+
if (cg_gpu_norm(cublas_handle, r, n) < tol) {
|
319
384
|
return 0;
|
320
385
|
}
|
321
386
|
|
322
|
-
|
323
|
-
|
324
|
-
|
387
|
+
/* z = M r */
|
388
|
+
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
389
|
+
scale_by_diag(cublas_handle, pr->M, z, n);
|
390
|
+
/* ztr = z'r */
|
391
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
392
|
+
/* p = z */
|
325
393
|
cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
326
394
|
|
327
395
|
for (i = 0; i < max_its; ++i) {
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
alpha =
|
396
|
+
/* Gp = Mat * p */
|
397
|
+
mat_vec(pr, p, Gp);
|
398
|
+
/* ptGp = p'Gp */
|
399
|
+
CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
|
400
|
+
/* alpha = z'r / p'G p */
|
401
|
+
alpha = ztr / ptGp;
|
333
402
|
neg_alpha = -alpha;
|
334
|
-
|
403
|
+
/* b += alpha * p */
|
335
404
|
CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
|
405
|
+
/* r -= alpha * G p */
|
336
406
|
CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
|
337
407
|
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
if (nrm_r < tol) {
|
343
|
-
i++;
|
344
|
-
break;
|
345
|
-
}
|
346
|
-
ipzr_old = ipzr;
|
347
|
-
apply_pre_conditioner(cublas_handle, M, z, r, n);
|
348
|
-
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
|
408
|
+
#if VERBOSITY > 3
|
409
|
+
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
|
410
|
+
cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
|
411
|
+
#endif
|
349
412
|
|
350
|
-
|
413
|
+
if (cg_gpu_norm(cublas_handle, r, n) < tol) {
|
414
|
+
return i + 1;
|
415
|
+
}
|
416
|
+
/* z = M r */
|
417
|
+
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
418
|
+
scale_by_diag(cublas_handle, pr->M, z, n);
|
419
|
+
ztr_prev = ztr;
|
420
|
+
/* ztr = z'r */
|
421
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
422
|
+
beta = ztr / ztr_prev;
|
423
|
+
/* p = beta * p, where beta = ztr / ztr_prev */
|
351
424
|
CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
|
425
|
+
/* p = z + beta * p */
|
352
426
|
CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
|
353
427
|
}
|
354
|
-
#if EXTRA_VERBOSE > 0
|
355
|
-
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
|
356
|
-
#endif
|
357
428
|
return i;
|
358
429
|
}
|
359
430
|
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
431
|
+
/* solves Mx = b, for x but stores result in b */
|
432
|
+
/* s contains warm-start (if available) */
|
433
|
+
/*
|
434
|
+
* [x] = [rho_x I + P A' ]^{-1} [rx]
|
435
|
+
* [y] [ A -R_y ] [ry]
|
436
|
+
*
|
437
|
+
* R_y = diag(rho_y_vec)
|
438
|
+
*
|
439
|
+
* becomes:
|
440
|
+
*
|
441
|
+
* x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
|
442
|
+
* y = R_y^{-1} (Ax - ry)
|
443
|
+
*
|
444
|
+
*/
|
445
|
+
scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
446
|
+
scs_float tol) {
|
447
|
+
scs_int cg_its, max_iters;
|
366
448
|
scs_float neg_onef = -1.0;
|
449
|
+
|
450
|
+
/* these are on GPU */
|
451
|
+
scs_float *bg = p->bg;
|
452
|
+
scs_float *tmp_m = p->tmp_m;
|
367
453
|
ScsGpuMatrix *Ag = p->Ag;
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
454
|
+
ScsGpuMatrix *Pg = p->Pg;
|
455
|
+
|
456
|
+
if (CG_NORM(b, p->n + p->m) <= 1e-12) {
|
457
|
+
memset(b, 0, (p->n + p->m) * sizeof(scs_float));
|
458
|
+
return 0;
|
459
|
+
}
|
460
|
+
|
461
|
+
if (tol <= 0.) {
|
462
|
+
scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
|
463
|
+
"INDIRECT flag.\n",
|
464
|
+
tol);
|
465
|
+
}
|
466
|
+
|
467
|
+
/* bg = b = [rx; ry] */
|
468
|
+
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
|
469
|
+
cudaMemcpyHostToDevice);
|
470
|
+
/* tmp = ry */
|
471
|
+
cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
|
472
|
+
cudaMemcpyDeviceToDevice);
|
473
|
+
/* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
|
474
|
+
scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
|
475
|
+
|
476
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
|
477
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
|
478
|
+
/* bg[:n] = rx + A' R_y^{-1} ry */
|
479
|
+
SCS(accum_by_atrans_gpu)
|
480
|
+
(Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
|
481
|
+
&p->buffer);
|
482
|
+
|
483
|
+
/* set max_iters to 10 * n (though in theory n is enough for any tol) */
|
484
|
+
max_iters = 10 * Ag->n;
|
485
|
+
|
486
|
+
/* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
|
487
|
+
* in bg */
|
488
|
+
cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
|
489
|
+
|
490
|
+
/* bg[n:] = -ry */
|
385
491
|
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
|
492
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
|
493
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* x */
|
386
494
|
|
387
|
-
|
388
|
-
cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
|
495
|
+
/* b[n:] = Ax - ry */
|
389
496
|
#if GPU_TRANSPOSE_MAT > 0
|
390
|
-
SCS(
|
391
|
-
|
392
|
-
|
393
|
-
);
|
497
|
+
SCS(accum_by_atrans_gpu)
|
498
|
+
(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
499
|
+
&p->buffer);
|
394
500
|
#else
|
395
|
-
SCS(
|
396
|
-
|
397
|
-
|
398
|
-
);
|
501
|
+
SCS(accum_by_a_gpu)
|
502
|
+
(Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
503
|
+
&p->buffer);
|
399
504
|
#endif
|
400
505
|
|
401
|
-
|
402
|
-
|
403
|
-
if (iter >= 0) {
|
404
|
-
p->tot_cg_its += cg_its;
|
405
|
-
}
|
506
|
+
/* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
|
507
|
+
scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
|
406
508
|
|
407
|
-
|
408
|
-
|
409
|
-
|
509
|
+
/* copy bg = [x; y] back to b */
|
510
|
+
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
|
511
|
+
cudaMemcpyDeviceToHost);
|
512
|
+
p->tot_cg_its += cg_its;
|
513
|
+
#if VERBOSITY > 1
|
514
|
+
scs_printf("tol %.3e\n", tol);
|
515
|
+
scs_printf("cg_its %i\n", (int)cg_its);
|
410
516
|
#endif
|
411
517
|
return 0;
|
412
518
|
}
|