scs 0.2.2 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE.txt +18 -18
- data/README.md +19 -14
- data/lib/scs/ffi.rb +31 -20
- data/lib/scs/solver.rb +32 -9
- data/lib/scs/version.rb +1 -1
- data/vendor/scs/CITATION.cff +39 -0
- data/vendor/scs/CMakeLists.txt +320 -0
- data/vendor/scs/Makefile +32 -23
- data/vendor/scs/README.md +9 -218
- data/vendor/scs/include/aa.h +67 -23
- data/vendor/scs/include/cones.h +22 -19
- data/vendor/scs/include/glbopts.h +107 -79
- data/vendor/scs/include/linalg.h +3 -4
- data/vendor/scs/include/linsys.h +58 -44
- data/vendor/scs/include/normalize.h +6 -5
- data/vendor/scs/include/rw.h +8 -2
- data/vendor/scs/include/scs.h +257 -141
- data/vendor/scs/include/scs_types.h +34 -0
- data/vendor/scs/include/scs_work.h +83 -0
- data/vendor/scs/include/util.h +3 -15
- data/vendor/scs/linsys/cpu/direct/private.c +241 -232
- data/vendor/scs/linsys/cpu/direct/private.h +13 -7
- data/vendor/scs/linsys/cpu/indirect/private.c +194 -118
- data/vendor/scs/linsys/cpu/indirect/private.h +7 -4
- data/vendor/scs/linsys/csparse.c +87 -0
- data/vendor/scs/linsys/csparse.h +34 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +6 -6
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +6 -1
- data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
- data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
- data/vendor/scs/linsys/external/qdldl/changes +2 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
- data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
- data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
- data/vendor/scs/linsys/gpu/gpu.c +58 -21
- data/vendor/scs/linsys/gpu/gpu.h +70 -35
- data/vendor/scs/linsys/gpu/indirect/private.c +394 -157
- data/vendor/scs/linsys/gpu/indirect/private.h +27 -12
- data/vendor/scs/linsys/scs_matrix.c +478 -0
- data/vendor/scs/linsys/scs_matrix.h +70 -0
- data/vendor/scs/scs.mk +14 -10
- data/vendor/scs/src/aa.c +394 -110
- data/vendor/scs/src/cones.c +497 -359
- data/vendor/scs/src/ctrlc.c +15 -5
- data/vendor/scs/src/linalg.c +107 -26
- data/vendor/scs/src/normalize.c +30 -72
- data/vendor/scs/src/rw.c +202 -27
- data/vendor/scs/src/scs.c +769 -571
- data/vendor/scs/src/scs_version.c +11 -3
- data/vendor/scs/src/util.c +37 -106
- data/vendor/scs/test/minunit.h +22 -8
- data/vendor/scs/test/problem_utils.h +180 -25
- data/vendor/scs/test/problems/degenerate.h +130 -0
- data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
- data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
- data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
- data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
- data/vendor/scs/test/problems/random_prob +0 -0
- data/vendor/scs/test/problems/random_prob.h +45 -0
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
- data/vendor/scs/test/problems/small_lp.h +14 -13
- data/vendor/scs/test/problems/small_qp.h +352 -0
- data/vendor/scs/test/problems/test_validation.h +43 -0
- data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
- data/vendor/scs/test/random_socp_prob.c +54 -53
- data/vendor/scs/test/rng.h +109 -0
- data/vendor/scs/test/run_from_file.c +20 -11
- data/vendor/scs/test/run_tests.c +35 -2
- metadata +29 -98
- data/vendor/scs/linsys/amatrix.c +0 -305
- data/vendor/scs/linsys/amatrix.h +0 -36
- data/vendor/scs/linsys/amatrix.o +0 -0
- data/vendor/scs/linsys/cpu/direct/private.o +0 -0
- data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
- data/vendor/scs/src/aa.o +0 -0
- data/vendor/scs/src/cones.o +0 -0
- data/vendor/scs/src/ctrlc.o +0 -0
- data/vendor/scs/src/linalg.o +0 -0
- data/vendor/scs/src/normalize.o +0 -0
- data/vendor/scs/src/rw.o +0 -0
- data/vendor/scs/src/scs.o +0 -0
- data/vendor/scs/src/scs_version.o +0 -0
- data/vendor/scs/src/util.o +0 -0
- data/vendor/scs/test/data/small_random_socp +0 -0
- data/vendor/scs/test/problems/small_random_socp.h +0 -33
- data/vendor/scs/test/run_tests +0 -2
|
@@ -1,61 +1,115 @@
|
|
|
1
1
|
#include "private.h"
|
|
2
|
+
#include "linsys.h"
|
|
2
3
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
/* do not use within pcg, reuses memory */
|
|
7
|
-
void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
|
|
8
|
-
const scs_float *x, scs_float *y) {
|
|
9
|
-
scs_float *v_m = p->tmp_m;
|
|
10
|
-
scs_float *v_n = p->r;
|
|
11
|
-
cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
12
|
-
cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
13
|
-
SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
|
|
14
|
-
cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
15
|
-
}
|
|
4
|
+
/* norm to use when deciding convergence */
|
|
5
|
+
/* should be consistent with CG_NORM in glbopts.h */
|
|
6
|
+
#define USE_L2_NORM (0)
|
|
16
7
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
scs_float
|
|
21
|
-
|
|
22
|
-
cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
23
|
-
cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
24
|
-
#if GPU_TRANSPOSE_MAT > 0
|
|
25
|
-
SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
|
|
8
|
+
static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
|
|
9
|
+
scs_int n) {
|
|
10
|
+
#if USE_L2_NORM > 0
|
|
11
|
+
scs_float nrm;
|
|
12
|
+
CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
|
|
26
13
|
#else
|
|
27
|
-
|
|
14
|
+
scs_int idx;
|
|
15
|
+
scs_float nrm;
|
|
16
|
+
CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
|
|
17
|
+
/* NOTE: we take idx -1 here since the routine above returns Fortran idxs */
|
|
18
|
+
cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
19
|
+
nrm = ABS(nrm);
|
|
28
20
|
#endif
|
|
29
|
-
|
|
21
|
+
return nrm;
|
|
30
22
|
}
|
|
31
23
|
|
|
32
|
-
char *SCS(get_lin_sys_method)(
|
|
33
|
-
|
|
34
|
-
sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
|
|
35
|
-
(long)A->p[A->n], stgs->cg_rate);
|
|
36
|
-
return str;
|
|
24
|
+
/*
 * Name of this linear-system backend. The returned pointer refers to a string
 * literal, so the caller must neither modify nor free it.
 */
const char *SCS(get_lin_sys_method)() {
  static const char *const method_name = "sparse-indirect GPU";
  return method_name;
}
|
|
38
27
|
|
|
28
|
+
/*
|
|
39
29
|
char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
|
|
40
30
|
char *str = (char *)scs_malloc(sizeof(char) * 128);
|
|
41
|
-
sprintf(str,
|
|
42
|
-
|
|
43
|
-
(scs_float)p->tot_cg_its / (info->iter + 1),
|
|
44
|
-
p->total_solve_time / (info->iter + 1) / 1e3);
|
|
31
|
+
sprintf(str, "lin-sys: avg cg its: %2.2f\n",
|
|
32
|
+
(scs_float)p->tot_cg_its / (info->iter + 1));
|
|
45
33
|
p->tot_cg_its = 0;
|
|
46
|
-
p->total_solve_time = 0;
|
|
47
34
|
return str;
|
|
48
35
|
}
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
/* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
|
|
39
|
+
/* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
|
|
40
|
+
static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
|
|
41
|
+
scs_int i, k;
|
|
42
|
+
const ScsMatrix *A = p->A;
|
|
43
|
+
const ScsMatrix *P = p->P;
|
|
44
|
+
scs_float *M = p->M;
|
|
45
|
+
|
|
46
|
+
#if VERBOSITY > 0
|
|
47
|
+
scs_printf("getting pre-conditioner\n");
|
|
48
|
+
#endif
|
|
49
|
+
|
|
50
|
+
/* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
|
|
51
|
+
for (i = 0; i < A->n; ++i) { /* cols */
|
|
52
|
+
/* M_ii = (R_x)_i */
|
|
53
|
+
M[i] = diag_r[i];
|
|
54
|
+
/* M_ii += a_i' (R_y)^-1 a_i */
|
|
55
|
+
for (k = A->p[i]; k < A->p[i + 1]; ++k) {
|
|
56
|
+
/* A->i[k] is row of entry k with value A->x[k] */
|
|
57
|
+
M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
|
|
58
|
+
}
|
|
59
|
+
if (P) {
|
|
60
|
+
for (k = P->p[i]; k < P->p[i + 1]; k++) {
|
|
61
|
+
/* diagonal element only */
|
|
62
|
+
if (P->i[k] == i) { /* row == col */
|
|
63
|
+
/* M_ii += P_ii */
|
|
64
|
+
M[i] += P->x[k];
|
|
65
|
+
break;
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
/* finally invert for pre-conditioner */
|
|
70
|
+
M[i] = 1. / M[i];
|
|
71
|
+
}
|
|
72
|
+
cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
73
|
+
#if VERBOSITY > 0
|
|
74
|
+
scs_printf("finished getting pre-conditioner\n");
|
|
75
|
+
#endif
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/* no need to update anything in this case */
|
|
79
|
+
void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
|
|
80
|
+
scs_int i;
|
|
81
|
+
|
|
82
|
+
/* R_x to gpu */
|
|
83
|
+
cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
|
|
84
|
+
cudaMemcpyHostToDevice);
|
|
85
|
+
|
|
86
|
+
/* 1/R_y to gpu */
|
|
87
|
+
for (i = 0; i < p->m; ++i)
|
|
88
|
+
p->inv_r_y[i] = 1. / diag_r[p->n + i];
|
|
89
|
+
cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
|
|
90
|
+
cudaMemcpyHostToDevice);
|
|
91
|
+
|
|
92
|
+
/* set preconditioner M on gpu */
|
|
93
|
+
set_preconditioner(p, diag_r);
|
|
94
|
+
}
|
|
49
95
|
|
|
50
96
|
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
51
97
|
if (p) {
|
|
98
|
+
scs_free(p->M);
|
|
99
|
+
scs_free(p->inv_r_y);
|
|
52
100
|
cudaFree(p->p);
|
|
53
101
|
cudaFree(p->r);
|
|
54
102
|
cudaFree(p->Gp);
|
|
55
103
|
cudaFree(p->bg);
|
|
56
104
|
cudaFree(p->tmp_m);
|
|
57
105
|
cudaFree(p->z);
|
|
58
|
-
cudaFree(p->
|
|
106
|
+
cudaFree(p->M_gpu);
|
|
107
|
+
cudaFree(p->r_x_gpu);
|
|
108
|
+
cudaFree(p->inv_r_y_gpu);
|
|
109
|
+
if (p->Pg) {
|
|
110
|
+
SCS(free_gpu_matrix)(p->Pg);
|
|
111
|
+
scs_free(p->Pg);
|
|
112
|
+
}
|
|
59
113
|
if (p->Ag) {
|
|
60
114
|
SCS(free_gpu_matrix)(p->Ag);
|
|
61
115
|
scs_free(p->Ag);
|
|
@@ -64,6 +118,12 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
|
64
118
|
SCS(free_gpu_matrix)(p->Agt);
|
|
65
119
|
scs_free(p->Agt);
|
|
66
120
|
}
|
|
121
|
+
if (p->buffer != SCS_NULL) {
|
|
122
|
+
cudaFree(p->buffer);
|
|
123
|
+
}
|
|
124
|
+
cusparseDestroyDnVec(p->dn_vec_m);
|
|
125
|
+
cusparseDestroyDnVec(p->dn_vec_n);
|
|
126
|
+
cusparseDestroyDnVec(p->dn_vec_n_p);
|
|
67
127
|
cusparseDestroy(p->cusparse_handle);
|
|
68
128
|
cublasDestroy(p->cublas_handle);
|
|
69
129
|
/* Don't reset because it interferes with other GPU programs. */
|
|
@@ -72,53 +132,127 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
|
72
132
|
}
|
|
73
133
|
}
|
|
74
134
|
|
|
75
|
-
/*
|
|
76
|
-
static void
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
|
|
82
|
-
cudaMemset(y, 0, A->n * sizeof(scs_float));
|
|
83
|
-
SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
|
|
84
|
-
CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
|
|
135
|
+
/* z = M * z elementwise in place, assumes M, z on GPU */
|
|
136
|
+
static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
|
|
137
|
+
scs_float *z, scs_int n) {
|
|
138
|
+
CUBLAS(tbmv)
|
|
139
|
+
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
|
|
140
|
+
0, M, 1, z, 1);
|
|
85
141
|
}
|
|
86
142
|
|
|
87
|
-
/*
|
|
88
|
-
static void
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
143
|
+
/* y = (R_x + P + A' R_y^{-1} A) x */
|
|
144
|
+
static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
|
145
|
+
/* x and y MUST already be loaded to GPU */
|
|
146
|
+
scs_float *z = p->tmp_m; /* temp memory */
|
|
147
|
+
cudaMemset(z, 0, p->m * sizeof(scs_float));
|
|
148
|
+
|
|
149
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
|
|
150
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
|
|
151
|
+
cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
|
|
152
|
+
|
|
153
|
+
/* y = x */
|
|
154
|
+
cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
155
|
+
/* y = R_x * x */
|
|
156
|
+
scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
|
|
157
|
+
|
|
158
|
+
if (p->Pg) {
|
|
159
|
+
/* y = R_x * x + P x */
|
|
160
|
+
SCS(accum_by_p_gpu)
|
|
161
|
+
(p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
|
162
|
+
&p->buffer);
|
|
163
|
+
}
|
|
92
164
|
|
|
93
|
-
|
|
94
|
-
|
|
165
|
+
/* z = Ax */
|
|
166
|
+
#if GPU_TRANSPOSE_MAT > 0
|
|
167
|
+
SCS(accum_by_atrans_gpu)
|
|
168
|
+
(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
169
|
+
&p->buffer);
|
|
170
|
+
#else
|
|
171
|
+
SCS(accum_by_a_gpu)
|
|
172
|
+
(p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
173
|
+
&p->buffer);
|
|
95
174
|
#endif
|
|
175
|
+
/* z = R_y^{-1} A x */
|
|
176
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);
|
|
96
177
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
103
|
-
scs_free(M);
|
|
178
|
+
/* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
|
|
179
|
+
SCS(accum_by_atrans_gpu)
|
|
180
|
+
(p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
|
181
|
+
&p->buffer);
|
|
182
|
+
}
|
|
104
183
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
184
|
+
/* P comes in upper triangular, expand to full
|
|
185
|
+
* First compute triplet version of full matrix, then compress to csc
|
|
186
|
+
* */
|
|
187
|
+
static csc *fill_p_matrix(const ScsMatrix *P) {
|
|
188
|
+
scs_int i, j, k, kk;
|
|
189
|
+
scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
|
|
190
|
+
csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
|
|
191
|
+
csc *P_full;
|
|
192
|
+
kk = 0;
|
|
193
|
+
for (j = 0; j < P->n; j++) { /* cols */
|
|
194
|
+
for (k = P->p[j]; k < P->p[j + 1]; k++) {
|
|
195
|
+
i = P->i[k]; /* row */
|
|
196
|
+
if (i > j) { /* only upper triangular needed */
|
|
197
|
+
break;
|
|
198
|
+
}
|
|
199
|
+
P_tmp->i[kk] = i;
|
|
200
|
+
P_tmp->p[kk] = j;
|
|
201
|
+
P_tmp->x[kk] = P->x[k];
|
|
202
|
+
kk++;
|
|
203
|
+
if (i == j) { /* diagonal */
|
|
204
|
+
continue;
|
|
205
|
+
}
|
|
206
|
+
P_tmp->i[kk] = j;
|
|
207
|
+
P_tmp->p[kk] = i;
|
|
208
|
+
P_tmp->x[kk] = P->x[k];
|
|
209
|
+
kk++;
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
P_tmp->nz = kk; /* set number of nonzeros */
|
|
213
|
+
P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
|
|
214
|
+
SCS(cs_spfree)(P_tmp);
|
|
215
|
+
return P_full;
|
|
108
216
|
}
|
|
109
217
|
|
|
110
|
-
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
111
|
-
const
|
|
218
|
+
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
219
|
+
const scs_float *diag_r) {
|
|
112
220
|
cudaError_t err;
|
|
113
|
-
|
|
114
|
-
|
|
221
|
+
csc *P_full;
|
|
222
|
+
ScsLinSysWork *p = SCS_NULL;
|
|
223
|
+
ScsGpuMatrix *Ag = SCS_NULL;
|
|
224
|
+
ScsGpuMatrix *Pg = SCS_NULL;
|
|
225
|
+
int device_count;
|
|
226
|
+
|
|
227
|
+
err = cudaGetDeviceCount(&device_count);
|
|
228
|
+
if (err > 0) {
|
|
229
|
+
scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
|
|
230
|
+
return SCS_NULL;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
|
234
|
+
Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
|
235
|
+
|
|
236
|
+
p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
|
|
237
|
+
p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
|
|
238
|
+
|
|
239
|
+
p->A = A;
|
|
240
|
+
p->P = P;
|
|
241
|
+
p->m = A->m;
|
|
242
|
+
p->n = A->n;
|
|
243
|
+
|
|
244
|
+
#if GPU_TRANSPOSE_MAT > 0
|
|
245
|
+
size_t new_buffer_size = 0;
|
|
246
|
+
#endif
|
|
115
247
|
|
|
116
248
|
p->cublas_handle = 0;
|
|
117
249
|
p->cusparse_handle = 0;
|
|
118
250
|
|
|
119
|
-
p->total_solve_time = 0;
|
|
120
251
|
p->tot_cg_its = 0;
|
|
121
252
|
|
|
253
|
+
p->buffer_size = 0;
|
|
254
|
+
p->buffer = SCS_NULL;
|
|
255
|
+
|
|
122
256
|
/* Get handle to the CUBLAS context */
|
|
123
257
|
cublasCreate(&p->cublas_handle);
|
|
124
258
|
|
|
@@ -127,15 +261,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
127
261
|
|
|
128
262
|
Ag->n = A->n;
|
|
129
263
|
Ag->m = A->m;
|
|
130
|
-
Ag->
|
|
264
|
+
Ag->nnz = A->p[A->n];
|
|
131
265
|
Ag->descr = 0;
|
|
132
|
-
/* Matrix description */
|
|
133
|
-
cusparseCreateMatDescr(&Ag->descr);
|
|
134
|
-
cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
|
135
|
-
cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
|
|
136
|
-
p->Ag = Ag;
|
|
137
|
-
p->Agt = SCS_NULL;
|
|
138
|
-
|
|
139
266
|
cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
|
|
140
267
|
cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
|
|
141
268
|
cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
|
|
@@ -144,10 +271,11 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
144
271
|
cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
|
|
145
272
|
cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
|
|
146
273
|
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
|
|
147
|
-
cudaMalloc((void **)&p->tmp_m,
|
|
148
|
-
A->m * sizeof(scs_float)); /* intermediate result */
|
|
274
|
+
cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
|
|
149
275
|
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
|
|
150
|
-
cudaMalloc((void **)&p->
|
|
276
|
+
cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
|
|
277
|
+
cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
|
|
278
|
+
cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));
|
|
151
279
|
|
|
152
280
|
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
|
|
153
281
|
cudaMemcpyHostToDevice);
|
|
@@ -155,32 +283,89 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
155
283
|
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
|
|
156
284
|
cudaMemcpyHostToDevice);
|
|
157
285
|
|
|
158
|
-
|
|
286
|
+
cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
|
|
287
|
+
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
|
288
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
|
289
|
+
|
|
290
|
+
if (P) {
|
|
291
|
+
Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
|
292
|
+
P_full = fill_p_matrix(P);
|
|
293
|
+
Pg->n = P_full->n;
|
|
294
|
+
Pg->m = P_full->m;
|
|
295
|
+
Pg->nnz = P_full->p[P_full->n];
|
|
296
|
+
Pg->descr = 0;
|
|
297
|
+
cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
|
|
298
|
+
cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
|
|
299
|
+
cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
|
|
300
|
+
|
|
301
|
+
cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
|
|
302
|
+
cudaMemcpyHostToDevice);
|
|
303
|
+
cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
|
|
304
|
+
cudaMemcpyHostToDevice);
|
|
305
|
+
cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
|
|
306
|
+
cudaMemcpyHostToDevice);
|
|
307
|
+
|
|
308
|
+
cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
|
|
309
|
+
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
|
310
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
|
311
|
+
|
|
312
|
+
SCS(cs_spfree)(P_full);
|
|
313
|
+
} else {
|
|
314
|
+
Pg = SCS_NULL;
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
p->Ag = Ag;
|
|
318
|
+
p->Pg = Pg;
|
|
319
|
+
p->Agt = SCS_NULL;
|
|
320
|
+
|
|
321
|
+
/* we initialize with tmp_m but always overwrite it so it doesn't matter */
|
|
322
|
+
cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
|
323
|
+
cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
|
324
|
+
cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
|
|
325
|
+
|
|
326
|
+
/* Form preconditioner and copy R_x, 1/R_y to gpu */
|
|
327
|
+
SCS(update_lin_sys_diag_r)(p, diag_r);
|
|
159
328
|
|
|
160
329
|
#if GPU_TRANSPOSE_MAT > 0
|
|
161
330
|
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
|
162
331
|
p->Agt->n = A->m;
|
|
163
332
|
p->Agt->m = A->n;
|
|
164
|
-
p->Agt->
|
|
333
|
+
p->Agt->nnz = A->p[A->n];
|
|
165
334
|
p->Agt->descr = 0;
|
|
166
335
|
/* Matrix description */
|
|
167
|
-
cusparseCreateMatDescr(&p->Agt->descr);
|
|
168
|
-
cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
|
169
|
-
cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
|
|
170
336
|
|
|
171
337
|
cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
|
|
172
338
|
cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
|
|
173
339
|
cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
|
|
174
340
|
/* transpose Ag into Agt for faster multiplies */
|
|
175
341
|
/* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
342
|
+
cusparseCsr2cscEx2_bufferSize(
|
|
343
|
+
p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
|
|
344
|
+
p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
|
|
345
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);
|
|
346
|
+
|
|
347
|
+
if (new_buffer_size > p->buffer_size) {
|
|
348
|
+
if (p->buffer != SCS_NULL) {
|
|
349
|
+
cudaFree(p->buffer);
|
|
350
|
+
}
|
|
351
|
+
cudaMalloc(&p->buffer, new_buffer_size);
|
|
352
|
+
p->buffer_size = new_buffer_size;
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
|
|
356
|
+
Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
|
|
357
|
+
CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
|
|
358
|
+
SCS_CSR2CSC_ALG, p->buffer);
|
|
359
|
+
|
|
360
|
+
cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
|
|
361
|
+
p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
|
|
362
|
+
SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
|
|
363
|
+
SCS_CUDA_FLOAT);
|
|
179
364
|
#endif
|
|
180
365
|
|
|
181
366
|
err = cudaGetLastError();
|
|
182
367
|
if (err != cudaSuccess) {
|
|
183
|
-
printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
|
|
368
|
+
printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
|
|
184
369
|
cudaGetErrorString(err));
|
|
185
370
|
SCS(free_lin_sys_work)(p);
|
|
186
371
|
return SCS_NULL;
|
|
@@ -188,117 +373,169 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
188
373
|
return p;
|
|
189
374
|
}
|
|
190
375
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
CUBLAS(tbmv)
|
|
195
|
-
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
|
|
196
|
-
0, M, 1, z, 1);
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
/* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
|
|
200
|
-
static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
|
|
201
|
-
ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
376
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
|
|
377
|
+
* b, on GPU */
|
|
378
|
+
static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
202
379
|
scs_int max_its, scs_float tol) {
|
|
203
|
-
scs_int i, n =
|
|
204
|
-
scs_float
|
|
380
|
+
scs_int i, n = pr->n;
|
|
381
|
+
scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
|
|
205
382
|
scs_float onef = 1.0, neg_onef = -1.0;
|
|
206
383
|
scs_float *p = pr->p; /* cg direction */
|
|
207
384
|
scs_float *Gp = pr->Gp; /* updated CG direction */
|
|
208
385
|
scs_float *r = pr->r; /* cg residual */
|
|
209
386
|
scs_float *z = pr->z; /* preconditioned */
|
|
210
|
-
scs_float *M = pr->M; /* preconditioner */
|
|
211
387
|
cublasHandle_t cublas_handle = pr->cublas_handle;
|
|
212
388
|
|
|
213
|
-
if (s
|
|
389
|
+
if (!s) {
|
|
390
|
+
/* take s = 0 */
|
|
391
|
+
/* r = b */
|
|
214
392
|
cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
393
|
+
/* b = 0 */
|
|
215
394
|
cudaMemset(bg, 0, n * sizeof(scs_float));
|
|
216
395
|
} else {
|
|
217
396
|
/* p contains bg temporarily */
|
|
218
397
|
cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
219
|
-
/* bg
|
|
398
|
+
/* bg = s */
|
|
220
399
|
cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
221
|
-
|
|
400
|
+
/* r = Mat * s */
|
|
401
|
+
mat_vec(pr, bg, r);
|
|
402
|
+
/* r = Mat * s - b */
|
|
222
403
|
CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
|
|
404
|
+
/* r = b - Mat * s */
|
|
223
405
|
CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
|
|
224
406
|
}
|
|
225
407
|
|
|
226
|
-
/* for some reason nrm2 is VERY slow */
|
|
227
|
-
/* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
|
|
228
|
-
CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
|
|
229
|
-
nrm_r = SQRTF(nrm_r);
|
|
230
408
|
/* check to see if we need to run CG at all */
|
|
231
|
-
if (
|
|
409
|
+
if (cg_gpu_norm(cublas_handle, r, n) < tol) {
|
|
232
410
|
return 0;
|
|
233
411
|
}
|
|
234
412
|
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
413
|
+
/* z = M r */
|
|
414
|
+
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
415
|
+
scale_by_diag(cublas_handle, pr->M_gpu, z, n);
|
|
416
|
+
/* ztr = z'r */
|
|
417
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
|
418
|
+
/* p = z */
|
|
238
419
|
cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
239
420
|
|
|
240
421
|
for (i = 0; i < max_its; ++i) {
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
alpha =
|
|
422
|
+
/* Gp = Mat * p */
|
|
423
|
+
mat_vec(pr, p, Gp);
|
|
424
|
+
/* ptGp = p'Gp */
|
|
425
|
+
CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
|
|
426
|
+
/* alpha = z'r / p'G p */
|
|
427
|
+
alpha = ztr / ptGp;
|
|
246
428
|
neg_alpha = -alpha;
|
|
247
|
-
|
|
429
|
+
/* b += alpha * p */
|
|
248
430
|
CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
|
|
431
|
+
/* r -= alpha * G p */
|
|
249
432
|
CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
|
|
250
433
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
if (nrm_r < tol) {
|
|
256
|
-
i++;
|
|
257
|
-
break;
|
|
258
|
-
}
|
|
259
|
-
ipzr_old = ipzr;
|
|
260
|
-
apply_pre_conditioner(cublas_handle, M, z, r, n);
|
|
261
|
-
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
|
|
434
|
+
#if VERBOSITY > 3
|
|
435
|
+
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
|
|
436
|
+
cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
|
|
437
|
+
#endif
|
|
262
438
|
|
|
263
|
-
|
|
439
|
+
if (cg_gpu_norm(cublas_handle, r, n) < tol) {
|
|
440
|
+
return i + 1;
|
|
441
|
+
}
|
|
442
|
+
/* z = M r */
|
|
443
|
+
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
444
|
+
scale_by_diag(cublas_handle, pr->M_gpu, z, n);
|
|
445
|
+
ztr_prev = ztr;
|
|
446
|
+
/* ztr = z'r */
|
|
447
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
|
448
|
+
beta = ztr / ztr_prev;
|
|
449
|
+
/* p = beta * p, where beta = ztr / ztr_prev */
|
|
264
450
|
CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
|
|
451
|
+
/* p = z + beta * p */
|
|
265
452
|
CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
|
|
266
453
|
}
|
|
267
|
-
#if EXTRA_VERBOSE > 0
|
|
268
|
-
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
|
|
269
|
-
#endif
|
|
270
454
|
return i;
|
|
271
455
|
}
|
|
272
456
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
457
|
+
/* solves Mx = b, for x but stores result in b */
|
|
458
|
+
/* s contains warm-start (if available) */
|
|
459
|
+
/*
|
|
460
|
+
* [x] = [R_x + P A' ]^{-1} [rx]
|
|
461
|
+
* [y] [ A -R_y ] [ry]
|
|
462
|
+
*
|
|
463
|
+
* becomes:
|
|
464
|
+
*
|
|
465
|
+
* x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
|
|
466
|
+
* y = R_y^{-1} (Ax - ry)
|
|
467
|
+
*
|
|
468
|
+
*/
|
|
469
|
+
scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
470
|
+
scs_float tol) {
|
|
471
|
+
scs_int cg_its, max_iters;
|
|
279
472
|
scs_float neg_onef = -1.0;
|
|
473
|
+
|
|
474
|
+
/* these are on GPU */
|
|
475
|
+
scs_float *bg = p->bg;
|
|
476
|
+
scs_float *tmp_m = p->tmp_m;
|
|
280
477
|
ScsGpuMatrix *Ag = p->Ag;
|
|
281
|
-
scs_float cg_tol =
|
|
282
|
-
SCS(norm)(b, Ag->n) *
|
|
283
|
-
(iter < 0 ? CG_BEST_TOL
|
|
284
|
-
: CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
|
|
285
|
-
SCS(tic)(&linsys_timer);
|
|
286
|
-
/* all on GPU */
|
|
287
|
-
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
288
|
-
SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
|
|
289
|
-
/* solves (I+A'A)x = b, s warm start, solution stored in b */
|
|
290
|
-
cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
|
|
291
|
-
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
|
|
292
|
-
SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
|
|
293
|
-
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
294
478
|
|
|
295
|
-
if (
|
|
296
|
-
p->
|
|
479
|
+
if (CG_NORM(b, p->n + p->m) <= 1e-12) {
|
|
480
|
+
memset(b, 0, (p->n + p->m) * sizeof(scs_float));
|
|
481
|
+
return 0;
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
if (tol <= 0.) {
|
|
485
|
+
scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
|
|
486
|
+
"INDIRECT flag.\n",
|
|
487
|
+
tol);
|
|
297
488
|
}
|
|
298
489
|
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
490
|
+
/* bg = b = [rx; ry] */
|
|
491
|
+
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
|
|
492
|
+
cudaMemcpyHostToDevice);
|
|
493
|
+
/* tmp = ry */
|
|
494
|
+
cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
|
|
495
|
+
cudaMemcpyDeviceToDevice);
|
|
496
|
+
/* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
|
|
497
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
|
|
498
|
+
|
|
499
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
|
|
500
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
|
|
501
|
+
/* bg[:n] = rx + A' R ry */
|
|
502
|
+
SCS(accum_by_atrans_gpu)
|
|
503
|
+
(Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
|
|
504
|
+
&p->buffer);
|
|
505
|
+
|
|
506
|
+
/* set max_iters to 10 * n (though in theory n is enough for any tol) */
|
|
507
|
+
max_iters = 10 * Ag->n;
|
|
508
|
+
|
|
509
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
|
|
510
|
+
* in bg */
|
|
511
|
+
cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
|
|
512
|
+
|
|
513
|
+
/* bg[n:] = -ry */
|
|
514
|
+
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
|
|
515
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
|
|
516
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* x */
|
|
517
|
+
|
|
518
|
+
/* b[n:] = Ax - ry */
|
|
519
|
+
#if GPU_TRANSPOSE_MAT > 0
|
|
520
|
+
SCS(accum_by_atrans_gpu)
|
|
521
|
+
(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
522
|
+
&p->buffer);
|
|
523
|
+
#else
|
|
524
|
+
SCS(accum_by_a_gpu)
|
|
525
|
+
(Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
526
|
+
&p->buffer);
|
|
527
|
+
#endif
|
|
528
|
+
|
|
529
|
+
/* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
|
|
530
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
|
|
531
|
+
|
|
532
|
+
/* copy bg = [x; y] back to b */
|
|
533
|
+
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
|
|
534
|
+
cudaMemcpyDeviceToHost);
|
|
535
|
+
p->tot_cg_its += cg_its;
|
|
536
|
+
#if VERBOSITY > 1
|
|
537
|
+
scs_printf("tol %.3e\n", tol);
|
|
538
|
+
scs_printf("cg_its %i\n", (int)cg_its);
|
|
302
539
|
#endif
|
|
303
540
|
return 0;
|
|
304
541
|
}
|