scs 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/LICENSE.txt +18 -18
- data/README.md +28 -9
- data/ext/scs/extconf.rb +29 -0
- data/lib/scs/ffi.rb +30 -13
- data/lib/scs/solver.rb +32 -14
- data/lib/scs/version.rb +1 -1
- data/vendor/scs/CITATION.cff +39 -0
- data/vendor/scs/CMakeLists.txt +272 -0
- data/vendor/scs/Makefile +24 -15
- data/vendor/scs/README.md +8 -216
- data/vendor/scs/include/aa.h +67 -23
- data/vendor/scs/include/cones.h +17 -17
- data/vendor/scs/include/glbopts.h +98 -32
- data/vendor/scs/include/linalg.h +2 -4
- data/vendor/scs/include/linsys.h +58 -44
- data/vendor/scs/include/normalize.h +3 -3
- data/vendor/scs/include/rw.h +8 -2
- data/vendor/scs/include/scs.h +293 -133
- data/vendor/scs/include/util.h +3 -15
- data/vendor/scs/linsys/cpu/direct/private.c +220 -224
- data/vendor/scs/linsys/cpu/direct/private.h +13 -7
- data/vendor/scs/linsys/cpu/direct/private.o +0 -0
- data/vendor/scs/linsys/cpu/indirect/private.c +177 -110
- data/vendor/scs/linsys/cpu/indirect/private.h +8 -4
- data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
- data/vendor/scs/linsys/csparse.c +87 -0
- data/vendor/scs/linsys/csparse.h +34 -0
- data/vendor/scs/linsys/csparse.o +0 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +1 -1
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
- data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
- data/vendor/scs/linsys/external/qdldl/changes +2 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
- data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
- data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
- data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
- data/vendor/scs/linsys/gpu/gpu.c +58 -21
- data/vendor/scs/linsys/gpu/gpu.h +66 -28
- data/vendor/scs/linsys/gpu/indirect/private.c +368 -154
- data/vendor/scs/linsys/gpu/indirect/private.h +26 -12
- data/vendor/scs/linsys/scs_matrix.c +498 -0
- data/vendor/scs/linsys/scs_matrix.h +70 -0
- data/vendor/scs/linsys/scs_matrix.o +0 -0
- data/vendor/scs/scs.mk +13 -9
- data/vendor/scs/src/aa.c +384 -109
- data/vendor/scs/src/aa.o +0 -0
- data/vendor/scs/src/cones.c +440 -353
- data/vendor/scs/src/cones.o +0 -0
- data/vendor/scs/src/ctrlc.c +15 -5
- data/vendor/scs/src/ctrlc.o +0 -0
- data/vendor/scs/src/linalg.c +84 -28
- data/vendor/scs/src/linalg.o +0 -0
- data/vendor/scs/src/normalize.c +22 -64
- data/vendor/scs/src/normalize.o +0 -0
- data/vendor/scs/src/rw.c +161 -22
- data/vendor/scs/src/rw.o +0 -0
- data/vendor/scs/src/scs.c +768 -561
- data/vendor/scs/src/scs.o +0 -0
- data/vendor/scs/src/scs_indir.o +0 -0
- data/vendor/scs/src/scs_version.c +9 -3
- data/vendor/scs/src/scs_version.o +0 -0
- data/vendor/scs/src/util.c +37 -106
- data/vendor/scs/src/util.o +0 -0
- data/vendor/scs/test/minunit.h +17 -8
- data/vendor/scs/test/problem_utils.h +176 -14
- data/vendor/scs/test/problems/degenerate.h +130 -0
- data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
- data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
- data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
- data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
- data/vendor/scs/test/problems/random_prob +0 -0
- data/vendor/scs/test/problems/random_prob.h +45 -0
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
- data/vendor/scs/test/problems/small_lp.h +13 -14
- data/vendor/scs/test/problems/test_fails.h +43 -0
- data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
- data/vendor/scs/test/random_socp_prob.c +54 -53
- data/vendor/scs/test/rng.h +109 -0
- data/vendor/scs/test/run_from_file.c +19 -10
- data/vendor/scs/test/run_tests.c +27 -3
- metadata +30 -73
- data/ext/scs/Rakefile +0 -11
- data/vendor/scs/linsys/amatrix.c +0 -305
- data/vendor/scs/linsys/amatrix.h +0 -36
- data/vendor/scs/linsys/amatrix.o +0 -0
- data/vendor/scs/test/data/small_random_socp +0 -0
- data/vendor/scs/test/problems/small_random_socp.h +0 -33
- data/vendor/scs/test/run_tests +0 -2
|
@@ -1,54 +1,89 @@
|
|
|
1
1
|
#include "private.h"
|
|
2
|
+
#include "linsys.h"
|
|
2
3
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
/* do not use within pcg, reuses memory */
|
|
7
|
-
void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
|
|
8
|
-
const scs_float *x, scs_float *y) {
|
|
9
|
-
scs_float *v_m = p->tmp_m;
|
|
10
|
-
scs_float *v_n = p->r;
|
|
11
|
-
cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
12
|
-
cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
13
|
-
SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
|
|
14
|
-
cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
15
|
-
}
|
|
4
|
+
/* norm to use when deciding convergence */
|
|
5
|
+
/* should be consistent with CG_NORM in glbopts.h */
|
|
6
|
+
#define USE_L2_NORM (0)
|
|
16
7
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
scs_float
|
|
21
|
-
|
|
22
|
-
cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
23
|
-
cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
24
|
-
#if GPU_TRANSPOSE_MAT > 0
|
|
25
|
-
SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
|
|
8
|
+
static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
|
|
9
|
+
scs_int n) {
|
|
10
|
+
#if USE_L2_NORM > 0
|
|
11
|
+
scs_float nrm;
|
|
12
|
+
CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
|
|
26
13
|
#else
|
|
27
|
-
|
|
14
|
+
scs_int idx;
|
|
15
|
+
scs_float nrm;
|
|
16
|
+
CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
|
|
17
|
+
/* NOTE: we take idx - 1 here since the routine above returns 1-based (Fortran-style) indices */
|
|
18
|
+
cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
19
|
+
nrm = ABS(nrm);
|
|
28
20
|
#endif
|
|
29
|
-
|
|
21
|
+
return nrm;
|
|
30
22
|
}
|
|
31
23
|
|
|
32
|
-
char *SCS(get_lin_sys_method)(
|
|
33
|
-
|
|
34
|
-
sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
|
|
35
|
-
(long)A->p[A->n], stgs->cg_rate);
|
|
36
|
-
return str;
|
|
24
|
+
const char *SCS(get_lin_sys_method)() {
|
|
25
|
+
return "sparse-indirect GPU";
|
|
37
26
|
}
|
|
38
27
|
|
|
28
|
+
/*
|
|
39
29
|
char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
|
|
40
30
|
char *str = (char *)scs_malloc(sizeof(char) * 128);
|
|
41
|
-
sprintf(str,
|
|
42
|
-
|
|
43
|
-
(scs_float)p->tot_cg_its / (info->iter + 1),
|
|
44
|
-
p->total_solve_time / (info->iter + 1) / 1e3);
|
|
31
|
+
sprintf(str, "lin-sys: avg cg its: %2.2f\n",
|
|
32
|
+
(scs_float)p->tot_cg_its / (info->iter + 1));
|
|
45
33
|
p->tot_cg_its = 0;
|
|
46
|
-
p->total_solve_time = 0;
|
|
47
34
|
return str;
|
|
48
35
|
}
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
/* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
|
|
39
|
+
static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
|
|
40
|
+
scs_int i, k;
|
|
41
|
+
const ScsMatrix *A = p->A;
|
|
42
|
+
const ScsMatrix *P = p->P;
|
|
43
|
+
scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
|
|
44
|
+
|
|
45
|
+
#if VERBOSITY > 0
|
|
46
|
+
scs_printf("getting pre-conditioner\n");
|
|
47
|
+
#endif
|
|
48
|
+
|
|
49
|
+
for (i = 0; i < A->n; ++i) { /* cols */
|
|
50
|
+
M[i] = p->rho_x;
|
|
51
|
+
/* diag(A' R_y^{-1} A) */
|
|
52
|
+
for (k = A->p[i]; k < A->p[i + 1]; ++k) {
|
|
53
|
+
/* A->i[k] is row of entry k with value A->x[k] */
|
|
54
|
+
M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
|
|
55
|
+
}
|
|
56
|
+
if (P) {
|
|
57
|
+
for (k = P->p[i]; k < P->p[i + 1]; k++) {
|
|
58
|
+
/* diagonal element only */
|
|
59
|
+
if (P->i[k] == i) { /* row == col */
|
|
60
|
+
M[i] += P->x[k];
|
|
61
|
+
break;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
M[i] = 1. / M[i];
|
|
66
|
+
}
|
|
67
|
+
cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
68
|
+
scs_free(M);
|
|
69
|
+
#if VERBOSITY > 0
|
|
70
|
+
scs_printf("finished getting pre-conditioner\n");
|
|
71
|
+
#endif
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
/* rho_y_vec changed: refresh R_y^{-1} on host and GPU, then rebuild the preconditioner */
|
|
75
|
+
void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
|
|
76
|
+
scs_int i;
|
|
77
|
+
for (i = 0; i < p->m; ++i)
|
|
78
|
+
p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
|
|
79
|
+
cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
|
|
80
|
+
cudaMemcpyHostToDevice);
|
|
81
|
+
set_preconditioner(p, rho_y_vec);
|
|
82
|
+
}
|
|
49
83
|
|
|
50
84
|
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
51
85
|
if (p) {
|
|
86
|
+
scs_free(p->inv_rho_y_vec);
|
|
52
87
|
cudaFree(p->p);
|
|
53
88
|
cudaFree(p->r);
|
|
54
89
|
cudaFree(p->Gp);
|
|
@@ -56,6 +91,11 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
|
56
91
|
cudaFree(p->tmp_m);
|
|
57
92
|
cudaFree(p->z);
|
|
58
93
|
cudaFree(p->M);
|
|
94
|
+
cudaFree(p->inv_rho_y_vec_gpu);
|
|
95
|
+
if (p->Pg) {
|
|
96
|
+
SCS(free_gpu_matrix)(p->Pg);
|
|
97
|
+
scs_free(p->Pg);
|
|
98
|
+
}
|
|
59
99
|
if (p->Ag) {
|
|
60
100
|
SCS(free_gpu_matrix)(p->Ag);
|
|
61
101
|
scs_free(p->Ag);
|
|
@@ -64,6 +104,12 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
|
64
104
|
SCS(free_gpu_matrix)(p->Agt);
|
|
65
105
|
scs_free(p->Agt);
|
|
66
106
|
}
|
|
107
|
+
if (p->buffer != SCS_NULL) {
|
|
108
|
+
cudaFree(p->buffer);
|
|
109
|
+
}
|
|
110
|
+
cusparseDestroyDnVec(p->dn_vec_m);
|
|
111
|
+
cusparseDestroyDnVec(p->dn_vec_n);
|
|
112
|
+
cusparseDestroyDnVec(p->dn_vec_n_p);
|
|
67
113
|
cusparseDestroy(p->cusparse_handle);
|
|
68
114
|
cublasDestroy(p->cublas_handle);
|
|
69
115
|
/* Don't reset because it interferes with other GPU programs. */
|
|
@@ -72,53 +118,110 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
|
72
118
|
}
|
|
73
119
|
}
|
|
74
120
|
|
|
75
|
-
/*
|
|
76
|
-
static void
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
|
|
82
|
-
cudaMemset(y, 0, A->n * sizeof(scs_float));
|
|
83
|
-
SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
|
|
84
|
-
CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
|
|
121
|
+
/* z = M * z elementwise in place, assumes M, z on GPU */
|
|
122
|
+
static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
|
|
123
|
+
scs_float *z, scs_int n) {
|
|
124
|
+
CUBLAS(tbmv)
|
|
125
|
+
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
|
|
126
|
+
0, M, 1, z, 1);
|
|
85
127
|
}
|
|
86
128
|
|
|
87
|
-
/*
|
|
88
|
-
static void
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
129
|
+
/* y = (rho_x * I + P + A' R_y^{-1} A) x */
|
|
130
|
+
static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
|
131
|
+
/* x and y MUST already be loaded to GPU */
|
|
132
|
+
scs_float *z = p->tmp_m; /* temp memory */
|
|
133
|
+
cudaMemset(y, 0, p->n * sizeof(scs_float));
|
|
134
|
+
cudaMemset(z, 0, p->m * sizeof(scs_float));
|
|
135
|
+
|
|
136
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
|
|
137
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
|
|
138
|
+
cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
|
|
139
|
+
|
|
140
|
+
/* y = rho_x * x */
|
|
141
|
+
CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
|
|
142
|
+
|
|
143
|
+
if (p->Pg) {
|
|
144
|
+
/* y = rho_x * x + Px */
|
|
145
|
+
SCS(accum_by_p_gpu)
|
|
146
|
+
(p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
|
147
|
+
&p->buffer);
|
|
148
|
+
}
|
|
92
149
|
|
|
93
|
-
|
|
94
|
-
|
|
150
|
+
/* z = Ax */
|
|
151
|
+
#if GPU_TRANSPOSE_MAT > 0
|
|
152
|
+
SCS(accum_by_atrans_gpu)
|
|
153
|
+
(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
154
|
+
&p->buffer);
|
|
155
|
+
#else
|
|
156
|
+
SCS(accum_by_a_gpu)
|
|
157
|
+
(p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
158
|
+
&p->buffer);
|
|
95
159
|
#endif
|
|
160
|
+
/* z = R_y^{-1} A x */
|
|
161
|
+
scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
|
|
96
162
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
103
|
-
scs_free(M);
|
|
163
|
+
/* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
|
|
164
|
+
SCS(accum_by_atrans_gpu)
|
|
165
|
+
(p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
|
166
|
+
&p->buffer);
|
|
167
|
+
}
|
|
104
168
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
169
|
+
/* P comes in upper triangular, expand to full
|
|
170
|
+
* First compute triplet version of full matrix, then compress to csc
|
|
171
|
+
* */
|
|
172
|
+
static csc *fill_p_matrix(const ScsMatrix *P) {
|
|
173
|
+
scs_int i, j, k, kk;
|
|
174
|
+
scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
|
|
175
|
+
csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
|
|
176
|
+
csc *P_full;
|
|
177
|
+
kk = 0;
|
|
178
|
+
for (j = 0; j < P->n; j++) { /* cols */
|
|
179
|
+
for (k = P->p[j]; k < P->p[j + 1]; k++) {
|
|
180
|
+
i = P->i[k]; /* row */
|
|
181
|
+
if (i > j) { /* only upper triangular needed */
|
|
182
|
+
break;
|
|
183
|
+
}
|
|
184
|
+
P_tmp->i[kk] = i;
|
|
185
|
+
P_tmp->p[kk] = j;
|
|
186
|
+
P_tmp->x[kk] = P->x[k];
|
|
187
|
+
kk++;
|
|
188
|
+
if (i == j) { /* diagonal */
|
|
189
|
+
continue;
|
|
190
|
+
}
|
|
191
|
+
P_tmp->i[kk] = j;
|
|
192
|
+
P_tmp->p[kk] = i;
|
|
193
|
+
P_tmp->x[kk] = P->x[k];
|
|
194
|
+
kk++;
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
P_tmp->nz = kk; /* set number of nonzeros */
|
|
198
|
+
P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
|
|
199
|
+
SCS(cs_spfree)(P_tmp);
|
|
200
|
+
return P_full;
|
|
108
201
|
}
|
|
109
202
|
|
|
110
|
-
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
111
|
-
|
|
203
|
+
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
204
|
+
scs_float *rho_y_vec, scs_float rho_x) {
|
|
112
205
|
cudaError_t err;
|
|
206
|
+
scs_int i;
|
|
207
|
+
csc *P_full;
|
|
113
208
|
ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
|
114
|
-
ScsGpuMatrix *Ag = (ScsGpuMatrix *)
|
|
209
|
+
ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
|
210
|
+
ScsGpuMatrix *Pg = SCS_NULL;
|
|
115
211
|
|
|
212
|
+
#if GPU_TRANSPOSE_MAT > 0
|
|
213
|
+
size_t new_buffer_size = 0;
|
|
214
|
+
#endif
|
|
215
|
+
|
|
216
|
+
p->rho_x = rho_x;
|
|
116
217
|
p->cublas_handle = 0;
|
|
117
218
|
p->cusparse_handle = 0;
|
|
118
219
|
|
|
119
|
-
p->total_solve_time = 0;
|
|
120
220
|
p->tot_cg_its = 0;
|
|
121
221
|
|
|
222
|
+
p->buffer_size = 0;
|
|
223
|
+
p->buffer = SCS_NULL;
|
|
224
|
+
|
|
122
225
|
/* Get handle to the CUBLAS context */
|
|
123
226
|
cublasCreate(&p->cublas_handle);
|
|
124
227
|
|
|
@@ -127,15 +230,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
127
230
|
|
|
128
231
|
Ag->n = A->n;
|
|
129
232
|
Ag->m = A->m;
|
|
130
|
-
Ag->
|
|
233
|
+
Ag->nnz = A->p[A->n];
|
|
131
234
|
Ag->descr = 0;
|
|
132
|
-
/* Matrix description */
|
|
133
|
-
cusparseCreateMatDescr(&Ag->descr);
|
|
134
|
-
cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
|
135
|
-
cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
|
|
136
|
-
p->Ag = Ag;
|
|
137
|
-
p->Agt = SCS_NULL;
|
|
138
|
-
|
|
139
235
|
cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
|
|
140
236
|
cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
|
|
141
237
|
cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
|
|
@@ -144,10 +240,10 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
144
240
|
cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
|
|
145
241
|
cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
|
|
146
242
|
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
|
|
147
|
-
cudaMalloc((void **)&p->tmp_m,
|
|
148
|
-
A->m * sizeof(scs_float)); /* intermediate result */
|
|
243
|
+
cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
|
|
149
244
|
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
|
|
150
245
|
cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
|
|
246
|
+
cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
|
|
151
247
|
|
|
152
248
|
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
|
|
153
249
|
cudaMemcpyHostToDevice);
|
|
@@ -155,32 +251,94 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
155
251
|
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
|
|
156
252
|
cudaMemcpyHostToDevice);
|
|
157
253
|
|
|
158
|
-
|
|
254
|
+
p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
|
|
255
|
+
for (i = 0; i < A->m; ++i)
|
|
256
|
+
p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
|
|
257
|
+
cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
|
|
258
|
+
cudaMemcpyHostToDevice);
|
|
259
|
+
|
|
260
|
+
cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
|
|
261
|
+
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
|
262
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
|
263
|
+
|
|
264
|
+
if (P) {
|
|
265
|
+
Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
|
266
|
+
P_full = fill_p_matrix(P);
|
|
267
|
+
Pg->n = P_full->n;
|
|
268
|
+
Pg->m = P_full->m;
|
|
269
|
+
Pg->nnz = P_full->p[P_full->n];
|
|
270
|
+
Pg->descr = 0;
|
|
271
|
+
cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
|
|
272
|
+
cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
|
|
273
|
+
cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
|
|
274
|
+
|
|
275
|
+
cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
|
|
276
|
+
cudaMemcpyHostToDevice);
|
|
277
|
+
cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
|
|
278
|
+
cudaMemcpyHostToDevice);
|
|
279
|
+
cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
|
|
280
|
+
cudaMemcpyHostToDevice);
|
|
281
|
+
|
|
282
|
+
cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
|
|
283
|
+
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
|
284
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
|
285
|
+
|
|
286
|
+
SCS(cs_spfree)(P_full);
|
|
287
|
+
} else {
|
|
288
|
+
Pg = SCS_NULL;
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
p->Ag = Ag;
|
|
292
|
+
p->Pg = Pg;
|
|
293
|
+
p->Agt = SCS_NULL;
|
|
294
|
+
|
|
295
|
+
/* we initialize with tmp_m but always overwrite it so it doesn't matter */
|
|
296
|
+
cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
|
297
|
+
cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
|
298
|
+
cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
|
|
299
|
+
|
|
300
|
+
set_preconditioner(p, rho_y_vec);
|
|
159
301
|
|
|
160
302
|
#if GPU_TRANSPOSE_MAT > 0
|
|
161
303
|
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
|
162
304
|
p->Agt->n = A->m;
|
|
163
305
|
p->Agt->m = A->n;
|
|
164
|
-
p->Agt->
|
|
306
|
+
p->Agt->nnz = A->p[A->n];
|
|
165
307
|
p->Agt->descr = 0;
|
|
166
308
|
/* Matrix description */
|
|
167
|
-
cusparseCreateMatDescr(&p->Agt->descr);
|
|
168
|
-
cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
|
169
|
-
cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
|
|
170
309
|
|
|
171
310
|
cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
|
|
172
311
|
cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
|
|
173
312
|
cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
|
|
174
313
|
/* transpose Ag into Agt for faster multiplies */
|
|
175
314
|
/* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
315
|
+
cusparseCsr2cscEx2_bufferSize(
|
|
316
|
+
p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
|
|
317
|
+
p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
|
|
318
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);
|
|
319
|
+
|
|
320
|
+
if (new_buffer_size > p->buffer_size) {
|
|
321
|
+
if (p->buffer != SCS_NULL) {
|
|
322
|
+
cudaFree(p->buffer);
|
|
323
|
+
}
|
|
324
|
+
cudaMalloc(&p->buffer, new_buffer_size);
|
|
325
|
+
p->buffer_size = new_buffer_size;
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
|
|
329
|
+
Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
|
|
330
|
+
CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
|
|
331
|
+
SCS_CSR2CSC_ALG, p->buffer);
|
|
332
|
+
|
|
333
|
+
cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
|
|
334
|
+
p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
|
|
335
|
+
SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
|
|
336
|
+
SCS_CUDA_FLOAT);
|
|
179
337
|
#endif
|
|
180
338
|
|
|
181
339
|
err = cudaGetLastError();
|
|
182
340
|
if (err != cudaSuccess) {
|
|
183
|
-
printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
|
|
341
|
+
printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
|
|
184
342
|
cudaGetErrorString(err));
|
|
185
343
|
SCS(free_lin_sys_work)(p);
|
|
186
344
|
return SCS_NULL;
|
|
@@ -188,117 +346,173 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
188
346
|
return p;
|
|
189
347
|
}
|
|
190
348
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
|
|
196
|
-
0, M, 1, z, 1);
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
/* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
|
|
200
|
-
static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
|
|
201
|
-
ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
349
|
+
/* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
|
|
350
|
+
* b */
|
|
351
|
+
/* on GPU */
|
|
352
|
+
static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
202
353
|
scs_int max_its, scs_float tol) {
|
|
203
|
-
scs_int i, n =
|
|
204
|
-
scs_float
|
|
354
|
+
scs_int i, n = pr->n;
|
|
355
|
+
scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
|
|
205
356
|
scs_float onef = 1.0, neg_onef = -1.0;
|
|
206
357
|
scs_float *p = pr->p; /* cg direction */
|
|
207
358
|
scs_float *Gp = pr->Gp; /* updated CG direction */
|
|
208
359
|
scs_float *r = pr->r; /* cg residual */
|
|
209
360
|
scs_float *z = pr->z; /* preconditioned */
|
|
210
|
-
scs_float *M = pr->M; /* preconditioner */
|
|
211
361
|
cublasHandle_t cublas_handle = pr->cublas_handle;
|
|
212
362
|
|
|
213
|
-
if (s
|
|
363
|
+
if (!s) {
|
|
364
|
+
/* take s = 0 */
|
|
365
|
+
/* r = b */
|
|
214
366
|
cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
367
|
+
/* b = 0 */
|
|
215
368
|
cudaMemset(bg, 0, n * sizeof(scs_float));
|
|
216
369
|
} else {
|
|
217
370
|
/* p contains bg temporarily */
|
|
218
371
|
cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
219
|
-
/* bg
|
|
372
|
+
/* bg = s */
|
|
220
373
|
cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
221
|
-
|
|
374
|
+
/* r = Mat * s */
|
|
375
|
+
mat_vec(pr, bg, r);
|
|
376
|
+
/* r = Mat * s - b */
|
|
222
377
|
CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
|
|
378
|
+
/* r = b - Mat * s */
|
|
223
379
|
CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
|
|
224
380
|
}
|
|
225
381
|
|
|
226
|
-
/* for some reason nrm2 is VERY slow */
|
|
227
|
-
/* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
|
|
228
|
-
CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
|
|
229
|
-
nrm_r = SQRTF(nrm_r);
|
|
230
382
|
/* check to see if we need to run CG at all */
|
|
231
|
-
if (
|
|
383
|
+
if (cg_gpu_norm(cublas_handle, r, n) < tol) {
|
|
232
384
|
return 0;
|
|
233
385
|
}
|
|
234
386
|
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
387
|
+
/* z = M r */
|
|
388
|
+
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
389
|
+
scale_by_diag(cublas_handle, pr->M, z, n);
|
|
390
|
+
/* ztr = z'r */
|
|
391
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
|
392
|
+
/* p = z */
|
|
238
393
|
cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
239
394
|
|
|
240
395
|
for (i = 0; i < max_its; ++i) {
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
alpha =
|
|
396
|
+
/* Gp = Mat * p */
|
|
397
|
+
mat_vec(pr, p, Gp);
|
|
398
|
+
/* ptGp = p'Gp */
|
|
399
|
+
CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
|
|
400
|
+
/* alpha = z'r / p'G p */
|
|
401
|
+
alpha = ztr / ptGp;
|
|
246
402
|
neg_alpha = -alpha;
|
|
247
|
-
|
|
403
|
+
/* b += alpha * p */
|
|
248
404
|
CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
|
|
405
|
+
/* r -= alpha * G p */
|
|
249
406
|
CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
|
|
250
407
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
if (nrm_r < tol) {
|
|
256
|
-
i++;
|
|
257
|
-
break;
|
|
258
|
-
}
|
|
259
|
-
ipzr_old = ipzr;
|
|
260
|
-
apply_pre_conditioner(cublas_handle, M, z, r, n);
|
|
261
|
-
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
|
|
408
|
+
#if VERBOSITY > 3
|
|
409
|
+
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
|
|
410
|
+
cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
|
|
411
|
+
#endif
|
|
262
412
|
|
|
263
|
-
|
|
413
|
+
if (cg_gpu_norm(cublas_handle, r, n) < tol) {
|
|
414
|
+
return i + 1;
|
|
415
|
+
}
|
|
416
|
+
/* z = M r */
|
|
417
|
+
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
418
|
+
scale_by_diag(cublas_handle, pr->M, z, n);
|
|
419
|
+
ztr_prev = ztr;
|
|
420
|
+
/* ztr = z'r */
|
|
421
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
|
422
|
+
beta = ztr / ztr_prev;
|
|
423
|
+
/* p = beta * p, where beta = ztr / ztr_prev */
|
|
264
424
|
CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
|
|
425
|
+
/* p = z + beta * p */
|
|
265
426
|
CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
|
|
266
427
|
}
|
|
267
|
-
#if EXTRA_VERBOSE > 0
|
|
268
|
-
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
|
|
269
|
-
#endif
|
|
270
428
|
return i;
|
|
271
429
|
}
|
|
272
430
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
431
|
+
/* solves Mx = b, for x but stores result in b */
|
|
432
|
+
/* s contains warm-start (if available) */
|
|
433
|
+
/*
|
|
434
|
+
* [x] = [rho_x I + P A' ]^{-1} [rx]
|
|
435
|
+
* [y] [ A -R_y ] [ry]
|
|
436
|
+
*
|
|
437
|
+
* R_y = diag(rho_y_vec)
|
|
438
|
+
*
|
|
439
|
+
* becomes:
|
|
440
|
+
*
|
|
441
|
+
* x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
|
|
442
|
+
* y = R_y^{-1} (Ax - ry)
|
|
443
|
+
*
|
|
444
|
+
*/
|
|
445
|
+
scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
446
|
+
scs_float tol) {
|
|
447
|
+
scs_int cg_its, max_iters;
|
|
279
448
|
scs_float neg_onef = -1.0;
|
|
449
|
+
|
|
450
|
+
/* these are on GPU */
|
|
451
|
+
scs_float *bg = p->bg;
|
|
452
|
+
scs_float *tmp_m = p->tmp_m;
|
|
280
453
|
ScsGpuMatrix *Ag = p->Ag;
|
|
281
|
-
|
|
282
|
-
SCS(norm)(b, Ag->n) *
|
|
283
|
-
(iter < 0 ? CG_BEST_TOL
|
|
284
|
-
: CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
|
|
285
|
-
SCS(tic)(&linsys_timer);
|
|
286
|
-
/* all on GPU */
|
|
287
|
-
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
288
|
-
SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
|
|
289
|
-
/* solves (I+A'A)x = b, s warm start, solution stored in b */
|
|
290
|
-
cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
|
|
291
|
-
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
|
|
292
|
-
SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
|
|
293
|
-
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
454
|
+
ScsGpuMatrix *Pg = p->Pg;
|
|
294
455
|
|
|
295
|
-
if (
|
|
296
|
-
p->
|
|
456
|
+
if (CG_NORM(b, p->n + p->m) <= 1e-12) {
|
|
457
|
+
memset(b, 0, (p->n + p->m) * sizeof(scs_float));
|
|
458
|
+
return 0;
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
if (tol <= 0.) {
|
|
462
|
+
scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
|
|
463
|
+
"INDIRECT flag.\n",
|
|
464
|
+
tol);
|
|
297
465
|
}
|
|
298
466
|
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
467
|
+
/* bg = b = [rx; ry] */
|
|
468
|
+
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
|
|
469
|
+
cudaMemcpyHostToDevice);
|
|
470
|
+
/* tmp = ry */
|
|
471
|
+
cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
|
|
472
|
+
cudaMemcpyDeviceToDevice);
|
|
473
|
+
/* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
|
|
474
|
+
scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
|
|
475
|
+
|
|
476
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
|
|
477
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
|
|
478
|
+
/* bg[:n] = rx + A' R_y^{-1} ry */
|
|
479
|
+
SCS(accum_by_atrans_gpu)
|
|
480
|
+
(Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
|
|
481
|
+
&p->buffer);
|
|
482
|
+
|
|
483
|
+
/* set max_iters to 10 * n (though in theory n is enough for any tol) */
|
|
484
|
+
max_iters = 10 * Ag->n;
|
|
485
|
+
|
|
486
|
+
/* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
|
|
487
|
+
* in bg */
|
|
488
|
+
cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
|
|
489
|
+
|
|
490
|
+
/* bg[n:] = -ry */
|
|
491
|
+
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
|
|
492
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
|
|
493
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* x */
|
|
494
|
+
|
|
495
|
+
/* bg[n:] = Ax - ry */
|
|
496
|
+
#if GPU_TRANSPOSE_MAT > 0
|
|
497
|
+
SCS(accum_by_atrans_gpu)
|
|
498
|
+
(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
499
|
+
&p->buffer);
|
|
500
|
+
#else
|
|
501
|
+
SCS(accum_by_a_gpu)
|
|
502
|
+
(Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
503
|
+
&p->buffer);
|
|
504
|
+
#endif
|
|
505
|
+
|
|
506
|
+
/* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
|
|
507
|
+
scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
|
|
508
|
+
|
|
509
|
+
/* copy bg = [x; y] back to b */
|
|
510
|
+
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
|
|
511
|
+
cudaMemcpyDeviceToHost);
|
|
512
|
+
p->tot_cg_its += cg_its;
|
|
513
|
+
#if VERBOSITY > 1
|
|
514
|
+
scs_printf("tol %.3e\n", tol);
|
|
515
|
+
scs_printf("cg_its %i\n", (int)cg_its);
|
|
302
516
|
#endif
|
|
303
517
|
return 0;
|
|
304
518
|
}
|