scs 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +11 -6
- data/lib/scs/ffi.rb +30 -13
- data/lib/scs/solver.rb +32 -9
- data/lib/scs/version.rb +1 -1
- data/vendor/scs/CITATION.cff +39 -0
- data/vendor/scs/CMakeLists.txt +7 -8
- data/vendor/scs/Makefile +24 -15
- data/vendor/scs/README.md +5 -263
- data/vendor/scs/include/aa.h +67 -23
- data/vendor/scs/include/cones.h +17 -17
- data/vendor/scs/include/glbopts.h +98 -32
- data/vendor/scs/include/linalg.h +2 -4
- data/vendor/scs/include/linsys.h +58 -44
- data/vendor/scs/include/normalize.h +3 -3
- data/vendor/scs/include/rw.h +8 -2
- data/vendor/scs/include/scs.h +293 -133
- data/vendor/scs/include/util.h +3 -15
- data/vendor/scs/linsys/cpu/direct/private.c +220 -224
- data/vendor/scs/linsys/cpu/direct/private.h +13 -7
- data/vendor/scs/linsys/cpu/direct/private.o +0 -0
- data/vendor/scs/linsys/cpu/indirect/private.c +177 -110
- data/vendor/scs/linsys/cpu/indirect/private.h +8 -4
- data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
- data/vendor/scs/linsys/csparse.c +87 -0
- data/vendor/scs/linsys/csparse.h +34 -0
- data/vendor/scs/linsys/csparse.o +0 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +1 -1
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
- data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
- data/vendor/scs/linsys/external/qdldl/changes +2 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
- data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
- data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
- data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
- data/vendor/scs/linsys/gpu/gpu.c +31 -33
- data/vendor/scs/linsys/gpu/gpu.h +48 -31
- data/vendor/scs/linsys/gpu/indirect/private.c +338 -232
- data/vendor/scs/linsys/gpu/indirect/private.h +23 -14
- data/vendor/scs/linsys/scs_matrix.c +498 -0
- data/vendor/scs/linsys/scs_matrix.h +70 -0
- data/vendor/scs/linsys/scs_matrix.o +0 -0
- data/vendor/scs/scs.mk +13 -9
- data/vendor/scs/src/aa.c +384 -109
- data/vendor/scs/src/aa.o +0 -0
- data/vendor/scs/src/cones.c +440 -353
- data/vendor/scs/src/cones.o +0 -0
- data/vendor/scs/src/ctrlc.c +15 -5
- data/vendor/scs/src/ctrlc.o +0 -0
- data/vendor/scs/src/linalg.c +84 -28
- data/vendor/scs/src/linalg.o +0 -0
- data/vendor/scs/src/normalize.c +22 -64
- data/vendor/scs/src/normalize.o +0 -0
- data/vendor/scs/src/rw.c +160 -21
- data/vendor/scs/src/rw.o +0 -0
- data/vendor/scs/src/scs.c +767 -563
- data/vendor/scs/src/scs.o +0 -0
- data/vendor/scs/src/scs_indir.o +0 -0
- data/vendor/scs/src/scs_version.c +9 -3
- data/vendor/scs/src/scs_version.o +0 -0
- data/vendor/scs/src/util.c +37 -106
- data/vendor/scs/src/util.o +0 -0
- data/vendor/scs/test/minunit.h +17 -8
- data/vendor/scs/test/problem_utils.h +176 -14
- data/vendor/scs/test/problems/degenerate.h +130 -0
- data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
- data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
- data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
- data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
- data/vendor/scs/test/problems/random_prob +0 -0
- data/vendor/scs/test/problems/random_prob.h +45 -0
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
- data/vendor/scs/test/problems/small_lp.h +13 -14
- data/vendor/scs/test/problems/test_fails.h +43 -0
- data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
- data/vendor/scs/test/random_socp_prob.c +54 -53
- data/vendor/scs/test/rng.h +109 -0
- data/vendor/scs/test/run_from_file.c +19 -10
- data/vendor/scs/test/run_tests.c +27 -3
- metadata +20 -8
- data/vendor/scs/linsys/amatrix.c +0 -305
- data/vendor/scs/linsys/amatrix.h +0 -36
- data/vendor/scs/linsys/amatrix.o +0 -0
- data/vendor/scs/test/data/small_random_socp +0 -0
- data/vendor/scs/test/problems/small_random_socp.h +0 -33
- data/vendor/scs/test/run_tests +0 -2
|
@@ -1,71 +1,89 @@
|
|
|
1
1
|
#include "private.h"
|
|
2
|
+
#include "linsys.h"
|
|
2
3
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
/* do not use within pcg, reuses memory */
|
|
7
|
-
void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
|
|
8
|
-
const scs_float *x, scs_float *y) {
|
|
9
|
-
scs_float *v_m = p->tmp_m;
|
|
10
|
-
scs_float *v_n = p->r;
|
|
11
|
-
cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
12
|
-
cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
13
|
-
|
|
14
|
-
cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
|
|
15
|
-
cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
|
|
16
|
-
SCS(_accum_by_atrans_gpu)(
|
|
17
|
-
p->Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
|
|
18
|
-
&p->buffer_size, &p->buffer
|
|
19
|
-
);
|
|
20
|
-
|
|
21
|
-
cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
/* do not use within pcg, reuses memory */
|
|
25
|
-
void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
|
|
26
|
-
scs_float *y) {
|
|
27
|
-
scs_float *v_m = p->tmp_m;
|
|
28
|
-
scs_float *v_n = p->r;
|
|
29
|
-
cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
30
|
-
cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
4
|
+
/* norm to use when deciding convergence */
|
|
5
|
+
/* should be consistent with CG_NORM in glbopts.h */
|
|
6
|
+
#define USE_L2_NORM (0)
|
|
31
7
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
#if
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
&p->buffer_size, &p->buffer
|
|
38
|
-
);
|
|
8
|
+
static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
|
|
9
|
+
scs_int n) {
|
|
10
|
+
#if USE_L2_NORM > 0
|
|
11
|
+
scs_float nrm;
|
|
12
|
+
CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
|
|
39
13
|
#else
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
14
|
+
scs_int idx;
|
|
15
|
+
scs_float nrm;
|
|
16
|
+
CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
|
|
17
|
+
/* NOTE: we use idx - 1 here since the routine above returns 1-based (Fortran-style) indices */
|
|
18
|
+
cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
19
|
+
nrm = ABS(nrm);
|
|
44
20
|
#endif
|
|
45
|
-
|
|
46
|
-
cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
21
|
+
return nrm;
|
|
47
22
|
}
|
|
48
23
|
|
|
49
|
-
char *SCS(get_lin_sys_method)(
|
|
50
|
-
|
|
51
|
-
sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
|
|
52
|
-
(long)A->p[A->n], stgs->cg_rate);
|
|
53
|
-
return str;
|
|
24
|
+
const char *SCS(get_lin_sys_method)() {
|
|
25
|
+
return "sparse-indirect GPU";
|
|
54
26
|
}
|
|
55
27
|
|
|
28
|
+
/*
|
|
56
29
|
char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
|
|
57
30
|
char *str = (char *)scs_malloc(sizeof(char) * 128);
|
|
58
|
-
sprintf(str,
|
|
59
|
-
|
|
60
|
-
(scs_float)p->tot_cg_its / (info->iter + 1),
|
|
61
|
-
p->total_solve_time / (info->iter + 1) / 1e3);
|
|
31
|
+
sprintf(str, "lin-sys: avg cg its: %2.2f\n",
|
|
32
|
+
(scs_float)p->tot_cg_its / (info->iter + 1));
|
|
62
33
|
p->tot_cg_its = 0;
|
|
63
|
-
p->total_solve_time = 0;
|
|
64
34
|
return str;
|
|
65
35
|
}
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
/* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
|
|
39
|
+
static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
|
|
40
|
+
scs_int i, k;
|
|
41
|
+
const ScsMatrix *A = p->A;
|
|
42
|
+
const ScsMatrix *P = p->P;
|
|
43
|
+
scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
|
|
44
|
+
|
|
45
|
+
#if VERBOSITY > 0
|
|
46
|
+
scs_printf("getting pre-conditioner\n");
|
|
47
|
+
#endif
|
|
48
|
+
|
|
49
|
+
for (i = 0; i < A->n; ++i) { /* cols */
|
|
50
|
+
M[i] = p->rho_x;
|
|
51
|
+
/* diag(A' R_y^{-1} A) */
|
|
52
|
+
for (k = A->p[i]; k < A->p[i + 1]; ++k) {
|
|
53
|
+
/* A->i[k] is row of entry k with value A->x[k] */
|
|
54
|
+
M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
|
|
55
|
+
}
|
|
56
|
+
if (P) {
|
|
57
|
+
for (k = P->p[i]; k < P->p[i + 1]; k++) {
|
|
58
|
+
/* diagonal element only */
|
|
59
|
+
if (P->i[k] == i) { /* row == col */
|
|
60
|
+
M[i] += P->x[k];
|
|
61
|
+
break;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
M[i] = 1. / M[i];
|
|
66
|
+
}
|
|
67
|
+
cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
68
|
+
scs_free(M);
|
|
69
|
+
#if VERBOSITY > 0
|
|
70
|
+
scs_printf("finished getting pre-conditioner\n");
|
|
71
|
+
#endif
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
/* refresh the host and GPU copies of R_y^{-1}, then rebuild the preconditioner */
|
|
75
|
+
void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
|
|
76
|
+
scs_int i;
|
|
77
|
+
for (i = 0; i < p->m; ++i)
|
|
78
|
+
p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
|
|
79
|
+
cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
|
|
80
|
+
cudaMemcpyHostToDevice);
|
|
81
|
+
set_preconditioner(p, rho_y_vec);
|
|
82
|
+
}
|
|
66
83
|
|
|
67
84
|
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
68
85
|
if (p) {
|
|
86
|
+
scs_free(p->inv_rho_y_vec);
|
|
69
87
|
cudaFree(p->p);
|
|
70
88
|
cudaFree(p->r);
|
|
71
89
|
cudaFree(p->Gp);
|
|
@@ -73,6 +91,11 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
|
73
91
|
cudaFree(p->tmp_m);
|
|
74
92
|
cudaFree(p->z);
|
|
75
93
|
cudaFree(p->M);
|
|
94
|
+
cudaFree(p->inv_rho_y_vec_gpu);
|
|
95
|
+
if (p->Pg) {
|
|
96
|
+
SCS(free_gpu_matrix)(p->Pg);
|
|
97
|
+
scs_free(p->Pg);
|
|
98
|
+
}
|
|
76
99
|
if (p->Ag) {
|
|
77
100
|
SCS(free_gpu_matrix)(p->Ag);
|
|
78
101
|
scs_free(p->Ag);
|
|
@@ -86,6 +109,7 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
|
86
109
|
}
|
|
87
110
|
cusparseDestroyDnVec(p->dn_vec_m);
|
|
88
111
|
cusparseDestroyDnVec(p->dn_vec_n);
|
|
112
|
+
cusparseDestroyDnVec(p->dn_vec_n_p);
|
|
89
113
|
cusparseDestroy(p->cusparse_handle);
|
|
90
114
|
cublasDestroy(p->cublas_handle);
|
|
91
115
|
/* Don't reset because it interferes with other GPU programs. */
|
|
@@ -94,80 +118,105 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
|
94
118
|
}
|
|
95
119
|
}
|
|
96
120
|
|
|
97
|
-
/*
|
|
98
|
-
static void
|
|
99
|
-
|
|
121
|
+
/* z = M * z elementwise in place, assumes M, z on GPU */
|
|
122
|
+
static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
|
|
123
|
+
scs_float *z, scs_int n) {
|
|
124
|
+
CUBLAS(tbmv)
|
|
125
|
+
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
|
|
126
|
+
0, M, 1, z, 1);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
/* y = (rho_x * I + P + A' R_y^{-1} A) x */
|
|
130
|
+
static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
|
100
131
|
/* x and y MUST already be loaded to GPU */
|
|
101
|
-
scs_float *
|
|
102
|
-
cudaMemset(
|
|
132
|
+
scs_float *z = p->tmp_m; /* temp memory */
|
|
133
|
+
cudaMemset(y, 0, p->n * sizeof(scs_float));
|
|
134
|
+
cudaMemset(z, 0, p->m * sizeof(scs_float));
|
|
135
|
+
|
|
136
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
|
|
137
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
|
|
138
|
+
cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
|
|
139
|
+
|
|
140
|
+
/* y = rho_x * x */
|
|
141
|
+
CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
|
|
142
|
+
|
|
143
|
+
if (p->Pg) {
|
|
144
|
+
/* y = rho_x * x + Px */
|
|
145
|
+
SCS(accum_by_p_gpu)
|
|
146
|
+
(p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
|
147
|
+
&p->buffer);
|
|
148
|
+
}
|
|
103
149
|
|
|
104
|
-
|
|
105
|
-
cusparseDnVecSetValues(p->dn_vec_n, (void *) x);
|
|
150
|
+
/* z = Ax */
|
|
106
151
|
#if GPU_TRANSPOSE_MAT > 0
|
|
107
|
-
SCS(
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
);
|
|
152
|
+
SCS(accum_by_atrans_gpu)
|
|
153
|
+
(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
154
|
+
&p->buffer);
|
|
111
155
|
#else
|
|
112
|
-
SCS(
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
);
|
|
156
|
+
SCS(accum_by_a_gpu)
|
|
157
|
+
(p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
158
|
+
&p->buffer);
|
|
116
159
|
#endif
|
|
160
|
+
/* z = R_y^{-1} A x */
|
|
161
|
+
scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
|
|
117
162
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
SCS(_accum_by_atrans_gpu)(
|
|
123
|
-
A, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
|
|
124
|
-
&p->buffer_size, &p->buffer
|
|
125
|
-
);
|
|
126
|
-
|
|
127
|
-
CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
|
|
163
|
+
/* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
|
|
164
|
+
SCS(accum_by_atrans_gpu)
|
|
165
|
+
(p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
|
166
|
+
&p->buffer);
|
|
128
167
|
}
|
|
129
168
|
|
|
130
|
-
/*
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
169
|
+
/* P comes in upper triangular, expand to full
|
|
170
|
+
* First compute triplet version of full matrix, then compress to csc
|
|
171
|
+
* */
|
|
172
|
+
static csc *fill_p_matrix(const ScsMatrix *P) {
|
|
173
|
+
scs_int i, j, k, kk;
|
|
174
|
+
scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
|
|
175
|
+
csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
|
|
176
|
+
csc *P_full;
|
|
177
|
+
kk = 0;
|
|
178
|
+
for (j = 0; j < P->n; j++) { /* cols */
|
|
179
|
+
for (k = P->p[j]; k < P->p[j + 1]; k++) {
|
|
180
|
+
i = P->i[k]; /* row */
|
|
181
|
+
if (i > j) { /* only upper triangular needed */
|
|
182
|
+
break;
|
|
183
|
+
}
|
|
184
|
+
P_tmp->i[kk] = i;
|
|
185
|
+
P_tmp->p[kk] = j;
|
|
186
|
+
P_tmp->x[kk] = P->x[k];
|
|
187
|
+
kk++;
|
|
188
|
+
if (i == j) { /* diagonal */
|
|
189
|
+
continue;
|
|
190
|
+
}
|
|
191
|
+
P_tmp->i[kk] = j;
|
|
192
|
+
P_tmp->p[kk] = i;
|
|
193
|
+
P_tmp->x[kk] = P->x[k];
|
|
194
|
+
kk++;
|
|
195
|
+
}
|
|
144
196
|
}
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
scs_printf("finished getting pre-conditioner\n");
|
|
150
|
-
#endif
|
|
197
|
+
P_tmp->nz = kk; /* set number of nonzeros */
|
|
198
|
+
P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
|
|
199
|
+
SCS(cs_spfree)(P_tmp);
|
|
200
|
+
return P_full;
|
|
151
201
|
}
|
|
152
202
|
|
|
153
|
-
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
154
|
-
|
|
203
|
+
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
204
|
+
scs_float *rho_y_vec, scs_float rho_x) {
|
|
155
205
|
cudaError_t err;
|
|
206
|
+
scs_int i;
|
|
207
|
+
csc *P_full;
|
|
156
208
|
ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
|
157
|
-
ScsGpuMatrix *Ag = (ScsGpuMatrix *)
|
|
158
|
-
|
|
159
|
-
/* Used for initializing dense vectors */
|
|
160
|
-
scs_float *tmp_null_n = SCS_NULL;
|
|
161
|
-
scs_float *tmp_null_m = SCS_NULL;
|
|
209
|
+
ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
|
210
|
+
ScsGpuMatrix *Pg = SCS_NULL;
|
|
162
211
|
|
|
163
212
|
#if GPU_TRANSPOSE_MAT > 0
|
|
164
213
|
size_t new_buffer_size = 0;
|
|
165
214
|
#endif
|
|
166
215
|
|
|
216
|
+
p->rho_x = rho_x;
|
|
167
217
|
p->cublas_handle = 0;
|
|
168
218
|
p->cusparse_handle = 0;
|
|
169
219
|
|
|
170
|
-
p->total_solve_time = 0;
|
|
171
220
|
p->tot_cg_its = 0;
|
|
172
221
|
|
|
173
222
|
p->buffer_size = 0;
|
|
@@ -181,13 +230,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
181
230
|
|
|
182
231
|
Ag->n = A->n;
|
|
183
232
|
Ag->m = A->m;
|
|
184
|
-
Ag->
|
|
233
|
+
Ag->nnz = A->p[A->n];
|
|
185
234
|
Ag->descr = 0;
|
|
186
|
-
/* Matrix description */
|
|
187
|
-
|
|
188
|
-
p->Ag = Ag;
|
|
189
|
-
p->Agt = SCS_NULL;
|
|
190
|
-
|
|
191
235
|
cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
|
|
192
236
|
cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
|
|
193
237
|
cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
|
|
@@ -196,10 +240,10 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
196
240
|
cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
|
|
197
241
|
cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
|
|
198
242
|
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
|
|
199
|
-
cudaMalloc((void **)&p->tmp_m,
|
|
200
|
-
A->m * sizeof(scs_float)); /* intermediate result */
|
|
243
|
+
cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
|
|
201
244
|
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
|
|
202
245
|
cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
|
|
246
|
+
cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
|
|
203
247
|
|
|
204
248
|
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
|
|
205
249
|
cudaMemcpyHostToDevice);
|
|
@@ -207,25 +251,59 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
207
251
|
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
|
|
208
252
|
cudaMemcpyHostToDevice);
|
|
209
253
|
|
|
210
|
-
|
|
211
|
-
(
|
|
212
|
-
|
|
213
|
-
|
|
254
|
+
p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
|
|
255
|
+
for (i = 0; i < A->m; ++i)
|
|
256
|
+
p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
|
|
257
|
+
cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
|
|
258
|
+
cudaMemcpyHostToDevice);
|
|
214
259
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
260
|
+
cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
|
|
261
|
+
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
|
262
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
|
263
|
+
|
|
264
|
+
if (P) {
|
|
265
|
+
Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
|
266
|
+
P_full = fill_p_matrix(P);
|
|
267
|
+
Pg->n = P_full->n;
|
|
268
|
+
Pg->m = P_full->m;
|
|
269
|
+
Pg->nnz = P_full->p[P_full->n];
|
|
270
|
+
Pg->descr = 0;
|
|
271
|
+
cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
|
|
272
|
+
cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
|
|
273
|
+
cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
|
|
274
|
+
|
|
275
|
+
cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
|
|
276
|
+
cudaMemcpyHostToDevice);
|
|
277
|
+
cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
|
|
278
|
+
cudaMemcpyHostToDevice);
|
|
279
|
+
cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
|
|
280
|
+
cudaMemcpyHostToDevice);
|
|
281
|
+
|
|
282
|
+
cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
|
|
283
|
+
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
|
284
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
|
285
|
+
|
|
286
|
+
SCS(cs_spfree)(P_full);
|
|
287
|
+
} else {
|
|
288
|
+
Pg = SCS_NULL;
|
|
289
|
+
}
|
|
221
290
|
|
|
222
|
-
|
|
291
|
+
p->Ag = Ag;
|
|
292
|
+
p->Pg = Pg;
|
|
293
|
+
p->Agt = SCS_NULL;
|
|
294
|
+
|
|
295
|
+
/* we initialize with tmp_m but always overwrite it so it doesn't matter */
|
|
296
|
+
cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
|
297
|
+
cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
|
298
|
+
cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
|
|
299
|
+
|
|
300
|
+
set_preconditioner(p, rho_y_vec);
|
|
223
301
|
|
|
224
302
|
#if GPU_TRANSPOSE_MAT > 0
|
|
225
303
|
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
|
226
304
|
p->Agt->n = A->m;
|
|
227
305
|
p->Agt->m = A->n;
|
|
228
|
-
p->Agt->
|
|
306
|
+
p->Agt->nnz = A->p[A->n];
|
|
229
307
|
p->Agt->descr = 0;
|
|
230
308
|
/* Matrix description */
|
|
231
309
|
|
|
@@ -234,13 +312,10 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
234
312
|
cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
|
|
235
313
|
/* transpose Ag into Agt for faster multiplies */
|
|
236
314
|
/* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
|
|
237
|
-
cusparseCsr2cscEx2_bufferSize
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
|
|
242
|
-
CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
|
|
243
|
-
&new_buffer_size);
|
|
315
|
+
cusparseCsr2cscEx2_bufferSize(
|
|
316
|
+
p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
|
|
317
|
+
p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
|
|
318
|
+
CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);
|
|
244
319
|
|
|
245
320
|
if (new_buffer_size > p->buffer_size) {
|
|
246
321
|
if (p->buffer != SCS_NULL) {
|
|
@@ -250,24 +325,20 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
250
325
|
p->buffer_size = new_buffer_size;
|
|
251
326
|
}
|
|
252
327
|
|
|
253
|
-
cusparseCsr2cscEx2
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->Annz,
|
|
263
|
-
p->Agt->p, p->Agt->i, p->Agt->x,
|
|
264
|
-
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
|
265
|
-
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
|
328
|
+
cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
|
|
329
|
+
Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
|
|
330
|
+
CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
|
|
331
|
+
SCS_CSR2CSC_ALG, p->buffer);
|
|
332
|
+
|
|
333
|
+
cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
|
|
334
|
+
p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
|
|
335
|
+
SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
|
|
336
|
+
SCS_CUDA_FLOAT);
|
|
266
337
|
#endif
|
|
267
338
|
|
|
268
339
|
err = cudaGetLastError();
|
|
269
340
|
if (err != cudaSuccess) {
|
|
270
|
-
printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
|
|
341
|
+
printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
|
|
271
342
|
cudaGetErrorString(err));
|
|
272
343
|
SCS(free_lin_sys_work)(p);
|
|
273
344
|
return SCS_NULL;
|
|
@@ -275,138 +346,173 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
|
275
346
|
return p;
|
|
276
347
|
}
|
|
277
348
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
|
|
283
|
-
0, M, 1, z, 1);
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
/* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
|
|
287
|
-
static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
|
|
288
|
-
ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
349
|
+
/* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
|
|
350
|
+
* b */
|
|
351
|
+
/* on GPU */
|
|
352
|
+
static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
289
353
|
scs_int max_its, scs_float tol) {
|
|
290
|
-
scs_int i, n =
|
|
291
|
-
scs_float
|
|
354
|
+
scs_int i, n = pr->n;
|
|
355
|
+
scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
|
|
292
356
|
scs_float onef = 1.0, neg_onef = -1.0;
|
|
293
357
|
scs_float *p = pr->p; /* cg direction */
|
|
294
358
|
scs_float *Gp = pr->Gp; /* updated CG direction */
|
|
295
359
|
scs_float *r = pr->r; /* cg residual */
|
|
296
360
|
scs_float *z = pr->z; /* preconditioned */
|
|
297
|
-
scs_float *M = pr->M; /* preconditioner */
|
|
298
361
|
cublasHandle_t cublas_handle = pr->cublas_handle;
|
|
299
362
|
|
|
300
|
-
if (s
|
|
363
|
+
if (!s) {
|
|
364
|
+
/* take s = 0 */
|
|
365
|
+
/* r = b */
|
|
301
366
|
cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
367
|
+
/* b = 0 */
|
|
302
368
|
cudaMemset(bg, 0, n * sizeof(scs_float));
|
|
303
369
|
} else {
|
|
304
370
|
/* p contains bg temporarily */
|
|
305
371
|
cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
306
|
-
/* bg
|
|
372
|
+
/* bg = s */
|
|
307
373
|
cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
308
|
-
|
|
374
|
+
/* r = Mat * s */
|
|
375
|
+
mat_vec(pr, bg, r);
|
|
376
|
+
/* r = Mat * s - b */
|
|
309
377
|
CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
|
|
378
|
+
/* r = b - Mat * s */
|
|
310
379
|
CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
|
|
311
380
|
}
|
|
312
381
|
|
|
313
|
-
/* for some reason nrm2 is VERY slow */
|
|
314
|
-
/* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
|
|
315
|
-
CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
|
|
316
|
-
nrm_r = SQRTF(nrm_r);
|
|
317
382
|
/* check to see if we need to run CG at all */
|
|
318
|
-
if (
|
|
383
|
+
if (cg_gpu_norm(cublas_handle, r, n) < tol) {
|
|
319
384
|
return 0;
|
|
320
385
|
}
|
|
321
386
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
387
|
+
/* z = M r */
|
|
388
|
+
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
389
|
+
scale_by_diag(cublas_handle, pr->M, z, n);
|
|
390
|
+
/* ztr = z'r */
|
|
391
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
|
392
|
+
/* p = z */
|
|
325
393
|
cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
326
394
|
|
|
327
395
|
for (i = 0; i < max_its; ++i) {
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
alpha =
|
|
396
|
+
/* Gp = Mat * p */
|
|
397
|
+
mat_vec(pr, p, Gp);
|
|
398
|
+
/* ptGp = p'Gp */
|
|
399
|
+
CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
|
|
400
|
+
/* alpha = z'r / p'G p */
|
|
401
|
+
alpha = ztr / ptGp;
|
|
333
402
|
neg_alpha = -alpha;
|
|
334
|
-
|
|
403
|
+
/* b += alpha * p */
|
|
335
404
|
CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
|
|
405
|
+
/* r -= alpha * G p */
|
|
336
406
|
CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
|
|
337
407
|
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
if (nrm_r < tol) {
|
|
343
|
-
i++;
|
|
344
|
-
break;
|
|
345
|
-
}
|
|
346
|
-
ipzr_old = ipzr;
|
|
347
|
-
apply_pre_conditioner(cublas_handle, M, z, r, n);
|
|
348
|
-
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
|
|
408
|
+
#if VERBOSITY > 3
|
|
409
|
+
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
|
|
410
|
+
cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
|
|
411
|
+
#endif
|
|
349
412
|
|
|
350
|
-
|
|
413
|
+
if (cg_gpu_norm(cublas_handle, r, n) < tol) {
|
|
414
|
+
return i + 1;
|
|
415
|
+
}
|
|
416
|
+
/* z = M r */
|
|
417
|
+
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
418
|
+
scale_by_diag(cublas_handle, pr->M, z, n);
|
|
419
|
+
ztr_prev = ztr;
|
|
420
|
+
/* ztr = z'r */
|
|
421
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
|
422
|
+
beta = ztr / ztr_prev;
|
|
423
|
+
/* p = beta * p, where beta = ztr / ztr_prev */
|
|
351
424
|
CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
|
|
425
|
+
/* p = z + beta * p */
|
|
352
426
|
CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
|
|
353
427
|
}
|
|
354
|
-
#if EXTRA_VERBOSE > 0
|
|
355
|
-
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
|
|
356
|
-
#endif
|
|
357
428
|
return i;
|
|
358
429
|
}
|
|
359
430
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
431
|
+
/* solves Mx = b, for x but stores result in b */
|
|
432
|
+
/* s contains warm-start (if available) */
|
|
433
|
+
/*
|
|
434
|
+
* [x] = [rho_x I + P A' ]^{-1} [rx]
|
|
435
|
+
* [y] [ A -R_y ] [ry]
|
|
436
|
+
*
|
|
437
|
+
* R_y = diag(rho_y_vec)
|
|
438
|
+
*
|
|
439
|
+
* becomes:
|
|
440
|
+
*
|
|
441
|
+
* x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
|
|
442
|
+
* y = R_y^{-1} (Ax - ry)
|
|
443
|
+
*
|
|
444
|
+
*/
|
|
445
|
+
scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
446
|
+
scs_float tol) {
|
|
447
|
+
scs_int cg_its, max_iters;
|
|
366
448
|
scs_float neg_onef = -1.0;
|
|
449
|
+
|
|
450
|
+
/* these are on GPU */
|
|
451
|
+
scs_float *bg = p->bg;
|
|
452
|
+
scs_float *tmp_m = p->tmp_m;
|
|
367
453
|
ScsGpuMatrix *Ag = p->Ag;
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
454
|
+
ScsGpuMatrix *Pg = p->Pg;
|
|
455
|
+
|
|
456
|
+
if (CG_NORM(b, p->n + p->m) <= 1e-12) {
|
|
457
|
+
memset(b, 0, (p->n + p->m) * sizeof(scs_float));
|
|
458
|
+
return 0;
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
if (tol <= 0.) {
|
|
462
|
+
scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
|
|
463
|
+
"INDIRECT flag.\n",
|
|
464
|
+
tol);
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
/* bg = b = [rx; ry] */
|
|
468
|
+
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
|
|
469
|
+
cudaMemcpyHostToDevice);
|
|
470
|
+
/* tmp = ry */
|
|
471
|
+
cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
|
|
472
|
+
cudaMemcpyDeviceToDevice);
|
|
473
|
+
/* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
|
|
474
|
+
scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
|
|
475
|
+
|
|
476
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
|
|
477
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
|
|
478
|
+
/* bg[:n] = rx + A' R_y^{-1} ry */
|
|
479
|
+
SCS(accum_by_atrans_gpu)
|
|
480
|
+
(Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
|
|
481
|
+
&p->buffer);
|
|
482
|
+
|
|
483
|
+
/* set max_iters to 10 * n (though in theory n is enough for any tol) */
|
|
484
|
+
max_iters = 10 * Ag->n;
|
|
485
|
+
|
|
486
|
+
/* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
|
|
487
|
+
* in bg */
|
|
488
|
+
cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
|
|
489
|
+
|
|
490
|
+
/* bg[n:] = -ry */
|
|
385
491
|
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
|
|
492
|
+
cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
|
|
493
|
+
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* x */
|
|
386
494
|
|
|
387
|
-
|
|
388
|
-
cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
|
|
495
|
+
/* bg[n:] = Ax - ry */
|
|
389
496
|
#if GPU_TRANSPOSE_MAT > 0
|
|
390
|
-
SCS(
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
);
|
|
497
|
+
SCS(accum_by_atrans_gpu)
|
|
498
|
+
(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
499
|
+
&p->buffer);
|
|
394
500
|
#else
|
|
395
|
-
SCS(
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
);
|
|
501
|
+
SCS(accum_by_a_gpu)
|
|
502
|
+
(Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
|
|
503
|
+
&p->buffer);
|
|
399
504
|
#endif
|
|
400
505
|
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
if (iter >= 0) {
|
|
404
|
-
p->tot_cg_its += cg_its;
|
|
405
|
-
}
|
|
506
|
+
/* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
|
|
507
|
+
scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
|
|
406
508
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
509
|
+
/* copy bg = [x; y] back to b */
|
|
510
|
+
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
|
|
511
|
+
cudaMemcpyDeviceToHost);
|
|
512
|
+
p->tot_cg_its += cg_its;
|
|
513
|
+
#if VERBOSITY > 1
|
|
514
|
+
scs_printf("tol %.3e\n", tol);
|
|
515
|
+
scs_printf("cg_its %i\n", (int)cg_its);
|
|
410
516
|
#endif
|
|
411
517
|
return 0;
|
|
412
518
|
}
|