scs 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +8 -8
- data/lib/scs/ffi.rb +1 -7
- data/lib/scs/version.rb +1 -1
- data/vendor/scs/CITATION.cff +1 -1
- data/vendor/scs/CMakeLists.txt +55 -7
- data/vendor/scs/Makefile +9 -9
- data/vendor/scs/README.md +2 -1
- data/vendor/scs/include/aa.h +1 -1
- data/vendor/scs/include/cones.h +14 -11
- data/vendor/scs/include/glbopts.h +26 -64
- data/vendor/scs/include/linalg.h +2 -1
- data/vendor/scs/include/linsys.h +13 -13
- data/vendor/scs/include/normalize.h +6 -5
- data/vendor/scs/include/scs.h +43 -87
- data/vendor/scs/include/scs_types.h +34 -0
- data/vendor/scs/include/scs_work.h +83 -0
- data/vendor/scs/linsys/cpu/direct/private.c +86 -73
- data/vendor/scs/linsys/cpu/direct/private.h +2 -2
- data/vendor/scs/linsys/cpu/indirect/private.c +42 -33
- data/vendor/scs/linsys/cpu/indirect/private.h +1 -2
- data/vendor/scs/linsys/csparse.c +3 -3
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +6 -6
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +6 -1
- data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
- data/vendor/scs/linsys/gpu/gpu.h +8 -11
- data/vendor/scs/linsys/gpu/indirect/private.c +72 -49
- data/vendor/scs/linsys/gpu/indirect/private.h +14 -13
- data/vendor/scs/linsys/scs_matrix.c +26 -46
- data/vendor/scs/linsys/scs_matrix.h +4 -4
- data/vendor/scs/scs.mk +1 -1
- data/vendor/scs/src/aa.c +13 -4
- data/vendor/scs/src/cones.c +143 -92
- data/vendor/scs/src/linalg.c +25 -0
- data/vendor/scs/src/normalize.c +26 -26
- data/vendor/scs/src/rw.c +48 -12
- data/vendor/scs/src/scs.c +104 -110
- data/vendor/scs/src/scs_version.c +8 -6
- data/vendor/scs/src/util.c +1 -1
- data/vendor/scs/test/minunit.h +6 -1
- data/vendor/scs/test/problem_utils.h +28 -35
- data/vendor/scs/test/problems/degenerate.h +1 -1
- data/vendor/scs/test/problems/hs21_tiny_qp.h +1 -1
- data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +1 -1
- data/vendor/scs/test/problems/infeasible_tiny_qp.h +1 -1
- data/vendor/scs/test/problems/qafiro_tiny_qp.h +3 -3
- data/vendor/scs/test/problems/random_prob.h +1 -1
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +1 -1
- data/vendor/scs/test/problems/small_lp.h +3 -1
- data/vendor/scs/test/problems/small_qp.h +352 -0
- data/vendor/scs/test/problems/{test_fails.h → test_validation.h} +3 -3
- data/vendor/scs/test/problems/unbounded_tiny_qp.h +1 -1
- data/vendor/scs/test/random_socp_prob.c +1 -1
- data/vendor/scs/test/run_from_file.c +1 -1
- data/vendor/scs/test/run_tests.c +23 -14
- metadata +8 -5
@@ -17,7 +17,8 @@ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
|
|
17
17
|
}
|
18
18
|
*/
|
19
19
|
|
20
|
-
/*
|
20
|
+
/* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
|
21
|
+
/* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
|
21
22
|
static void set_preconditioner(ScsLinSysWork *p) {
|
22
23
|
scs_int i, k;
|
23
24
|
scs_float *M = p->M;
|
@@ -28,22 +29,26 @@ static void set_preconditioner(ScsLinSysWork *p) {
|
|
28
29
|
scs_printf("getting pre-conditioner\n");
|
29
30
|
#endif
|
30
31
|
|
32
|
+
/* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
|
31
33
|
for (i = 0; i < A->n; ++i) { /* cols */
|
32
|
-
|
33
|
-
|
34
|
+
/* M_ii = (R_x)_i */
|
35
|
+
M[i] = p->diag_r[i];
|
36
|
+
/* M_ii += a_i' (R_y)^-1 a_i */
|
34
37
|
for (k = A->p[i]; k < A->p[i + 1]; ++k) {
|
35
38
|
/* A->i[k] is row of entry k with value A->x[k] */
|
36
|
-
M[i] += A->x[k] * A->x[k] / p->
|
39
|
+
M[i] += A->x[k] * A->x[k] / p->diag_r[A->n + A->i[k]];
|
37
40
|
}
|
38
41
|
if (P) {
|
39
42
|
for (k = P->p[i]; k < P->p[i + 1]; k++) {
|
40
43
|
/* diagonal element only */
|
41
44
|
if (P->i[k] == i) { /* row == col */
|
45
|
+
/* M_ii += P_ii */
|
42
46
|
M[i] += P->x[k];
|
43
47
|
break;
|
44
48
|
}
|
45
49
|
}
|
46
50
|
}
|
51
|
+
/* finally invert for pre-conditioner */
|
47
52
|
M[i] = 1. / M[i];
|
48
53
|
}
|
49
54
|
#if VERBOSITY > 0
|
@@ -111,10 +116,18 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
111
116
|
}
|
112
117
|
|
113
118
|
/* vec -> R_y^{-1} vec */
|
114
|
-
static void
|
119
|
+
static void scale_by_r_y_inv(scs_float *vec, ScsLinSysWork *p) {
|
115
120
|
scs_int i;
|
116
121
|
for (i = 0; i < p->m; ++i) {
|
117
|
-
vec[i] /= p->
|
122
|
+
vec[i] /= p->diag_r[p->n + i];
|
123
|
+
}
|
124
|
+
}
|
125
|
+
|
126
|
+
/* y += R_x * x */
|
127
|
+
static void accum_by_r_x(scs_float *y, const scs_float *x, ScsLinSysWork *p) {
|
128
|
+
scs_int i;
|
129
|
+
for (i = 0; i < p->n; ++i) {
|
130
|
+
y[i] += p->diag_r[i] * x[i];
|
118
131
|
}
|
119
132
|
}
|
120
133
|
|
@@ -123,7 +136,7 @@ static void accum_by_a(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
|
123
136
|
SCS(accum_by_atrans)(p->At, x, y);
|
124
137
|
}
|
125
138
|
|
126
|
-
/* y = (
|
139
|
+
/* y = (R_x + P + A' R_y^{-1} A) x */
|
127
140
|
static void mat_vec(const ScsMatrix *A, const ScsMatrix *P, ScsLinSysWork *p,
|
128
141
|
const scs_float *x, scs_float *y) {
|
129
142
|
scs_float *z = p->tmp;
|
@@ -133,10 +146,10 @@ static void mat_vec(const ScsMatrix *A, const ScsMatrix *P, ScsLinSysWork *p,
|
|
133
146
|
SCS(accum_by_p)(P, x, y); /* y = Px */
|
134
147
|
}
|
135
148
|
accum_by_a(p, x, z); /* z = Ax */
|
136
|
-
|
149
|
+
scale_by_r_y_inv(z, p); /* z = R_y^{-1} A x */
|
137
150
|
SCS(accum_by_atrans)(A, z, y); /* y += A'z, y = Px + A' R_y^{-1} Ax */
|
138
|
-
/* y =
|
139
|
-
|
151
|
+
/* y = R_x * x + Px + A' R_y^{-1} A * x */
|
152
|
+
accum_by_r_x(y, x, p);
|
140
153
|
}
|
141
154
|
|
142
155
|
static void apply_pre_conditioner(scs_float *z, scs_float *r, scs_int n,
|
@@ -149,36 +162,35 @@ static void apply_pre_conditioner(scs_float *z, scs_float *r, scs_int n,
|
|
149
162
|
}
|
150
163
|
|
151
164
|
/* diag_r changed: store the new pointer and recompute the preconditioner M */
|
152
|
-
void SCS(
|
153
|
-
p->
|
165
|
+
void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
|
166
|
+
p->diag_r = diag_r; /* this isn't needed but do it to be safe */
|
154
167
|
set_preconditioner(p);
|
155
168
|
}
|
156
169
|
|
157
170
|
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
158
|
-
scs_float *
|
171
|
+
const scs_float *diag_r) {
|
159
172
|
ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
160
173
|
p->A = A;
|
161
174
|
p->P = P;
|
162
175
|
p->m = A->m;
|
163
176
|
p->n = A->n;
|
164
|
-
p->rho_x = rho_x;
|
165
177
|
|
166
|
-
p->p = (scs_float *)
|
167
|
-
p->r = (scs_float *)
|
168
|
-
p->Gp = (scs_float *)
|
169
|
-
p->tmp = (scs_float *)
|
178
|
+
p->p = (scs_float *)scs_calloc((A->n), sizeof(scs_float));
|
179
|
+
p->r = (scs_float *)scs_calloc((A->n), sizeof(scs_float));
|
180
|
+
p->Gp = (scs_float *)scs_calloc((A->n), sizeof(scs_float));
|
181
|
+
p->tmp = (scs_float *)scs_calloc((A->m), sizeof(scs_float));
|
170
182
|
|
171
183
|
/* memory for A transpose */
|
172
|
-
p->At = (ScsMatrix *)
|
184
|
+
p->At = (ScsMatrix *)scs_calloc(1, sizeof(ScsMatrix));
|
173
185
|
p->At->m = A->n;
|
174
186
|
p->At->n = A->m;
|
175
|
-
p->At->i = (scs_int *)
|
176
|
-
p->At->p = (scs_int *)
|
177
|
-
p->At->x = (scs_float *)
|
187
|
+
p->At->i = (scs_int *)scs_calloc((A->p[A->n]), sizeof(scs_int));
|
188
|
+
p->At->p = (scs_int *)scs_calloc((A->m + 1), sizeof(scs_int));
|
189
|
+
p->At->x = (scs_float *)scs_calloc((A->p[A->n]), sizeof(scs_float));
|
178
190
|
transpose(A, p);
|
179
191
|
|
180
192
|
/* preconditioner memory */
|
181
|
-
p->
|
193
|
+
p->diag_r = diag_r;
|
182
194
|
p->z = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
|
183
195
|
p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
|
184
196
|
set_preconditioner(p);
|
@@ -192,8 +204,7 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
192
204
|
return p;
|
193
205
|
}
|
194
206
|
|
195
|
-
/* solves (
|
196
|
-
* b */
|
207
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution in b */
|
197
208
|
static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *b,
|
198
209
|
scs_int max_its, scs_float tol) {
|
199
210
|
scs_int i, n = pr->n;
|
@@ -268,14 +279,12 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *b,
|
|
268
279
|
/* solves Mx = b, for x but stores result in b */
|
269
280
|
/* s contains warm-start (if available) */
|
270
281
|
/*
|
271
|
-
* [x] = [
|
272
|
-
* [y] [
|
273
|
-
*
|
274
|
-
* R_y = diag(rho_y_vec)
|
282
|
+
* [x] = [R_x + P A' ]^{-1} [rx]
|
283
|
+
* [y] [ A -R_y ] [ry]
|
275
284
|
*
|
276
285
|
* becomes:
|
277
286
|
*
|
278
|
-
* x = (
|
287
|
+
* x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
|
279
288
|
* y = R_y^{-1} (Ax - ry)
|
280
289
|
*
|
281
290
|
*/
|
@@ -299,12 +308,12 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
299
308
|
/* tmp = ry */
|
300
309
|
memcpy(p->tmp, &(b[p->n]), p->m * sizeof(scs_float));
|
301
310
|
/* tmp = R_y^{-1} * ry */
|
302
|
-
|
311
|
+
scale_by_r_y_inv(p->tmp, p);
|
303
312
|
/* b[:n] = rx + A' R_y^{-1} ry */
|
304
313
|
SCS(accum_by_atrans)(p->A, p->tmp, b);
|
305
314
|
/* set max_iters to 10 * n (though in theory n is enough for any tol) */
|
306
315
|
max_iters = 10 * p->n;
|
307
|
-
/* solves (
|
316
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
|
308
317
|
* b */
|
309
318
|
cg_its = pcg(p, s, b, max_iters, tol); /* b[:n] = x */
|
310
319
|
|
@@ -313,7 +322,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
313
322
|
/* b[n:] = Ax - ry */
|
314
323
|
accum_by_a(p, b, &(b[p->n]));
|
315
324
|
/* b[n:] = R_y^{-1} (Ax - ry) = y */
|
316
|
-
|
325
|
+
scale_by_r_y_inv(&(b[p->n]), p);
|
317
326
|
p->tot_cg_its += cg_its;
|
318
327
|
#if VERBOSITY > 1
|
319
328
|
scs_printf("tol %.3e\n", tol);
|
data/vendor/scs/linsys/csparse.c
CHANGED
@@ -12,9 +12,9 @@ csc *SCS(cs_spalloc)(scs_int m, scs_int n, scs_int nzmax, scs_int values,
|
|
12
12
|
A->n = n;
|
13
13
|
A->nzmax = nzmax = MAX(nzmax, 1);
|
14
14
|
A->nz = triplet ? 0 : -1; /* allocate triplet or comp.col */
|
15
|
-
A->p = (scs_int *)
|
16
|
-
A->i = (scs_int *)
|
17
|
-
A->x = values ? (scs_float *)
|
15
|
+
A->p = (scs_int *)scs_calloc((triplet ? nzmax : n + 1), sizeof(scs_int));
|
16
|
+
A->i = (scs_int *)scs_calloc(nzmax, sizeof(scs_int));
|
17
|
+
A->x = values ? (scs_float *)scs_calloc(nzmax, sizeof(scs_float)) : SCS_NULL;
|
18
18
|
return (!A->p || !A->i || (values && !A->x)) ? SCS(cs_spfree)(A) : A;
|
19
19
|
}
|
20
20
|
|
@@ -18,7 +18,6 @@
|
|
18
18
|
|
19
19
|
#ifdef MATLAB_MEX_FILE
|
20
20
|
#include "mex.h"
|
21
|
-
#include "scs_matrix.h"
|
22
21
|
#endif
|
23
22
|
|
24
23
|
#ifndef NULL
|
@@ -51,7 +50,7 @@
|
|
51
50
|
|
52
51
|
struct SuiteSparse_config_struct SuiteSparse_config =
|
53
52
|
{
|
54
|
-
|
53
|
+
scs_malloc, scs_calloc, scs_realloc, scs_free, _scs_printf,
|
55
54
|
SuiteSparse_hypot,
|
56
55
|
SuiteSparse_divcomplex
|
57
56
|
|
@@ -73,12 +72,13 @@ struct SuiteSparse_config_struct SuiteSparse_config =
|
|
73
72
|
SuiteSparse_start be called prior to calling any SuiteSparse function.
|
74
73
|
*/
|
75
74
|
|
75
|
+
|
76
76
|
void SuiteSparse_start ( void )
|
77
77
|
{
|
78
|
-
SuiteSparse_config.malloc_func =
|
79
|
-
SuiteSparse_config.calloc_func =
|
80
|
-
SuiteSparse_config.realloc_func =
|
81
|
-
SuiteSparse_config.free_func =
|
78
|
+
SuiteSparse_config.malloc_func = scs_malloc ;
|
79
|
+
SuiteSparse_config.calloc_func = scs_calloc ;
|
80
|
+
SuiteSparse_config.realloc_func = scs_realloc ;
|
81
|
+
SuiteSparse_config.free_func = scs_free ;
|
82
82
|
SuiteSparse_config.printf_func = _scs_printf ;
|
83
83
|
/* math functions */
|
84
84
|
SuiteSparse_config.hypot_func = SuiteSparse_hypot ;
|
@@ -44,7 +44,7 @@ extern "C" {
|
|
44
44
|
|
45
45
|
#include <limits.h>
|
46
46
|
#include <stdlib.h>
|
47
|
-
#include "
|
47
|
+
#include "glbopts.h"
|
48
48
|
#include "ctrlc.h"
|
49
49
|
|
50
50
|
/* ========================================================================== */
|
@@ -71,6 +71,11 @@ extern "C" {
|
|
71
71
|
#define SuiteSparse_long_id "%" SuiteSparse_long_idd
|
72
72
|
#endif
|
73
73
|
|
74
|
+
#ifndef _scs_printf
|
75
|
+
#define _scs_printf scs_printf
|
76
|
+
#endif
|
77
|
+
|
78
|
+
|
74
79
|
/* ========================================================================== */
|
75
80
|
/* === SuiteSparse_config parameters and functions ========================== */
|
76
81
|
/* ========================================================================== */
|
@@ -89,8 +89,8 @@ GLOBAL Int AMD_order
|
|
89
89
|
}
|
90
90
|
|
91
91
|
/* allocate two size-n integer workspaces */
|
92
|
-
Len = SuiteSparse_malloc (n, sizeof (Int)) ;
|
93
|
-
Pinv = SuiteSparse_malloc (n, sizeof (Int)) ;
|
92
|
+
Len = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
|
93
|
+
Pinv = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
|
94
94
|
mem += n ;
|
95
95
|
mem += n ;
|
96
96
|
if (!Len || !Pinv)
|
@@ -106,8 +106,8 @@ GLOBAL Int AMD_order
|
|
106
106
|
{
|
107
107
|
/* sort the input matrix and remove duplicate entries */
|
108
108
|
AMD_DEBUG1 (("Matrix is jumbled\n")) ;
|
109
|
-
Rp = SuiteSparse_malloc (n+1, sizeof (Int)) ;
|
110
|
-
Ri = SuiteSparse_malloc (nz, sizeof (Int)) ;
|
109
|
+
Rp = (Int *)SuiteSparse_malloc (n+1, sizeof (Int)) ;
|
110
|
+
Ri = (Int *)SuiteSparse_malloc (nz, sizeof (Int)) ;
|
111
111
|
mem += (n+1) ;
|
112
112
|
mem += MAX (nz,1) ;
|
113
113
|
if (!Rp || !Ri)
|
@@ -160,7 +160,7 @@ GLOBAL Int AMD_order
|
|
160
160
|
ok = ok && (slen < Int_MAX) ; /* S[i] for Int i must be OK */
|
161
161
|
if (ok)
|
162
162
|
{
|
163
|
-
S = SuiteSparse_malloc (slen, sizeof (Int)) ;
|
163
|
+
S = (Int *)SuiteSparse_malloc (slen, sizeof (Int)) ;
|
164
164
|
}
|
165
165
|
AMD_DEBUG1 (("slen %g\n", (scs_float) slen)) ;
|
166
166
|
if (!S)
|
data/vendor/scs/linsys/gpu/gpu.h
CHANGED
@@ -1,12 +1,17 @@
|
|
1
|
-
#ifndef
|
2
|
-
#define
|
1
|
+
#ifndef SCS_GPU_H_GUARD
|
2
|
+
#define SCS_GPU_H_GUARD
|
3
3
|
|
4
4
|
#ifdef __cplusplus
|
5
5
|
extern "C" {
|
6
6
|
#endif
|
7
7
|
|
8
|
-
|
8
|
+
/* TODO: Do we need this?
|
9
|
+
|
9
10
|
#include <cuda.h>
|
11
|
+
|
12
|
+
*/
|
13
|
+
|
14
|
+
#include <cublas_v2.h>
|
10
15
|
#include <cuda_runtime_api.h>
|
11
16
|
#include <cusparse.h>
|
12
17
|
|
@@ -31,11 +36,9 @@ extern "C" {
|
|
31
36
|
#ifndef SFLOAT
|
32
37
|
#define CUBLAS(x) cublasD##x
|
33
38
|
#define CUBLASI(x) cublasId##x
|
34
|
-
#define CUSPARSE(x) cusparseD##x
|
35
39
|
#else
|
36
40
|
#define CUBLAS(x) cublasS##x
|
37
41
|
#define CUBLASI(x) cublasIs##x
|
38
|
-
#define CUSPARSE(x) cusparseS##x
|
39
42
|
#endif
|
40
43
|
#define CUSPARSE_GEN(x) cusparse##x
|
41
44
|
#else
|
@@ -46,9 +49,6 @@ extern "C" {
|
|
46
49
|
#define CUBLASI(x) \
|
47
50
|
CUDA_CHECK_ERR; \
|
48
51
|
cublasId##x
|
49
|
-
#define CUSPARSE(x) \
|
50
|
-
CUDA_CHECK_ERR; \
|
51
|
-
cusparseD##x
|
52
52
|
#else
|
53
53
|
#define CUBLAS(x) \
|
54
54
|
CUDA_CHECK_ERR; \
|
@@ -56,9 +56,6 @@ extern "C" {
|
|
56
56
|
#define CUBLASI(x) \
|
57
57
|
CUDA_CHECK_ERR; \
|
58
58
|
cublasIs##x
|
59
|
-
#define CUSPARSE(x) \
|
60
|
-
CUDA_CHECK_ERR; \
|
61
|
-
cusparseS##x
|
62
59
|
#endif
|
63
60
|
#define CUSPARSE_GEN(x) \
|
64
61
|
CUDA_CHECK_ERR; \
|
@@ -35,63 +35,77 @@ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
|
|
35
35
|
}
|
36
36
|
*/
|
37
37
|
|
38
|
-
/*
|
39
|
-
|
38
|
+
/* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
|
39
|
+
/* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
|
40
|
+
static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
|
40
41
|
scs_int i, k;
|
41
42
|
const ScsMatrix *A = p->A;
|
42
43
|
const ScsMatrix *P = p->P;
|
43
|
-
scs_float *M =
|
44
|
+
scs_float *M = p->M;
|
44
45
|
|
45
46
|
#if VERBOSITY > 0
|
46
47
|
scs_printf("getting pre-conditioner\n");
|
47
48
|
#endif
|
48
49
|
|
50
|
+
/* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
|
49
51
|
for (i = 0; i < A->n; ++i) { /* cols */
|
50
|
-
|
51
|
-
|
52
|
+
/* M_ii = (R_x)_i */
|
53
|
+
M[i] = diag_r[i];
|
54
|
+
/* M_ii += a_i' (R_y)^-1 a_i */
|
52
55
|
for (k = A->p[i]; k < A->p[i + 1]; ++k) {
|
53
56
|
/* A->i[k] is row of entry k with value A->x[k] */
|
54
|
-
M[i] += A->x[k] * A->x[k] /
|
57
|
+
M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
|
55
58
|
}
|
56
59
|
if (P) {
|
57
60
|
for (k = P->p[i]; k < P->p[i + 1]; k++) {
|
58
61
|
/* diagonal element only */
|
59
62
|
if (P->i[k] == i) { /* row == col */
|
63
|
+
/* M_ii += P_ii */
|
60
64
|
M[i] += P->x[k];
|
61
65
|
break;
|
62
66
|
}
|
63
67
|
}
|
64
68
|
}
|
69
|
+
/* finally invert for pre-conditioner */
|
65
70
|
M[i] = 1. / M[i];
|
66
71
|
}
|
67
|
-
cudaMemcpy(p->
|
68
|
-
scs_free(M);
|
72
|
+
cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
69
73
|
#if VERBOSITY > 0
|
70
74
|
scs_printf("finished getting pre-conditioner\n");
|
71
75
|
#endif
|
72
76
|
}
|
73
77
|
|
74
78
|
/* diag_r changed: copy R_x and 1/R_y to the gpu, then recompute M there */
|
75
|
-
void SCS(
|
79
|
+
void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
|
76
80
|
scs_int i;
|
81
|
+
|
82
|
+
/* R_x to gpu */
|
83
|
+
cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
|
84
|
+
cudaMemcpyHostToDevice);
|
85
|
+
|
86
|
+
/* 1/R_y to gpu */
|
77
87
|
for (i = 0; i < p->m; ++i)
|
78
|
-
p->
|
79
|
-
cudaMemcpy(p->
|
88
|
+
p->inv_r_y[i] = 1. / diag_r[p->n + i];
|
89
|
+
cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
|
80
90
|
cudaMemcpyHostToDevice);
|
81
|
-
|
91
|
+
|
92
|
+
/* set preconditioner M on gpu */
|
93
|
+
set_preconditioner(p, diag_r);
|
82
94
|
}
|
83
95
|
|
84
96
|
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
85
97
|
if (p) {
|
86
|
-
scs_free(p->
|
98
|
+
scs_free(p->M);
|
99
|
+
scs_free(p->inv_r_y);
|
87
100
|
cudaFree(p->p);
|
88
101
|
cudaFree(p->r);
|
89
102
|
cudaFree(p->Gp);
|
90
103
|
cudaFree(p->bg);
|
91
104
|
cudaFree(p->tmp_m);
|
92
105
|
cudaFree(p->z);
|
93
|
-
cudaFree(p->
|
94
|
-
cudaFree(p->
|
106
|
+
cudaFree(p->M_gpu);
|
107
|
+
cudaFree(p->r_x_gpu);
|
108
|
+
cudaFree(p->inv_r_y_gpu);
|
95
109
|
if (p->Pg) {
|
96
110
|
SCS(free_gpu_matrix)(p->Pg);
|
97
111
|
scs_free(p->Pg);
|
@@ -126,22 +140,23 @@ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
|
|
126
140
|
0, M, 1, z, 1);
|
127
141
|
}
|
128
142
|
|
129
|
-
/* y = (
|
143
|
+
/* y = (R_x + P + A' R_y^{-1} A) x */
|
130
144
|
static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
131
145
|
/* x and y MUST already be loaded to GPU */
|
132
146
|
scs_float *z = p->tmp_m; /* temp memory */
|
133
|
-
cudaMemset(y, 0, p->n * sizeof(scs_float));
|
134
147
|
cudaMemset(z, 0, p->m * sizeof(scs_float));
|
135
148
|
|
136
149
|
cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
|
137
150
|
cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
|
138
151
|
cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
|
139
152
|
|
140
|
-
/* y =
|
141
|
-
|
153
|
+
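/* y = x; NOTE(review): this function requires x and y to already be on the GPU, yet the copy kind below is HostToDevice -- confirm it should not be DeviceToDevice */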
|
154
|
+
cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
155
|
+
/* y = R_x * x */
|
156
|
+
scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
|
142
157
|
|
143
158
|
if (p->Pg) {
|
144
|
-
/* y =
|
159
|
+
/* y = R_x * x + P x */
|
145
160
|
SCS(accum_by_p_gpu)
|
146
161
|
(p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
147
162
|
&p->buffer);
|
@@ -158,9 +173,9 @@ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
|
158
173
|
&p->buffer);
|
159
174
|
#endif
|
160
175
|
/* z = R_y^{-1} A x */
|
161
|
-
scale_by_diag(p->cublas_handle, p->
|
176
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);
|
162
177
|
|
163
|
-
/* y += A'z => y =
|
178
|
+
/* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
|
164
179
|
SCS(accum_by_atrans_gpu)
|
165
180
|
(p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
166
181
|
&p->buffer);
|
@@ -201,19 +216,35 @@ static csc *fill_p_matrix(const ScsMatrix *P) {
|
|
201
216
|
}
|
202
217
|
|
203
218
|
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
204
|
-
scs_float *
|
219
|
+
const scs_float *diag_r) {
|
205
220
|
cudaError_t err;
|
206
|
-
scs_int i;
|
207
221
|
csc *P_full;
|
208
|
-
ScsLinSysWork *p =
|
209
|
-
ScsGpuMatrix *Ag =
|
222
|
+
ScsLinSysWork *p = SCS_NULL;
|
223
|
+
ScsGpuMatrix *Ag = SCS_NULL;
|
210
224
|
ScsGpuMatrix *Pg = SCS_NULL;
|
225
|
+
int device_count;
|
226
|
+
|
227
|
+
err = cudaGetDeviceCount(&device_count);
|
228
|
+
if (err > 0) {
|
229
|
+
scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
|
230
|
+
return SCS_NULL;
|
231
|
+
}
|
232
|
+
|
233
|
+
p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
234
|
+
Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
235
|
+
|
236
|
+
p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
|
237
|
+
p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
|
238
|
+
|
239
|
+
p->A = A;
|
240
|
+
p->P = P;
|
241
|
+
p->m = A->m;
|
242
|
+
p->n = A->n;
|
211
243
|
|
212
244
|
#if GPU_TRANSPOSE_MAT > 0
|
213
245
|
size_t new_buffer_size = 0;
|
214
246
|
#endif
|
215
247
|
|
216
|
-
p->rho_x = rho_x;
|
217
248
|
p->cublas_handle = 0;
|
218
249
|
p->cusparse_handle = 0;
|
219
250
|
|
@@ -242,8 +273,9 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
242
273
|
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
|
243
274
|
cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
|
244
275
|
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
|
245
|
-
cudaMalloc((void **)&p->
|
246
|
-
cudaMalloc((void **)&p->
|
276
|
+
cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
|
277
|
+
cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
|
278
|
+
cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));
|
247
279
|
|
248
280
|
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
|
249
281
|
cudaMemcpyHostToDevice);
|
@@ -251,12 +283,6 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
251
283
|
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
|
252
284
|
cudaMemcpyHostToDevice);
|
253
285
|
|
254
|
-
p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
|
255
|
-
for (i = 0; i < A->m; ++i)
|
256
|
-
p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
|
257
|
-
cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
|
258
|
-
cudaMemcpyHostToDevice);
|
259
|
-
|
260
286
|
cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
|
261
287
|
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
262
288
|
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
@@ -297,7 +323,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
297
323
|
cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
298
324
|
cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
|
299
325
|
|
300
|
-
|
326
|
+
/* Form preconditioner and copy R_x, 1/R_y to gpu */
|
327
|
+
SCS(update_lin_sys_diag_r)(p, diag_r);
|
301
328
|
|
302
329
|
#if GPU_TRANSPOSE_MAT > 0
|
303
330
|
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
@@ -346,9 +373,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
346
373
|
return p;
|
347
374
|
}
|
348
375
|
|
349
|
-
/* solves (
|
350
|
-
* b */
|
351
|
-
/* on GPU */
|
376
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
|
377
|
+
* b, on GPU */
|
352
378
|
static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
353
379
|
scs_int max_its, scs_float tol) {
|
354
380
|
scs_int i, n = pr->n;
|
@@ -386,7 +412,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
386
412
|
|
387
413
|
/* z = M r */
|
388
414
|
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
389
|
-
scale_by_diag(cublas_handle, pr->
|
415
|
+
scale_by_diag(cublas_handle, pr->M_gpu, z, n);
|
390
416
|
/* ztr = z'r */
|
391
417
|
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
392
418
|
/* p = z */
|
@@ -415,7 +441,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
415
441
|
}
|
416
442
|
/* z = M r */
|
417
443
|
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
418
|
-
scale_by_diag(cublas_handle, pr->
|
444
|
+
scale_by_diag(cublas_handle, pr->M_gpu, z, n);
|
419
445
|
ztr_prev = ztr;
|
420
446
|
/* ztr = z'r */
|
421
447
|
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
@@ -431,14 +457,12 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
431
457
|
/* solves Mx = b, for x but stores result in b */
|
432
458
|
/* s contains warm-start (if available) */
|
433
459
|
/*
|
434
|
-
* [x] = [
|
460
|
+
* [x] = [R_x + P A' ]^{-1} [rx]
|
435
461
|
* [y] [ A -R_y ] [ry]
|
436
462
|
*
|
437
|
-
* R_y = diag(rho_y_vec)
|
438
|
-
*
|
439
463
|
* becomes:
|
440
464
|
*
|
441
|
-
* x = (
|
465
|
+
* x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
|
442
466
|
* y = R_y^{-1} (Ax - ry)
|
443
467
|
*
|
444
468
|
*/
|
@@ -451,7 +475,6 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
451
475
|
scs_float *bg = p->bg;
|
452
476
|
scs_float *tmp_m = p->tmp_m;
|
453
477
|
ScsGpuMatrix *Ag = p->Ag;
|
454
|
-
ScsGpuMatrix *Pg = p->Pg;
|
455
478
|
|
456
479
|
if (CG_NORM(b, p->n + p->m) <= 1e-12) {
|
457
480
|
memset(b, 0, (p->n + p->m) * sizeof(scs_float));
|
@@ -471,7 +494,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
471
494
|
cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
|
472
495
|
cudaMemcpyDeviceToDevice);
|
473
496
|
/* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
|
474
|
-
scale_by_diag(p->cublas_handle, p->
|
497
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
|
475
498
|
|
476
499
|
cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
|
477
500
|
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
|
@@ -483,7 +506,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
483
506
|
/* set max_iters to 10 * n (though in theory n is enough for any tol) */
|
484
507
|
max_iters = 10 * Ag->n;
|
485
508
|
|
486
|
-
/* solves (
|
509
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
|
487
510
|
* in bg */
|
488
511
|
cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
|
489
512
|
|
@@ -504,7 +527,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
504
527
|
#endif
|
505
528
|
|
506
529
|
/* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
|
507
|
-
scale_by_diag(p->cublas_handle, p->
|
530
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
|
508
531
|
|
509
532
|
/* copy bg = [x; y] back to b */
|
510
533
|
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
|