scs 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/README.md +42 -13
- data/lib/scs/ffi.rb +1 -7
- data/lib/scs/matrix.rb +72 -0
- data/lib/scs/solver.rb +19 -26
- data/lib/scs/version.rb +1 -1
- data/lib/scs.rb +1 -0
- data/vendor/scs/CITATION.cff +1 -1
- data/vendor/scs/CMakeLists.txt +55 -7
- data/vendor/scs/Makefile +9 -9
- data/vendor/scs/README.md +4 -1
- data/vendor/scs/include/aa.h +1 -1
- data/vendor/scs/include/cones.h +17 -12
- data/vendor/scs/include/glbopts.h +27 -66
- data/vendor/scs/include/linalg.h +2 -1
- data/vendor/scs/include/linsys.h +13 -13
- data/vendor/scs/include/normalize.h +7 -5
- data/vendor/scs/include/rw.h +3 -3
- data/vendor/scs/include/scs.h +85 -106
- data/vendor/scs/include/scs_types.h +34 -0
- data/vendor/scs/include/scs_work.h +80 -0
- data/vendor/scs/include/util.h +3 -1
- data/vendor/scs/linsys/cpu/direct/private.c +86 -73
- data/vendor/scs/linsys/cpu/direct/private.h +2 -2
- data/vendor/scs/linsys/cpu/indirect/private.c +42 -33
- data/vendor/scs/linsys/cpu/indirect/private.h +1 -2
- data/vendor/scs/linsys/csparse.c +3 -3
- data/vendor/scs/linsys/external/amd/LICENSE.txt +0 -897
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +9 -7
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +1 -1
- data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
- data/vendor/scs/linsys/gpu/gpu.h +8 -11
- data/vendor/scs/linsys/gpu/indirect/private.c +72 -49
- data/vendor/scs/linsys/gpu/indirect/private.h +14 -13
- data/vendor/scs/linsys/scs_matrix.c +55 -104
- data/vendor/scs/linsys/scs_matrix.h +5 -4
- data/vendor/scs/scs.mk +1 -5
- data/vendor/scs/src/aa.c +13 -8
- data/vendor/scs/src/cones.c +197 -108
- data/vendor/scs/src/linalg.c +25 -0
- data/vendor/scs/src/normalize.c +75 -26
- data/vendor/scs/src/rw.c +74 -30
- data/vendor/scs/src/scs.c +300 -264
- data/vendor/scs/src/scs_version.c +8 -6
- data/vendor/scs/src/util.c +27 -13
- data/vendor/scs/test/minunit.h +6 -1
- data/vendor/scs/test/problem_utils.h +28 -35
- data/vendor/scs/test/problems/degenerate.h +2 -1
- data/vendor/scs/test/problems/hs21_tiny_qp.h +2 -1
- data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +6 -2
- data/vendor/scs/test/problems/infeasible_tiny_qp.h +2 -1
- data/vendor/scs/test/problems/qafiro_tiny_qp.h +5 -4
- data/vendor/scs/test/problems/random_prob.h +6 -2
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +9 -2
- data/vendor/scs/test/problems/small_lp.h +7 -2
- data/vendor/scs/test/problems/small_qp.h +387 -0
- data/vendor/scs/test/problems/{test_fails.h → test_validation.h} +7 -4
- data/vendor/scs/test/problems/unbounded_tiny_qp.h +4 -4
- data/vendor/scs/test/random_socp_prob.c +4 -2
- data/vendor/scs/test/run_from_file.c +16 -4
- data/vendor/scs/test/run_tests.c +23 -14
- metadata +10 -35
- data/vendor/scs/linsys/cpu/direct/private.o +0 -0
- data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
- data/vendor/scs/linsys/csparse.o +0 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
- data/vendor/scs/linsys/scs_matrix.o +0 -0
- data/vendor/scs/src/aa.o +0 -0
- data/vendor/scs/src/cones.o +0 -0
- data/vendor/scs/src/ctrlc.o +0 -0
- data/vendor/scs/src/linalg.o +0 -0
- data/vendor/scs/src/normalize.o +0 -0
- data/vendor/scs/src/rw.o +0 -0
- data/vendor/scs/src/scs.o +0 -0
- data/vendor/scs/src/scs_indir.o +0 -0
- data/vendor/scs/src/scs_version.o +0 -0
- data/vendor/scs/src/util.o +0 -0
@@ -18,7 +18,6 @@
|
|
18
18
|
|
19
19
|
#ifdef MATLAB_MEX_FILE
|
20
20
|
#include "mex.h"
|
21
|
-
#include "scs_matrix.h"
|
22
21
|
#endif
|
23
22
|
|
24
23
|
#ifndef NULL
|
@@ -51,7 +50,9 @@
|
|
51
50
|
|
52
51
|
struct SuiteSparse_config_struct SuiteSparse_config =
|
53
52
|
{
|
54
|
-
|
53
|
+
scs_malloc, scs_calloc, scs_realloc, scs_free,
|
54
|
+
/* Disable printing */
|
55
|
+
SCS_NULL,
|
55
56
|
SuiteSparse_hypot,
|
56
57
|
SuiteSparse_divcomplex
|
57
58
|
|
@@ -73,13 +74,14 @@ struct SuiteSparse_config_struct SuiteSparse_config =
|
|
73
74
|
SuiteSparse_start be called prior to calling any SuiteSparse function.
|
74
75
|
*/
|
75
76
|
|
77
|
+
|
76
78
|
void SuiteSparse_start ( void )
|
77
79
|
{
|
78
|
-
SuiteSparse_config.malloc_func =
|
79
|
-
SuiteSparse_config.calloc_func =
|
80
|
-
SuiteSparse_config.realloc_func =
|
81
|
-
SuiteSparse_config.free_func =
|
82
|
-
SuiteSparse_config.printf_func =
|
80
|
+
SuiteSparse_config.malloc_func = scs_malloc ;
|
81
|
+
SuiteSparse_config.calloc_func = scs_calloc ;
|
82
|
+
SuiteSparse_config.realloc_func = scs_realloc ;
|
83
|
+
SuiteSparse_config.free_func = scs_free ;
|
84
|
+
SuiteSparse_config.printf_func = SCS_NULL;
|
83
85
|
/* math functions */
|
84
86
|
SuiteSparse_config.hypot_func = SuiteSparse_hypot ;
|
85
87
|
SuiteSparse_config.divcomplex_func = SuiteSparse_divcomplex ;
|
@@ -89,8 +89,8 @@ GLOBAL Int AMD_order
|
|
89
89
|
}
|
90
90
|
|
91
91
|
/* allocate two size-n integer workspaces */
|
92
|
-
Len = SuiteSparse_malloc (n, sizeof (Int)) ;
|
93
|
-
Pinv = SuiteSparse_malloc (n, sizeof (Int)) ;
|
92
|
+
Len = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
|
93
|
+
Pinv = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
|
94
94
|
mem += n ;
|
95
95
|
mem += n ;
|
96
96
|
if (!Len || !Pinv)
|
@@ -106,8 +106,8 @@ GLOBAL Int AMD_order
|
|
106
106
|
{
|
107
107
|
/* sort the input matrix and remove duplicate entries */
|
108
108
|
AMD_DEBUG1 (("Matrix is jumbled\n")) ;
|
109
|
-
Rp = SuiteSparse_malloc (n+1, sizeof (Int)) ;
|
110
|
-
Ri = SuiteSparse_malloc (nz, sizeof (Int)) ;
|
109
|
+
Rp = (Int *)SuiteSparse_malloc (n+1, sizeof (Int)) ;
|
110
|
+
Ri = (Int *)SuiteSparse_malloc (nz, sizeof (Int)) ;
|
111
111
|
mem += (n+1) ;
|
112
112
|
mem += MAX (nz,1) ;
|
113
113
|
if (!Rp || !Ri)
|
@@ -160,7 +160,7 @@ GLOBAL Int AMD_order
|
|
160
160
|
ok = ok && (slen < Int_MAX) ; /* S[i] for Int i must be OK */
|
161
161
|
if (ok)
|
162
162
|
{
|
163
|
-
S = SuiteSparse_malloc (slen, sizeof (Int)) ;
|
163
|
+
S = (Int *)SuiteSparse_malloc (slen, sizeof (Int)) ;
|
164
164
|
}
|
165
165
|
AMD_DEBUG1 (("slen %g\n", (scs_float) slen)) ;
|
166
166
|
if (!S)
|
data/vendor/scs/linsys/gpu/gpu.h
CHANGED
@@ -1,12 +1,17 @@
|
|
1
|
-
#ifndef
|
2
|
-
#define
|
1
|
+
#ifndef SCS_GPU_H_GUARD
|
2
|
+
#define SCS_GPU_H_GUARD
|
3
3
|
|
4
4
|
#ifdef __cplusplus
|
5
5
|
extern "C" {
|
6
6
|
#endif
|
7
7
|
|
8
|
-
|
8
|
+
/* TODO: Do we need this?
|
9
|
+
|
9
10
|
#include <cuda.h>
|
11
|
+
|
12
|
+
*/
|
13
|
+
|
14
|
+
#include <cublas_v2.h>
|
10
15
|
#include <cuda_runtime_api.h>
|
11
16
|
#include <cusparse.h>
|
12
17
|
|
@@ -31,11 +36,9 @@ extern "C" {
|
|
31
36
|
#ifndef SFLOAT
|
32
37
|
#define CUBLAS(x) cublasD##x
|
33
38
|
#define CUBLASI(x) cublasId##x
|
34
|
-
#define CUSPARSE(x) cusparseD##x
|
35
39
|
#else
|
36
40
|
#define CUBLAS(x) cublasS##x
|
37
41
|
#define CUBLASI(x) cublasIs##x
|
38
|
-
#define CUSPARSE(x) cusparseS##x
|
39
42
|
#endif
|
40
43
|
#define CUSPARSE_GEN(x) cusparse##x
|
41
44
|
#else
|
@@ -46,9 +49,6 @@ extern "C" {
|
|
46
49
|
#define CUBLASI(x) \
|
47
50
|
CUDA_CHECK_ERR; \
|
48
51
|
cublasId##x
|
49
|
-
#define CUSPARSE(x) \
|
50
|
-
CUDA_CHECK_ERR; \
|
51
|
-
cusparseD##x
|
52
52
|
#else
|
53
53
|
#define CUBLAS(x) \
|
54
54
|
CUDA_CHECK_ERR; \
|
@@ -56,9 +56,6 @@ extern "C" {
|
|
56
56
|
#define CUBLASI(x) \
|
57
57
|
CUDA_CHECK_ERR; \
|
58
58
|
cublasIs##x
|
59
|
-
#define CUSPARSE(x) \
|
60
|
-
CUDA_CHECK_ERR; \
|
61
|
-
cusparseS##x
|
62
59
|
#endif
|
63
60
|
#define CUSPARSE_GEN(x) \
|
64
61
|
CUDA_CHECK_ERR; \
|
@@ -35,63 +35,77 @@ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
|
|
35
35
|
}
|
36
36
|
*/
|
37
37
|
|
38
|
-
/*
|
39
|
-
|
38
|
+
/* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
|
39
|
+
/* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
|
40
|
+
static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
|
40
41
|
scs_int i, k;
|
41
42
|
const ScsMatrix *A = p->A;
|
42
43
|
const ScsMatrix *P = p->P;
|
43
|
-
scs_float *M =
|
44
|
+
scs_float *M = p->M;
|
44
45
|
|
45
46
|
#if VERBOSITY > 0
|
46
47
|
scs_printf("getting pre-conditioner\n");
|
47
48
|
#endif
|
48
49
|
|
50
|
+
/* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
|
49
51
|
for (i = 0; i < A->n; ++i) { /* cols */
|
50
|
-
|
51
|
-
|
52
|
+
/* M_ii = (R_x)_i */
|
53
|
+
M[i] = diag_r[i];
|
54
|
+
/* M_ii += a_i' (R_y)^-1 a_i */
|
52
55
|
for (k = A->p[i]; k < A->p[i + 1]; ++k) {
|
53
56
|
/* A->i[k] is row of entry k with value A->x[k] */
|
54
|
-
M[i] += A->x[k] * A->x[k] /
|
57
|
+
M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
|
55
58
|
}
|
56
59
|
if (P) {
|
57
60
|
for (k = P->p[i]; k < P->p[i + 1]; k++) {
|
58
61
|
/* diagonal element only */
|
59
62
|
if (P->i[k] == i) { /* row == col */
|
63
|
+
/* M_ii += P_ii */
|
60
64
|
M[i] += P->x[k];
|
61
65
|
break;
|
62
66
|
}
|
63
67
|
}
|
64
68
|
}
|
69
|
+
/* finally invert for pre-conditioner */
|
65
70
|
M[i] = 1. / M[i];
|
66
71
|
}
|
67
|
-
cudaMemcpy(p->
|
68
|
-
scs_free(M);
|
72
|
+
cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
69
73
|
#if VERBOSITY > 0
|
70
74
|
scs_printf("finished getting pre-conditioner\n");
|
71
75
|
#endif
|
72
76
|
}
|
73
77
|
|
74
78
|
/* no need to update anything in this case */
|
75
|
-
void SCS(
|
79
|
+
void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
|
76
80
|
scs_int i;
|
81
|
+
|
82
|
+
/* R_x to gpu */
|
83
|
+
cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
|
84
|
+
cudaMemcpyHostToDevice);
|
85
|
+
|
86
|
+
/* 1/R_y to gpu */
|
77
87
|
for (i = 0; i < p->m; ++i)
|
78
|
-
p->
|
79
|
-
cudaMemcpy(p->
|
88
|
+
p->inv_r_y[i] = 1. / diag_r[p->n + i];
|
89
|
+
cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
|
80
90
|
cudaMemcpyHostToDevice);
|
81
|
-
|
91
|
+
|
92
|
+
/* set preconditioner M on gpu */
|
93
|
+
set_preconditioner(p, diag_r);
|
82
94
|
}
|
83
95
|
|
84
96
|
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
85
97
|
if (p) {
|
86
|
-
scs_free(p->
|
98
|
+
scs_free(p->M);
|
99
|
+
scs_free(p->inv_r_y);
|
87
100
|
cudaFree(p->p);
|
88
101
|
cudaFree(p->r);
|
89
102
|
cudaFree(p->Gp);
|
90
103
|
cudaFree(p->bg);
|
91
104
|
cudaFree(p->tmp_m);
|
92
105
|
cudaFree(p->z);
|
93
|
-
cudaFree(p->
|
94
|
-
cudaFree(p->
|
106
|
+
cudaFree(p->M_gpu);
|
107
|
+
cudaFree(p->r_x_gpu);
|
108
|
+
cudaFree(p->inv_r_y_gpu);
|
95
109
|
if (p->Pg) {
|
96
110
|
SCS(free_gpu_matrix)(p->Pg);
|
97
111
|
scs_free(p->Pg);
|
@@ -126,22 +140,23 @@ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
|
|
126
140
|
0, M, 1, z, 1);
|
127
141
|
}
|
128
142
|
|
129
|
-
/* y = (
|
143
|
+
/* y = (R_x + P + A' R_y^{-1} A) x */
|
130
144
|
static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
131
145
|
/* x and y MUST already be loaded to GPU */
|
132
146
|
scs_float *z = p->tmp_m; /* temp memory */
|
133
|
-
cudaMemset(y, 0, p->n * sizeof(scs_float));
|
134
147
|
cudaMemset(z, 0, p->m * sizeof(scs_float));
|
135
148
|
|
136
149
|
cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
|
137
150
|
cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
|
138
151
|
cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
|
139
152
|
|
140
|
-
/* y =
|
141
|
-
|
153
|
+
/* y = x */
|
154
|
+
cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
155
|
+
/* y = R_x * x */
|
156
|
+
scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
|
142
157
|
|
143
158
|
if (p->Pg) {
|
144
|
-
/* y =
|
159
|
+
/* y = R_x * x + P x */
|
145
160
|
SCS(accum_by_p_gpu)
|
146
161
|
(p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
147
162
|
&p->buffer);
|
@@ -158,9 +173,9 @@ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
|
158
173
|
&p->buffer);
|
159
174
|
#endif
|
160
175
|
/* z = R_y^{-1} A x */
|
161
|
-
scale_by_diag(p->cublas_handle, p->
|
176
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);
|
162
177
|
|
163
|
-
/* y += A'z => y =
|
178
|
+
/* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
|
164
179
|
SCS(accum_by_atrans_gpu)
|
165
180
|
(p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
166
181
|
&p->buffer);
|
@@ -201,19 +216,35 @@ static csc *fill_p_matrix(const ScsMatrix *P) {
|
|
201
216
|
}
|
202
217
|
|
203
218
|
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
204
|
-
scs_float *
|
219
|
+
const scs_float *diag_r) {
|
205
220
|
cudaError_t err;
|
206
|
-
scs_int i;
|
207
221
|
csc *P_full;
|
208
|
-
ScsLinSysWork *p =
|
209
|
-
ScsGpuMatrix *Ag =
|
222
|
+
ScsLinSysWork *p = SCS_NULL;
|
223
|
+
ScsGpuMatrix *Ag = SCS_NULL;
|
210
224
|
ScsGpuMatrix *Pg = SCS_NULL;
|
225
|
+
int device_count;
|
226
|
+
|
227
|
+
err = cudaGetDeviceCount(&device_count);
|
228
|
+
if (err > 0) {
|
229
|
+
scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
|
230
|
+
return SCS_NULL;
|
231
|
+
}
|
232
|
+
|
233
|
+
p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
234
|
+
Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
235
|
+
|
236
|
+
p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
|
237
|
+
p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
|
238
|
+
|
239
|
+
p->A = A;
|
240
|
+
p->P = P;
|
241
|
+
p->m = A->m;
|
242
|
+
p->n = A->n;
|
211
243
|
|
212
244
|
#if GPU_TRANSPOSE_MAT > 0
|
213
245
|
size_t new_buffer_size = 0;
|
214
246
|
#endif
|
215
247
|
|
216
|
-
p->rho_x = rho_x;
|
217
248
|
p->cublas_handle = 0;
|
218
249
|
p->cusparse_handle = 0;
|
219
250
|
|
@@ -242,8 +273,9 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
242
273
|
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
|
243
274
|
cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
|
244
275
|
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
|
245
|
-
cudaMalloc((void **)&p->
|
246
|
-
cudaMalloc((void **)&p->
|
276
|
+
cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
|
277
|
+
cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
|
278
|
+
cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));
|
247
279
|
|
248
280
|
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
|
249
281
|
cudaMemcpyHostToDevice);
|
@@ -251,12 +283,6 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
251
283
|
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
|
252
284
|
cudaMemcpyHostToDevice);
|
253
285
|
|
254
|
-
p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
|
255
|
-
for (i = 0; i < A->m; ++i)
|
256
|
-
p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
|
257
|
-
cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
|
258
|
-
cudaMemcpyHostToDevice);
|
259
|
-
|
260
286
|
cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
|
261
287
|
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
262
288
|
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
@@ -297,7 +323,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
297
323
|
cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
298
324
|
cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
|
299
325
|
|
300
|
-
|
326
|
+
/* Form preconditioner and copy R_x, 1/R_y to gpu */
|
327
|
+
SCS(update_lin_sys_diag_r)(p, diag_r);
|
301
328
|
|
302
329
|
#if GPU_TRANSPOSE_MAT > 0
|
303
330
|
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
@@ -346,9 +373,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
346
373
|
return p;
|
347
374
|
}
|
348
375
|
|
349
|
-
/* solves (
|
350
|
-
* b */
|
351
|
-
/* on GPU */
|
376
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
|
377
|
+
* b, on GPU */
|
352
378
|
static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
353
379
|
scs_int max_its, scs_float tol) {
|
354
380
|
scs_int i, n = pr->n;
|
@@ -386,7 +412,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
386
412
|
|
387
413
|
/* z = M r */
|
388
414
|
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
389
|
-
scale_by_diag(cublas_handle, pr->
|
415
|
+
scale_by_diag(cublas_handle, pr->M_gpu, z, n);
|
390
416
|
/* ztr = z'r */
|
391
417
|
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
392
418
|
/* p = z */
|
@@ -415,7 +441,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
415
441
|
}
|
416
442
|
/* z = M r */
|
417
443
|
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
418
|
-
scale_by_diag(cublas_handle, pr->
|
444
|
+
scale_by_diag(cublas_handle, pr->M_gpu, z, n);
|
419
445
|
ztr_prev = ztr;
|
420
446
|
/* ztr = z'r */
|
421
447
|
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
@@ -431,14 +457,12 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
431
457
|
/* solves Mx = b, for x but stores result in b */
|
432
458
|
/* s contains warm-start (if available) */
|
433
459
|
/*
|
434
|
-
* [x] = [
|
460
|
+
* [x] = [R_x + P A' ]^{-1} [rx]
|
435
461
|
* [y] [ A -R_y ] [ry]
|
436
462
|
*
|
437
|
-
* R_y = diag(rho_y_vec)
|
438
|
-
*
|
439
463
|
* becomes:
|
440
464
|
*
|
441
|
-
* x = (
|
465
|
+
* x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
|
442
466
|
* y = R_y^{-1} (Ax - ry)
|
443
467
|
*
|
444
468
|
*/
|
@@ -451,7 +475,6 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
451
475
|
scs_float *bg = p->bg;
|
452
476
|
scs_float *tmp_m = p->tmp_m;
|
453
477
|
ScsGpuMatrix *Ag = p->Ag;
|
454
|
-
ScsGpuMatrix *Pg = p->Pg;
|
455
478
|
|
456
479
|
if (CG_NORM(b, p->n + p->m) <= 1e-12) {
|
457
480
|
memset(b, 0, (p->n + p->m) * sizeof(scs_float));
|
@@ -471,7 +494,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
471
494
|
cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
|
472
495
|
cudaMemcpyDeviceToDevice);
|
473
496
|
/* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
|
474
|
-
scale_by_diag(p->cublas_handle, p->
|
497
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
|
475
498
|
|
476
499
|
cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
|
477
500
|
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
|
@@ -483,7 +506,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
483
506
|
/* set max_iters to 10 * n (though in theory n is enough for any tol) */
|
484
507
|
max_iters = 10 * Ag->n;
|
485
508
|
|
486
|
-
/* solves (
|
509
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
|
487
510
|
* in bg */
|
488
511
|
cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
|
489
512
|
|
@@ -504,7 +527,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
504
527
|
#endif
|
505
528
|
|
506
529
|
/* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
|
507
|
-
scale_by_diag(p->cublas_handle, p->
|
530
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
|
508
531
|
|
509
532
|
/* copy bg = [x; y] back to b */
|
510
533
|
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
|
@@ -15,19 +15,20 @@ struct SCS_LIN_SYS_WORK {
|
|
15
15
|
scs_int n, m; /* linear system dimensions */
|
16
16
|
/* reporting */
|
17
17
|
scs_int tot_cg_its;
|
18
|
+
scs_float *M; /* preconditioner on cpu */
|
18
19
|
/* ALL BELOW HOSTED ON THE GPU */
|
19
|
-
scs_float *p;
|
20
|
-
scs_float *r;
|
21
|
-
scs_float *Gp;
|
22
|
-
scs_float *bg;
|
23
|
-
scs_float *tmp_m;
|
24
|
-
scs_float *z;
|
25
|
-
scs_float *
|
20
|
+
scs_float *p; /* cg iterate, n */
|
21
|
+
scs_float *r; /* cg residual, n */
|
22
|
+
scs_float *Gp; /* G * p, n */
|
23
|
+
scs_float *bg; /* b, n */
|
24
|
+
scs_float *tmp_m; /* m, used in mat_vec */
|
25
|
+
scs_float *z; /* preconditioned */
|
26
|
+
scs_float *M_gpu; /* preconditioner */
|
26
27
|
const ScsMatrix *A; /* does *not* own this memory */
|
27
28
|
const ScsMatrix *P; /* does *not* own this memory */
|
28
|
-
ScsGpuMatrix *Ag;
|
29
|
-
ScsGpuMatrix *Agt;
|
30
|
-
ScsGpuMatrix *Pg;
|
29
|
+
ScsGpuMatrix *Ag; /* A matrix on GPU */
|
30
|
+
ScsGpuMatrix *Agt; /* A trans matrix on GPU */
|
31
|
+
ScsGpuMatrix *Pg; /* P matrix on GPU */
|
31
32
|
/* CUDA */
|
32
33
|
cublasHandle_t cublas_handle;
|
33
34
|
cusparseHandle_t cusparse_handle;
|
@@ -39,9 +40,9 @@ struct SCS_LIN_SYS_WORK {
|
|
39
40
|
cusparseDnVecDescr_t dn_vec_n_p; /* Dense vector of length n */
|
40
41
|
|
41
42
|
/* rho terms */
|
42
|
-
scs_float
|
43
|
-
scs_float *
|
44
|
-
scs_float *
|
43
|
+
scs_float *r_x_gpu;
|
44
|
+
scs_float *inv_r_y; /* inverse R_y */
|
45
|
+
scs_float *inv_r_y_gpu; /* inverse R_y on GPU */
|
45
46
|
};
|
46
47
|
|
47
48
|
#ifdef __cplusplus
|