scs 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/README.md +42 -13
- data/lib/scs/ffi.rb +1 -7
- data/lib/scs/matrix.rb +72 -0
- data/lib/scs/solver.rb +19 -26
- data/lib/scs/version.rb +1 -1
- data/lib/scs.rb +1 -0
- data/vendor/scs/CITATION.cff +1 -1
- data/vendor/scs/CMakeLists.txt +55 -7
- data/vendor/scs/Makefile +9 -9
- data/vendor/scs/README.md +4 -1
- data/vendor/scs/include/aa.h +1 -1
- data/vendor/scs/include/cones.h +17 -12
- data/vendor/scs/include/glbopts.h +27 -66
- data/vendor/scs/include/linalg.h +2 -1
- data/vendor/scs/include/linsys.h +13 -13
- data/vendor/scs/include/normalize.h +7 -5
- data/vendor/scs/include/rw.h +3 -3
- data/vendor/scs/include/scs.h +85 -106
- data/vendor/scs/include/scs_types.h +34 -0
- data/vendor/scs/include/scs_work.h +80 -0
- data/vendor/scs/include/util.h +3 -1
- data/vendor/scs/linsys/cpu/direct/private.c +86 -73
- data/vendor/scs/linsys/cpu/direct/private.h +2 -2
- data/vendor/scs/linsys/cpu/indirect/private.c +42 -33
- data/vendor/scs/linsys/cpu/indirect/private.h +1 -2
- data/vendor/scs/linsys/csparse.c +3 -3
- data/vendor/scs/linsys/external/amd/LICENSE.txt +0 -897
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +9 -7
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +1 -1
- data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
- data/vendor/scs/linsys/gpu/gpu.h +8 -11
- data/vendor/scs/linsys/gpu/indirect/private.c +72 -49
- data/vendor/scs/linsys/gpu/indirect/private.h +14 -13
- data/vendor/scs/linsys/scs_matrix.c +55 -104
- data/vendor/scs/linsys/scs_matrix.h +5 -4
- data/vendor/scs/scs.mk +1 -5
- data/vendor/scs/src/aa.c +13 -8
- data/vendor/scs/src/cones.c +197 -108
- data/vendor/scs/src/linalg.c +25 -0
- data/vendor/scs/src/normalize.c +75 -26
- data/vendor/scs/src/rw.c +74 -30
- data/vendor/scs/src/scs.c +300 -264
- data/vendor/scs/src/scs_version.c +8 -6
- data/vendor/scs/src/util.c +27 -13
- data/vendor/scs/test/minunit.h +6 -1
- data/vendor/scs/test/problem_utils.h +28 -35
- data/vendor/scs/test/problems/degenerate.h +2 -1
- data/vendor/scs/test/problems/hs21_tiny_qp.h +2 -1
- data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +6 -2
- data/vendor/scs/test/problems/infeasible_tiny_qp.h +2 -1
- data/vendor/scs/test/problems/qafiro_tiny_qp.h +5 -4
- data/vendor/scs/test/problems/random_prob.h +6 -2
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +9 -2
- data/vendor/scs/test/problems/small_lp.h +7 -2
- data/vendor/scs/test/problems/small_qp.h +387 -0
- data/vendor/scs/test/problems/{test_fails.h → test_validation.h} +7 -4
- data/vendor/scs/test/problems/unbounded_tiny_qp.h +4 -4
- data/vendor/scs/test/random_socp_prob.c +4 -2
- data/vendor/scs/test/run_from_file.c +16 -4
- data/vendor/scs/test/run_tests.c +23 -14
- metadata +10 -35
- data/vendor/scs/linsys/cpu/direct/private.o +0 -0
- data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
- data/vendor/scs/linsys/csparse.o +0 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
- data/vendor/scs/linsys/scs_matrix.o +0 -0
- data/vendor/scs/src/aa.o +0 -0
- data/vendor/scs/src/cones.o +0 -0
- data/vendor/scs/src/ctrlc.o +0 -0
- data/vendor/scs/src/linalg.o +0 -0
- data/vendor/scs/src/normalize.o +0 -0
- data/vendor/scs/src/rw.o +0 -0
- data/vendor/scs/src/scs.o +0 -0
- data/vendor/scs/src/scs_indir.o +0 -0
- data/vendor/scs/src/scs_version.o +0 -0
- data/vendor/scs/src/util.o +0 -0
|
@@ -18,7 +18,6 @@
|
|
|
18
18
|
|
|
19
19
|
#ifdef MATLAB_MEX_FILE
|
|
20
20
|
#include "mex.h"
|
|
21
|
-
#include "scs_matrix.h"
|
|
22
21
|
#endif
|
|
23
22
|
|
|
24
23
|
#ifndef NULL
|
|
@@ -51,7 +50,9 @@
|
|
|
51
50
|
|
|
52
51
|
struct SuiteSparse_config_struct SuiteSparse_config =
|
|
53
52
|
{
|
|
54
|
-
|
|
53
|
+
scs_malloc, scs_calloc, scs_realloc, scs_free,
|
|
54
|
+
/* Disable printing */
|
|
55
|
+
SCS_NULL,
|
|
55
56
|
SuiteSparse_hypot,
|
|
56
57
|
SuiteSparse_divcomplex
|
|
57
58
|
|
|
@@ -73,13 +74,14 @@ struct SuiteSparse_config_struct SuiteSparse_config =
|
|
|
73
74
|
SuiteSparse_start be called prior to calling any SuiteSparse function.
|
|
74
75
|
*/
|
|
75
76
|
|
|
77
|
+
|
|
76
78
|
void SuiteSparse_start ( void )
|
|
77
79
|
{
|
|
78
|
-
SuiteSparse_config.malloc_func =
|
|
79
|
-
SuiteSparse_config.calloc_func =
|
|
80
|
-
SuiteSparse_config.realloc_func =
|
|
81
|
-
SuiteSparse_config.free_func =
|
|
82
|
-
SuiteSparse_config.printf_func =
|
|
80
|
+
SuiteSparse_config.malloc_func = scs_malloc ;
|
|
81
|
+
SuiteSparse_config.calloc_func = scs_calloc ;
|
|
82
|
+
SuiteSparse_config.realloc_func = scs_realloc ;
|
|
83
|
+
SuiteSparse_config.free_func = scs_free ;
|
|
84
|
+
SuiteSparse_config.printf_func = SCS_NULL;
|
|
83
85
|
/* math functions */
|
|
84
86
|
SuiteSparse_config.hypot_func = SuiteSparse_hypot ;
|
|
85
87
|
SuiteSparse_config.divcomplex_func = SuiteSparse_divcomplex ;
|
|
@@ -89,8 +89,8 @@ GLOBAL Int AMD_order
|
|
|
89
89
|
}
|
|
90
90
|
|
|
91
91
|
/* allocate two size-n integer workspaces */
|
|
92
|
-
Len = SuiteSparse_malloc (n, sizeof (Int)) ;
|
|
93
|
-
Pinv = SuiteSparse_malloc (n, sizeof (Int)) ;
|
|
92
|
+
Len = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
|
|
93
|
+
Pinv = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
|
|
94
94
|
mem += n ;
|
|
95
95
|
mem += n ;
|
|
96
96
|
if (!Len || !Pinv)
|
|
@@ -106,8 +106,8 @@ GLOBAL Int AMD_order
|
|
|
106
106
|
{
|
|
107
107
|
/* sort the input matrix and remove duplicate entries */
|
|
108
108
|
AMD_DEBUG1 (("Matrix is jumbled\n")) ;
|
|
109
|
-
Rp = SuiteSparse_malloc (n+1, sizeof (Int)) ;
|
|
110
|
-
Ri = SuiteSparse_malloc (nz, sizeof (Int)) ;
|
|
109
|
+
Rp = (Int *)SuiteSparse_malloc (n+1, sizeof (Int)) ;
|
|
110
|
+
Ri = (Int *)SuiteSparse_malloc (nz, sizeof (Int)) ;
|
|
111
111
|
mem += (n+1) ;
|
|
112
112
|
mem += MAX (nz,1) ;
|
|
113
113
|
if (!Rp || !Ri)
|
|
@@ -160,7 +160,7 @@ GLOBAL Int AMD_order
|
|
|
160
160
|
ok = ok && (slen < Int_MAX) ; /* S[i] for Int i must be OK */
|
|
161
161
|
if (ok)
|
|
162
162
|
{
|
|
163
|
-
S = SuiteSparse_malloc (slen, sizeof (Int)) ;
|
|
163
|
+
S = (Int *)SuiteSparse_malloc (slen, sizeof (Int)) ;
|
|
164
164
|
}
|
|
165
165
|
AMD_DEBUG1 (("slen %g\n", (scs_float) slen)) ;
|
|
166
166
|
if (!S)
|
data/vendor/scs/linsys/gpu/gpu.h
CHANGED
|
@@ -1,12 +1,17 @@
|
|
|
1
|
-
#ifndef
|
|
2
|
-
#define
|
|
1
|
+
#ifndef SCS_GPU_H_GUARD
|
|
2
|
+
#define SCS_GPU_H_GUARD
|
|
3
3
|
|
|
4
4
|
#ifdef __cplusplus
|
|
5
5
|
extern "C" {
|
|
6
6
|
#endif
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
/* TODO: Do we need this?
|
|
9
|
+
|
|
9
10
|
#include <cuda.h>
|
|
11
|
+
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
#include <cublas_v2.h>
|
|
10
15
|
#include <cuda_runtime_api.h>
|
|
11
16
|
#include <cusparse.h>
|
|
12
17
|
|
|
@@ -31,11 +36,9 @@ extern "C" {
|
|
|
31
36
|
#ifndef SFLOAT
|
|
32
37
|
#define CUBLAS(x) cublasD##x
|
|
33
38
|
#define CUBLASI(x) cublasId##x
|
|
34
|
-
#define CUSPARSE(x) cusparseD##x
|
|
35
39
|
#else
|
|
36
40
|
#define CUBLAS(x) cublasS##x
|
|
37
41
|
#define CUBLASI(x) cublasIs##x
|
|
38
|
-
#define CUSPARSE(x) cusparseS##x
|
|
39
42
|
#endif
|
|
40
43
|
#define CUSPARSE_GEN(x) cusparse##x
|
|
41
44
|
#else
|
|
@@ -46,9 +49,6 @@ extern "C" {
|
|
|
46
49
|
#define CUBLASI(x) \
|
|
47
50
|
CUDA_CHECK_ERR; \
|
|
48
51
|
cublasId##x
|
|
49
|
-
#define CUSPARSE(x) \
|
|
50
|
-
CUDA_CHECK_ERR; \
|
|
51
|
-
cusparseD##x
|
|
52
52
|
#else
|
|
53
53
|
#define CUBLAS(x) \
|
|
54
54
|
CUDA_CHECK_ERR; \
|
|
@@ -56,9 +56,6 @@ extern "C" {
|
|
|
56
56
|
#define CUBLASI(x) \
|
|
57
57
|
CUDA_CHECK_ERR; \
|
|
58
58
|
cublasIs##x
|
|
59
|
-
#define CUSPARSE(x) \
|
|
60
|
-
CUDA_CHECK_ERR; \
|
|
61
|
-
cusparseS##x
|
|
62
59
|
#endif
|
|
63
60
|
#define CUSPARSE_GEN(x) \
|
|
64
61
|
CUDA_CHECK_ERR; \
|
|
@@ -35,63 +35,77 @@ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
|
|
|
35
35
|
}
|
|
36
36
|
*/
|
|
37
37
|
|
|
38
|
-
/*
|
|
39
|
-
|
|
38
|
+
/* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
|
|
39
|
+
/* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
|
|
40
|
+
static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
|
|
40
41
|
scs_int i, k;
|
|
41
42
|
const ScsMatrix *A = p->A;
|
|
42
43
|
const ScsMatrix *P = p->P;
|
|
43
|
-
scs_float *M =
|
|
44
|
+
scs_float *M = p->M;
|
|
44
45
|
|
|
45
46
|
#if VERBOSITY > 0
|
|
46
47
|
scs_printf("getting pre-conditioner\n");
|
|
47
48
|
#endif
|
|
48
49
|
|
|
50
|
+
/* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
|
|
49
51
|
for (i = 0; i < A->n; ++i) { /* cols */
|
|
50
|
-
|
|
51
|
-
|
|
52
|
+
/* M_ii = (R_x)_i */
|
|
53
|
+
M[i] = diag_r[i];
|
|
54
|
+
/* M_ii += a_i' (R_y)^-1 a_i */
|
|
52
55
|
for (k = A->p[i]; k < A->p[i + 1]; ++k) {
|
|
53
56
|
/* A->i[k] is row of entry k with value A->x[k] */
|
|
54
|
-
M[i] += A->x[k] * A->x[k] /
|
|
57
|
+
M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
|
|
55
58
|
}
|
|
56
59
|
if (P) {
|
|
57
60
|
for (k = P->p[i]; k < P->p[i + 1]; k++) {
|
|
58
61
|
/* diagonal element only */
|
|
59
62
|
if (P->i[k] == i) { /* row == col */
|
|
63
|
+
/* M_ii += P_ii */
|
|
60
64
|
M[i] += P->x[k];
|
|
61
65
|
break;
|
|
62
66
|
}
|
|
63
67
|
}
|
|
64
68
|
}
|
|
69
|
+
/* finally invert for pre-conditioner */
|
|
65
70
|
M[i] = 1. / M[i];
|
|
66
71
|
}
|
|
67
|
-
cudaMemcpy(p->
|
|
68
|
-
scs_free(M);
|
|
72
|
+
cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
69
73
|
#if VERBOSITY > 0
|
|
70
74
|
scs_printf("finished getting pre-conditioner\n");
|
|
71
75
|
#endif
|
|
72
76
|
}
|
|
73
77
|
|
|
74
78
|
/* no need to update anything in this case */
|
|
75
|
-
void SCS(
|
|
79
|
+
void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
|
|
76
80
|
scs_int i;
|
|
81
|
+
|
|
82
|
+
/* R_x to gpu */
|
|
83
|
+
cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
|
|
84
|
+
cudaMemcpyHostToDevice);
|
|
85
|
+
|
|
86
|
+
/* 1/R_y to gpu */
|
|
77
87
|
for (i = 0; i < p->m; ++i)
|
|
78
|
-
p->
|
|
79
|
-
cudaMemcpy(p->
|
|
88
|
+
p->inv_r_y[i] = 1. / diag_r[p->n + i];
|
|
89
|
+
cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
|
|
80
90
|
cudaMemcpyHostToDevice);
|
|
81
|
-
|
|
91
|
+
|
|
92
|
+
/* set preconditioner M on gpu */
|
|
93
|
+
set_preconditioner(p, diag_r);
|
|
82
94
|
}
|
|
83
95
|
|
|
84
96
|
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
85
97
|
if (p) {
|
|
86
|
-
scs_free(p->
|
|
98
|
+
scs_free(p->M);
|
|
99
|
+
scs_free(p->inv_r_y);
|
|
87
100
|
cudaFree(p->p);
|
|
88
101
|
cudaFree(p->r);
|
|
89
102
|
cudaFree(p->Gp);
|
|
90
103
|
cudaFree(p->bg);
|
|
91
104
|
cudaFree(p->tmp_m);
|
|
92
105
|
cudaFree(p->z);
|
|
93
|
-
cudaFree(p->
|
|
94
|
-
cudaFree(p->
|
|
106
|
+
cudaFree(p->M_gpu);
|
|
107
|
+
cudaFree(p->r_x_gpu);
|
|
108
|
+
cudaFree(p->inv_r_y_gpu);
|
|
95
109
|
if (p->Pg) {
|
|
96
110
|
SCS(free_gpu_matrix)(p->Pg);
|
|
97
111
|
scs_free(p->Pg);
|
|
@@ -126,22 +140,23 @@ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
|
|
|
126
140
|
0, M, 1, z, 1);
|
|
127
141
|
}
|
|
128
142
|
|
|
129
|
-
/* y = (
|
|
143
|
+
/* y = (R_x + P + A' R_y^{-1} A) x */
|
|
130
144
|
static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
|
131
145
|
/* x and y MUST already be loaded to GPU */
|
|
132
146
|
scs_float *z = p->tmp_m; /* temp memory */
|
|
133
|
-
cudaMemset(y, 0, p->n * sizeof(scs_float));
|
|
134
147
|
cudaMemset(z, 0, p->m * sizeof(scs_float));
|
|
135
148
|
|
|
136
149
|
cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
|
|
137
150
|
cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
|
|
138
151
|
cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
|
|
139
152
|
|
|
140
|
-
/* y =
|
|
141
|
-
|
|
153
|
+
/* y = x */
|
|
154
|
+
cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
155
|
+
/* y = R_x * x */
|
|
156
|
+
scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
|
|
142
157
|
|
|
143
158
|
if (p->Pg) {
|
|
144
|
-
/* y =
|
|
159
|
+
/* y = R_x * x + P x */
|
|
145
160
|
SCS(accum_by_p_gpu)
|
|
146
161
|
(p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
|
147
162
|
&p->buffer);
|
|
@@ -158,9 +173,9 @@ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
|
|
158
173
|
&p->buffer);
|
|
159
174
|
#endif
|
|
160
175
|
/* z = R_y^{-1} A x */
|
|
161
|
-
scale_by_diag(p->cublas_handle, p->
|
|
176
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);
|
|
162
177
|
|
|
163
|
-
/* y += A'z => y =
|
|
178
|
+
/* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
|
|
164
179
|
SCS(accum_by_atrans_gpu)
|
|
165
180
|
(p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
|
|
166
181
|
&p->buffer);
|
|
@@ -201,19 +216,35 @@ static csc *fill_p_matrix(const ScsMatrix *P) {
|
|
|
201
216
|
}
|
|
202
217
|
|
|
203
218
|
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
204
|
-
scs_float *
|
|
219
|
+
const scs_float *diag_r) {
|
|
205
220
|
cudaError_t err;
|
|
206
|
-
scs_int i;
|
|
207
221
|
csc *P_full;
|
|
208
|
-
ScsLinSysWork *p =
|
|
209
|
-
ScsGpuMatrix *Ag =
|
|
222
|
+
ScsLinSysWork *p = SCS_NULL;
|
|
223
|
+
ScsGpuMatrix *Ag = SCS_NULL;
|
|
210
224
|
ScsGpuMatrix *Pg = SCS_NULL;
|
|
225
|
+
int device_count;
|
|
226
|
+
|
|
227
|
+
err = cudaGetDeviceCount(&device_count);
|
|
228
|
+
if (err > 0) {
|
|
229
|
+
scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
|
|
230
|
+
return SCS_NULL;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
|
234
|
+
Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
|
|
235
|
+
|
|
236
|
+
p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
|
|
237
|
+
p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
|
|
238
|
+
|
|
239
|
+
p->A = A;
|
|
240
|
+
p->P = P;
|
|
241
|
+
p->m = A->m;
|
|
242
|
+
p->n = A->n;
|
|
211
243
|
|
|
212
244
|
#if GPU_TRANSPOSE_MAT > 0
|
|
213
245
|
size_t new_buffer_size = 0;
|
|
214
246
|
#endif
|
|
215
247
|
|
|
216
|
-
p->rho_x = rho_x;
|
|
217
248
|
p->cublas_handle = 0;
|
|
218
249
|
p->cusparse_handle = 0;
|
|
219
250
|
|
|
@@ -242,8 +273,9 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
|
242
273
|
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
|
|
243
274
|
cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
|
|
244
275
|
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
|
|
245
|
-
cudaMalloc((void **)&p->
|
|
246
|
-
cudaMalloc((void **)&p->
|
|
276
|
+
cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
|
|
277
|
+
cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
|
|
278
|
+
cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));
|
|
247
279
|
|
|
248
280
|
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
|
|
249
281
|
cudaMemcpyHostToDevice);
|
|
@@ -251,12 +283,6 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
|
251
283
|
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
|
|
252
284
|
cudaMemcpyHostToDevice);
|
|
253
285
|
|
|
254
|
-
p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
|
|
255
|
-
for (i = 0; i < A->m; ++i)
|
|
256
|
-
p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
|
|
257
|
-
cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
|
|
258
|
-
cudaMemcpyHostToDevice);
|
|
259
|
-
|
|
260
286
|
cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
|
|
261
287
|
SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
|
|
262
288
|
CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
|
|
@@ -297,7 +323,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
|
297
323
|
cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
|
|
298
324
|
cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
|
|
299
325
|
|
|
300
|
-
|
|
326
|
+
/* Form preconditioner and copy R_x, 1/R_y to gpu */
|
|
327
|
+
SCS(update_lin_sys_diag_r)(p, diag_r);
|
|
301
328
|
|
|
302
329
|
#if GPU_TRANSPOSE_MAT > 0
|
|
303
330
|
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
|
@@ -346,9 +373,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
|
|
|
346
373
|
return p;
|
|
347
374
|
}
|
|
348
375
|
|
|
349
|
-
/* solves (
|
|
350
|
-
* b */
|
|
351
|
-
/* on GPU */
|
|
376
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
|
|
377
|
+
* b, on GPU */
|
|
352
378
|
static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
353
379
|
scs_int max_its, scs_float tol) {
|
|
354
380
|
scs_int i, n = pr->n;
|
|
@@ -386,7 +412,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
|
386
412
|
|
|
387
413
|
/* z = M r */
|
|
388
414
|
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
389
|
-
scale_by_diag(cublas_handle, pr->
|
|
415
|
+
scale_by_diag(cublas_handle, pr->M_gpu, z, n);
|
|
390
416
|
/* ztr = z'r */
|
|
391
417
|
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
|
392
418
|
/* p = z */
|
|
@@ -415,7 +441,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
|
415
441
|
}
|
|
416
442
|
/* z = M r */
|
|
417
443
|
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
418
|
-
scale_by_diag(cublas_handle, pr->
|
|
444
|
+
scale_by_diag(cublas_handle, pr->M_gpu, z, n);
|
|
419
445
|
ztr_prev = ztr;
|
|
420
446
|
/* ztr = z'r */
|
|
421
447
|
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
|
|
@@ -431,14 +457,12 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
|
431
457
|
/* solves Mx = b, for x but stores result in b */
|
|
432
458
|
/* s contains warm-start (if available) */
|
|
433
459
|
/*
|
|
434
|
-
* [x] = [
|
|
460
|
+
* [x] = [R_x + P A' ]^{-1} [rx]
|
|
435
461
|
* [y] [ A -R_y ] [ry]
|
|
436
462
|
*
|
|
437
|
-
* R_y = diag(rho_y_vec)
|
|
438
|
-
*
|
|
439
463
|
* becomes:
|
|
440
464
|
*
|
|
441
|
-
* x = (
|
|
465
|
+
* x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
|
|
442
466
|
* y = R_y^{-1} (Ax - ry)
|
|
443
467
|
*
|
|
444
468
|
*/
|
|
@@ -451,7 +475,6 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
|
451
475
|
scs_float *bg = p->bg;
|
|
452
476
|
scs_float *tmp_m = p->tmp_m;
|
|
453
477
|
ScsGpuMatrix *Ag = p->Ag;
|
|
454
|
-
ScsGpuMatrix *Pg = p->Pg;
|
|
455
478
|
|
|
456
479
|
if (CG_NORM(b, p->n + p->m) <= 1e-12) {
|
|
457
480
|
memset(b, 0, (p->n + p->m) * sizeof(scs_float));
|
|
@@ -471,7 +494,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
|
471
494
|
cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
|
|
472
495
|
cudaMemcpyDeviceToDevice);
|
|
473
496
|
/* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
|
|
474
|
-
scale_by_diag(p->cublas_handle, p->
|
|
497
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
|
|
475
498
|
|
|
476
499
|
cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
|
|
477
500
|
cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
|
|
@@ -483,7 +506,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
|
483
506
|
/* set max_iters to 10 * n (though in theory n is enough for any tol) */
|
|
484
507
|
max_iters = 10 * Ag->n;
|
|
485
508
|
|
|
486
|
-
/* solves (
|
|
509
|
+
/* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
|
|
487
510
|
* in bg */
|
|
488
511
|
cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
|
|
489
512
|
|
|
@@ -504,7 +527,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
|
504
527
|
#endif
|
|
505
528
|
|
|
506
529
|
/* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
|
|
507
|
-
scale_by_diag(p->cublas_handle, p->
|
|
530
|
+
scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
|
|
508
531
|
|
|
509
532
|
/* copy bg = [x; y] back to b */
|
|
510
533
|
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
|
|
@@ -15,19 +15,20 @@ struct SCS_LIN_SYS_WORK {
|
|
|
15
15
|
scs_int n, m; /* linear system dimensions */
|
|
16
16
|
/* reporting */
|
|
17
17
|
scs_int tot_cg_its;
|
|
18
|
+
scs_float *M; /* preconditioner on cpu */
|
|
18
19
|
/* ALL BELOW HOSTED ON THE GPU */
|
|
19
|
-
scs_float *p;
|
|
20
|
-
scs_float *r;
|
|
21
|
-
scs_float *Gp;
|
|
22
|
-
scs_float *bg;
|
|
23
|
-
scs_float *tmp_m;
|
|
24
|
-
scs_float *z;
|
|
25
|
-
scs_float *
|
|
20
|
+
scs_float *p; /* cg iterate, n */
|
|
21
|
+
scs_float *r; /* cg residual, n */
|
|
22
|
+
scs_float *Gp; /* G * p, n */
|
|
23
|
+
scs_float *bg; /* b, n */
|
|
24
|
+
scs_float *tmp_m; /* m, used in mat_vec */
|
|
25
|
+
scs_float *z; /* preconditioned */
|
|
26
|
+
scs_float *M_gpu; /* preconditioner */
|
|
26
27
|
const ScsMatrix *A; /* does *not* own this memory */
|
|
27
28
|
const ScsMatrix *P; /* does *not* own this memory */
|
|
28
|
-
ScsGpuMatrix *Ag;
|
|
29
|
-
ScsGpuMatrix *Agt;
|
|
30
|
-
ScsGpuMatrix *Pg;
|
|
29
|
+
ScsGpuMatrix *Ag; /* A matrix on GPU */
|
|
30
|
+
ScsGpuMatrix *Agt; /* A trans matrix on GPU */
|
|
31
|
+
ScsGpuMatrix *Pg; /* P matrix on GPU */
|
|
31
32
|
/* CUDA */
|
|
32
33
|
cublasHandle_t cublas_handle;
|
|
33
34
|
cusparseHandle_t cusparse_handle;
|
|
@@ -39,9 +40,9 @@ struct SCS_LIN_SYS_WORK {
|
|
|
39
40
|
cusparseDnVecDescr_t dn_vec_n_p; /* Dense vector of length n */
|
|
40
41
|
|
|
41
42
|
/* rho terms */
|
|
42
|
-
scs_float
|
|
43
|
-
scs_float *
|
|
44
|
-
scs_float *
|
|
43
|
+
scs_float *r_x_gpu;
|
|
44
|
+
scs_float *inv_r_y; /* inverse R_y */
|
|
45
|
+
scs_float *inv_r_y_gpu; /* inverse R_y on GPU */
|
|
45
46
|
};
|
|
46
47
|
|
|
47
48
|
#ifdef __cplusplus
|