scs 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +12 -0
- data/LICENSE.txt +22 -0
- data/README.md +98 -0
- data/ext/scs/extconf.rb +29 -0
- data/lib/scs.rb +17 -0
- data/lib/scs/ffi.rb +117 -0
- data/lib/scs/solver.rb +173 -0
- data/lib/scs/version.rb +3 -0
- data/vendor/scs/LICENSE.txt +21 -0
- data/vendor/scs/Makefile +164 -0
- data/vendor/scs/README.md +222 -0
- data/vendor/scs/include/aa.h +56 -0
- data/vendor/scs/include/cones.h +46 -0
- data/vendor/scs/include/ctrlc.h +33 -0
- data/vendor/scs/include/glbopts.h +177 -0
- data/vendor/scs/include/linalg.h +26 -0
- data/vendor/scs/include/linsys.h +64 -0
- data/vendor/scs/include/normalize.h +18 -0
- data/vendor/scs/include/rw.h +17 -0
- data/vendor/scs/include/scs.h +161 -0
- data/vendor/scs/include/scs_blas.h +51 -0
- data/vendor/scs/include/util.h +65 -0
- data/vendor/scs/linsys/amatrix.c +305 -0
- data/vendor/scs/linsys/amatrix.h +36 -0
- data/vendor/scs/linsys/amatrix.o +0 -0
- data/vendor/scs/linsys/cpu/direct/private.c +366 -0
- data/vendor/scs/linsys/cpu/direct/private.h +26 -0
- data/vendor/scs/linsys/cpu/direct/private.o +0 -0
- data/vendor/scs/linsys/cpu/indirect/private.c +256 -0
- data/vendor/scs/linsys/cpu/indirect/private.h +31 -0
- data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
- data/vendor/scs/linsys/external/amd/LICENSE.txt +934 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +469 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +254 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
- data/vendor/scs/linsys/external/amd/amd.h +400 -0
- data/vendor/scs/linsys/external/amd/amd_1.c +180 -0
- data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_2.c +1842 -0
- data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_aat.c +184 -0
- data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_control.c +64 -0
- data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.c +37 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_dump.c +179 -0
- data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_global.c +16 -0
- data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_info.c +119 -0
- data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_internal.h +304 -0
- data/vendor/scs/linsys/external/amd/amd_order.c +199 -0
- data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.c +120 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.c +206 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.c +118 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_valid.c +92 -0
- data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
- data/vendor/scs/linsys/external/amd/changes +11 -0
- data/vendor/scs/linsys/external/qdldl/LICENSE +201 -0
- data/vendor/scs/linsys/external/qdldl/README.md +120 -0
- data/vendor/scs/linsys/external/qdldl/changes +4 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.c +298 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.h +177 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
- data/vendor/scs/linsys/external/qdldl/qdldl_types.h +21 -0
- data/vendor/scs/linsys/gpu/gpu.c +41 -0
- data/vendor/scs/linsys/gpu/gpu.h +85 -0
- data/vendor/scs/linsys/gpu/indirect/private.c +304 -0
- data/vendor/scs/linsys/gpu/indirect/private.h +36 -0
- data/vendor/scs/scs.mk +181 -0
- data/vendor/scs/src/aa.c +224 -0
- data/vendor/scs/src/aa.o +0 -0
- data/vendor/scs/src/cones.c +802 -0
- data/vendor/scs/src/cones.o +0 -0
- data/vendor/scs/src/ctrlc.c +77 -0
- data/vendor/scs/src/ctrlc.o +0 -0
- data/vendor/scs/src/linalg.c +84 -0
- data/vendor/scs/src/linalg.o +0 -0
- data/vendor/scs/src/normalize.c +93 -0
- data/vendor/scs/src/normalize.o +0 -0
- data/vendor/scs/src/rw.c +167 -0
- data/vendor/scs/src/rw.o +0 -0
- data/vendor/scs/src/scs.c +978 -0
- data/vendor/scs/src/scs.o +0 -0
- data/vendor/scs/src/scs_version.c +5 -0
- data/vendor/scs/src/scs_version.o +0 -0
- data/vendor/scs/src/util.c +196 -0
- data/vendor/scs/src/util.o +0 -0
- data/vendor/scs/test/data/small_random_socp +0 -0
- data/vendor/scs/test/minunit.h +13 -0
- data/vendor/scs/test/problem_utils.h +93 -0
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +85 -0
- data/vendor/scs/test/problems/small_lp.h +50 -0
- data/vendor/scs/test/problems/small_random_socp.h +33 -0
- data/vendor/scs/test/random_socp_prob.c +171 -0
- data/vendor/scs/test/run_from_file.c +69 -0
- data/vendor/scs/test/run_tests +2 -0
- data/vendor/scs/test/run_tests.c +32 -0
- metadata +203 -0
@@ -0,0 +1,177 @@
|
|
1
|
+
#ifndef QDLDL_H
|
2
|
+
#define QDLDL_H
|
3
|
+
|
4
|
+
// Include qdldl type options
|
5
|
+
#include "qdldl_types.h"
|
6
|
+
|
7
|
+
# ifdef __cplusplus
|
8
|
+
extern "C" {
|
9
|
+
# endif // ifdef __cplusplus
|
10
|
+
|
11
|
+
/**
|
12
|
+
* Compute the elimination tree for a quasidefinite matrix
|
13
|
+
* in compressed sparse column form, where the input matrix is
|
14
|
+
* assumed to contain data for the upper triangular part of A only,
|
15
|
+
* and there are no duplicate indices.
|
16
|
+
*
|
17
|
+
* Returns an elimination tree for the factorization A = LDL^T and a
|
18
|
+
* count of the nonzeros in each column of L that are strictly below the
|
19
|
+
* diagonal.
|
20
|
+
*
|
21
|
+
* Does not use MALLOC. It is assumed that the arrays work, Lnz, and
|
22
|
+
* etree will be allocated with a number of elements equal to n.
|
23
|
+
*
|
24
|
+
* The data in (n,Ap,Ai) are from a square matrix A in CSC format, and
|
25
|
+
* should include the upper triangular part of A only.
|
26
|
+
*
|
27
|
+
* This function is only intended for factorisation of QD matrices specified
|
28
|
+
* by their upper triangular part. An error is returned if any column has
|
29
|
+
* data below the diagonal or is completely empty.
|
30
|
+
*
|
31
|
+
* For matrices with a non-empty column but a zero on the corresponding diagonal,
|
32
|
+
* this function will *not* return an error, as it may still be possible to factor
|
33
|
+
* such a matrix in LDL form. No promises are made in this case though...
|
34
|
+
*
|
35
|
+
* @param n number of columns in CSC matrix A (assumed square)
|
36
|
+
* @param Ap column pointers (size n+1) for columns of A
|
37
|
+
* @param Ai row indices of A. Has Ap[n] elements
|
38
|
+
* @param work work vector (size n) (no meaning on return)
|
39
|
+
* @param Lnz count of nonzeros in each column of L (size n) below diagonal
|
40
|
+
* @param etree elimination tree (size n)
|
41
|
+
* @return total sum of Lnz (i.e. total nonzeros in L below diagonal). Returns
|
42
|
+
* -1 if the input does not have triu structure or has an empty
|
43
|
+
* column.
|
44
|
+
*
|
45
|
+
*
|
46
|
+
*/
|
47
|
+
|
48
|
+
QDLDL_int QDLDL_etree(const QDLDL_int n,
|
49
|
+
const QDLDL_int* Ap,
|
50
|
+
const QDLDL_int* Ai,
|
51
|
+
QDLDL_int* work,
|
52
|
+
QDLDL_int* Lnz,
|
53
|
+
QDLDL_int* etree);
|
54
|
+
|
55
|
+
/**
|
56
|
+
* Compute an LDL decomposition for a quasidefinite matrix
|
57
|
+
* in compressed sparse column form, where the input matrix is
|
58
|
+
* assumed to contain data for the upper triangular part of A only,
|
59
|
+
* and there are no duplicate indices.
|
60
|
+
*
|
61
|
+
* Returns factors L, D and Dinv = 1./D.
|
62
|
+
*
|
63
|
+
* Does not use MALLOC. It is assumed that L will be a compressed
|
64
|
+
* sparse column matrix with data (Ln,Lp,Li) with sufficient space
|
65
|
+
* allocated, with a number of nonzeros equal to the count given
|
66
|
+
* as a return value by QDLDL_etree
|
67
|
+
*
|
68
|
+
* @param n number of columns in L and A (both square)
|
69
|
+
* @param Ap column pointers (size n+1) for columns of A
|
70
|
+
* @param Ai row indices of A. Has Ap[n] elements
|
71
|
+
* @param Ax data of A. Has Ap[n] elements
|
72
|
+
* @param Lp column pointers (size n+1) for columns of L
|
73
|
+
* @param Li row indices of L. Has Lp[n] elements (the matching values are returned in Lx)
|
74
|
+
* @param D vectorized factor D. Length is n
|
75
|
+
* @param Dinv reciprocal of D. Length is n
|
76
|
+
* @param Lnz count of nonzeros in each column of L below diagonal,
|
77
|
+
* as given by QDLDL_etree (not modified)
|
78
|
+
* @param etree elimination tree as given by QDLDL_etree (not modified)
|
79
|
+
* @param bwork working array of bools. Length is n
|
80
|
+
* @param iwork working array of integers. Length is 3*n
|
81
|
+
* @param fwork working array of floats. Length is n
|
82
|
+
* @return Returns a count of the number of positive elements
|
83
|
+
* in D. Returns -1 and exits immediately if any element
|
84
|
+
* of D evaluates exactly to zero (matrix is not quasidefinite
|
85
|
+
* or otherwise LDL factorisable)
|
86
|
+
*
|
87
|
+
*/
|
88
|
+
|
89
|
+
|
90
|
+
QDLDL_int QDLDL_factor(const QDLDL_int n,
|
91
|
+
const QDLDL_int* Ap,
|
92
|
+
const QDLDL_int* Ai,
|
93
|
+
const QDLDL_float* Ax,
|
94
|
+
QDLDL_int* Lp,
|
95
|
+
QDLDL_int* Li,
|
96
|
+
QDLDL_float* Lx,
|
97
|
+
QDLDL_float* D,
|
98
|
+
QDLDL_float* Dinv,
|
99
|
+
const QDLDL_int* Lnz,
|
100
|
+
const QDLDL_int* etree,
|
101
|
+
QDLDL_bool* bwork,
|
102
|
+
QDLDL_int* iwork,
|
103
|
+
QDLDL_float* fwork);
|
104
|
+
|
105
|
+
|
106
|
+
/**
|
107
|
+
* Solves LDL'x = b
|
108
|
+
*
|
109
|
+
* It is assumed that L will be a compressed
|
110
|
+
* sparse column matrix with data (Ln,Lp,Li).
|
111
|
+
*
|
112
|
+
* @param n number of columns in L (both square)
|
113
|
+
* @param Lx data of L. Has Lp[n] elements
|
114
|
+
* @param Lp column pointers (size n+1) for columns of L
|
115
|
+
* @param Li row indices of L. Has Lp[n] elements
|
116
|
+
* @param Dinv reciprocal of D. Length is n
|
117
|
+
* @param x initialized to b. Equal to x on return
|
118
|
+
*
|
119
|
+
*
|
120
|
+
*/
|
121
|
+
void QDLDL_solve(const QDLDL_int n,
|
122
|
+
const QDLDL_int* Lp,
|
123
|
+
const QDLDL_int* Li,
|
124
|
+
const QDLDL_float* Lx,
|
125
|
+
const QDLDL_float* Dinv,
|
126
|
+
QDLDL_float* x);
|
127
|
+
|
128
|
+
|
129
|
+
/**
|
130
|
+
* Solves (L+I)x = b
|
131
|
+
*
|
132
|
+
* It is assumed that L will be a compressed
|
133
|
+
* sparse column matrix with data (Ln,Lp,Li).
|
134
|
+
*
|
135
|
+
* @param n number of columns in L (both square)
|
136
|
+
* @param Ln number of columns in CSC matrix L
|
137
|
+
* @param Lp column pointers (size Ln+1) for columns of L
|
138
|
+
* @param Li row indices of L. Has Lp[Ln] elements
|
139
|
+
* @param Lx data of L. Has Lp[n] elements
|
140
|
+
* @param x initialized to b. Equal to x on return
|
141
|
+
*
|
142
|
+
*
|
143
|
+
*/
|
144
|
+
|
145
|
+
void QDLDL_Lsolve(const QDLDL_int n,
|
146
|
+
const QDLDL_int* Lp,
|
147
|
+
const QDLDL_int* Li,
|
148
|
+
const QDLDL_float* Lx,
|
149
|
+
QDLDL_float* x);
|
150
|
+
|
151
|
+
/**
|
152
|
+
* Solves (L+I)'x = b
|
153
|
+
*
|
154
|
+
* It is assumed that L will be a compressed
|
155
|
+
* sparse column matrix with data (Ln,Lp,Li).
|
156
|
+
*
|
157
|
+
* @param n number of columns in L (both square)
|
158
|
+
* @param Ln number of columns in CSC matrix L
|
159
|
+
* @param Lp column pointers (size Ln+1) for columns of L
|
160
|
+
* @param Li row indices of L. Has Lp[Ln] elements
|
161
|
+
* @param Lx data of L. Has Lp[n] elements
|
162
|
+
* @param x initialized to b. Equal to x on return
|
163
|
+
*
|
164
|
+
*
|
165
|
+
*/
|
166
|
+
|
167
|
+
void QDLDL_Ltsolve(const QDLDL_int n,
|
168
|
+
const QDLDL_int* Lp,
|
169
|
+
const QDLDL_int* Li,
|
170
|
+
const QDLDL_float* Lx,
|
171
|
+
QDLDL_float* x);
|
172
|
+
|
173
|
+
# ifdef __cplusplus
|
174
|
+
}
|
175
|
+
# endif // ifdef __cplusplus
|
176
|
+
|
177
|
+
#endif // ifndef QDLDL_H
|
Binary file
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#ifndef QDLDL_TYPES_H
|
2
|
+
# define QDLDL_TYPES_H
|
3
|
+
|
4
|
+
#include "glbopts.h"
|
5
|
+
|
6
|
+
# ifdef __cplusplus
|
7
|
+
extern "C" {
|
8
|
+
# endif /* ifdef __cplusplus */
|
9
|
+
|
10
|
+
// QDLDL integer and float types
|
11
|
+
|
12
|
+
#define QDLDL_int scs_int
|
13
|
+
#define QDLDL_float scs_float
|
14
|
+
#define QDLDL_bool scs_int
|
15
|
+
|
16
|
+
# ifdef __cplusplus
|
17
|
+
}
|
18
|
+
# endif /* ifdef __cplusplus */
|
19
|
+
|
20
|
+
#endif /* ifndef QDLDL_TYPES_H */
|
21
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#include "gpu.h"
|
2
|
+
|
3
|
+
/*
 * Device-side accumulate: y += A' * x.
 *
 * Both x and y MUST already live in GPU memory. The call is issued as a
 * non-transposed csrmv over an (Ag->n x Ag->m) matrix built from the arrays
 * stored in Ag.
 */
void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
                               scs_float *y, cusparseHandle_t cusparse_handle) {
  /* both scaling factors are 1 so the product is added onto the existing y */
  const scs_float alpha = 1.0;
  const scs_float beta = 1.0;
  CUSPARSE(csrmv)
  (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, Ag->n, Ag->m, Ag->Annz,
   &alpha, Ag->descr, Ag->x, Ag->p, Ag->i, x, &beta, y);
}
|
13
|
+
|
14
|
+
/*
 * Device-side accumulate: y += A * x.
 *
 * Both x and y MUST already live in GPU memory. The call is issued as a
 * transposed csrmv over the same (Ag->n x Ag->m) matrix used by
 * _accum_by_atrans_gpu. The index arrays of A must be ORDERED.
 */
void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
                          scs_float *y, cusparseHandle_t cusparse_handle) {
  /* both scaling factors are 1 so the product is added onto the existing y */
  const scs_float alpha = 1.0;
  const scs_float beta = 1.0;
  CUSPARSE(csrmv)
  (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, Ag->n, Ag->m, Ag->Annz,
   &alpha, Ag->descr, Ag->x, Ag->p, Ag->i, x, &beta, y);
}
|
25
|
+
|
26
|
+
/*
 * Releases the device arrays and the cuSPARSE matrix descriptor owned by A.
 *
 * The ScsGpuMatrix struct itself is NOT freed; that remains the caller's job.
 * Passing a null pointer is a no-op.
 */
void SCS(free_gpu_matrix)(ScsGpuMatrix *A) {
  if (!A) {
    return;
  }
  cudaFree(A->x);
  cudaFree(A->i);
  cudaFree(A->p);
  cusparseDestroyMatDescr(A->descr);
}
|
32
|
+
|
33
|
+
void SCS(normalize_a)(ScsMatrix *A, const ScsSettings *stgs, const ScsCone *k,
|
34
|
+
ScsScaling *scal) {
|
35
|
+
SCS(_normalize_a)(A, stgs, k, scal);
|
36
|
+
}
|
37
|
+
|
38
|
+
void SCS(un_normalize_a)(ScsMatrix *A, const ScsSettings *stgs,
|
39
|
+
const ScsScaling *scal) {
|
40
|
+
SCS(_un_normalize_a)(A, stgs, scal);
|
41
|
+
}
|
@@ -0,0 +1,85 @@
|
|
1
|
+
#ifndef SCSGPU_H_GUARD
|
2
|
+
#define SCSGPU_H_GUARD
|
3
|
+
|
4
|
+
#ifdef __cplusplus
|
5
|
+
extern "C" {
|
6
|
+
#endif
|
7
|
+
|
8
|
+
#include <cublas_v2.h>
|
9
|
+
#include <cuda.h>
|
10
|
+
#include <cuda_runtime_api.h>
|
11
|
+
#include <cusparse.h>
|
12
|
+
|
13
|
+
#include "amatrix.h"
|
14
|
+
#include "glbopts.h"
|
15
|
+
#include "linalg.h"
|
16
|
+
#include "linsys.h"
|
17
|
+
#include "scs.h"
|
18
|
+
#include "util.h"
|
19
|
+
|
20
|
+
#define CUDA_CHECK_ERR \
|
21
|
+
do { \
|
22
|
+
cudaError_t err = cudaGetLastError(); \
|
23
|
+
if (err != cudaSuccess) { \
|
24
|
+
printf("%s:%d:%s\n ERROR_CUDA: %s\n", __FILE__, __LINE__, __func__, \
|
25
|
+
cudaGetErrorString(err)); \
|
26
|
+
} \
|
27
|
+
} while (0)
|
28
|
+
|
29
|
+
#ifndef EXTRA_VERBOSE
|
30
|
+
#ifndef SFLOAT
|
31
|
+
#define CUBLAS(x) cublasD##x
|
32
|
+
#define CUSPARSE(x) cusparseD##x
|
33
|
+
#else
|
34
|
+
#define CUBLAS(x) cublasS##x
|
35
|
+
#define CUSPARSE(x) cusparseS##x
|
36
|
+
#endif
|
37
|
+
#else
|
38
|
+
#ifndef SFLOAT
|
39
|
+
#define CUBLAS(x) \
|
40
|
+
CUDA_CHECK_ERR; \
|
41
|
+
cublasD##x
|
42
|
+
#define CUSPARSE(x) \
|
43
|
+
CUDA_CHECK_ERR; \
|
44
|
+
cusparseD##x
|
45
|
+
#else
|
46
|
+
#define CUBLAS(x) \
|
47
|
+
CUDA_CHECK_ERR; \
|
48
|
+
cublasS##x
|
49
|
+
#define CUSPARSE(x) \
|
50
|
+
CUDA_CHECK_ERR; \
|
51
|
+
cusparseS##x
|
52
|
+
#endif
|
53
|
+
#endif
|
54
|
+
|
55
|
+
/*
|
56
|
+
CUDA matrix routines only for CSR, not CSC matrices:
|
57
|
+
CSC CSR GPU Mult
|
58
|
+
A (m x n) A' (n x m) Ag accum_by_a_trans_gpu
|
59
|
+
A'(n x m) A (m x n) Agt accum_by_a_gpu
|
60
|
+
*/
|
61
|
+
|
62
|
+
/* this struct defines the data matrix A on GPU */
|
63
|
+
typedef struct SCS_GPU_A_DATA_MATRIX {
|
64
|
+
/* A is supplied in column compressed format */
|
65
|
+
scs_float *x; /* A values, size: NNZ A */
|
66
|
+
scs_int *i; /* A row index, size: NNZ A */
|
67
|
+
scs_int *p; /* A column pointer, size: n+1 */
|
68
|
+
scs_int m, n; /* m rows, n cols */
|
69
|
+
scs_int Annz; /* num non-zeros in A matrix */
|
70
|
+
/* CUDA */
|
71
|
+
cusparseMatDescr_t descr;
|
72
|
+
} ScsGpuMatrix;
|
73
|
+
|
74
|
+
void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *A, const scs_float *x,
|
75
|
+
scs_float *y, cusparseHandle_t cusparse_handle);
|
76
|
+
|
77
|
+
void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *A, const scs_float *x,
|
78
|
+
scs_float *y, cusparseHandle_t cusparse_handle);
|
79
|
+
|
80
|
+
void SCS(free_gpu_matrix)(ScsGpuMatrix *A);
|
81
|
+
|
82
|
+
#ifdef __cplusplus
|
83
|
+
}
|
84
|
+
#endif
|
85
|
+
#endif
|
@@ -0,0 +1,304 @@
|
|
1
|
+
#include "private.h"
|
2
|
+
|
3
|
+
#define CG_BEST_TOL 1e-9
|
4
|
+
#define CG_MIN_TOL 1e-1
|
5
|
+
|
6
|
+
/* do not use within pcg, reuses memory */
|
7
|
+
void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
|
8
|
+
const scs_float *x, scs_float *y) {
|
9
|
+
scs_float *v_m = p->tmp_m;
|
10
|
+
scs_float *v_n = p->r;
|
11
|
+
cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
12
|
+
cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
13
|
+
SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
|
14
|
+
cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
15
|
+
}
|
16
|
+
|
17
|
+
/* do not use within pcg, reuses memory */
|
18
|
+
void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
|
19
|
+
scs_float *y) {
|
20
|
+
scs_float *v_m = p->tmp_m;
|
21
|
+
scs_float *v_n = p->r;
|
22
|
+
cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
23
|
+
cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
24
|
+
#if GPU_TRANSPOSE_MAT > 0
|
25
|
+
SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
|
26
|
+
#else
|
27
|
+
SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
|
28
|
+
#endif
|
29
|
+
cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
30
|
+
}
|
31
|
+
|
32
|
+
char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
|
33
|
+
char *str = (char *)scs_malloc(sizeof(char) * 128);
|
34
|
+
sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
|
35
|
+
(long)A->p[A->n], stgs->cg_rate);
|
36
|
+
return str;
|
37
|
+
}
|
38
|
+
|
39
|
+
/*
 * Builds a summary line with the average number of CG iterations and the
 * average solve time per call, both divided by (info->iter + 1), then resets
 * the accumulated counters in p.
 *
 * Returns a heap-allocated, NUL-terminated string that the caller must release
 * with scs_free, or a null pointer if the allocation fails (the counters are
 * still reset in that case). The text is truncated rather than overflowing if
 * it does not fit the buffer.
 */
char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
  enum { SUMMARY_STR_LEN = 128 };
  char *str = (char *)scs_malloc(sizeof(char) * SUMMARY_STR_LEN);
  if (str) {
    snprintf(str, SUMMARY_STR_LEN,
             "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
             (scs_float)p->tot_cg_its / (info->iter + 1),
             p->total_solve_time / (info->iter + 1) / 1e3);
  }
  /* start the statistics afresh for the next reporting window */
  p->tot_cg_its = 0;
  p->total_solve_time = 0;
  return str;
}
|
49
|
+
|
50
|
+
/*
 * Tears down a linear-system workspace: device vectors, the GPU copies of A
 * (and of its transpose, if present), the cuSPARSE / cuBLAS handles, and
 * finally the workspace struct itself. A null workspace is ignored.
 */
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
  if (!p) {
    return;
  }

  /* device work vectors */
  cudaFree(p->p);
  cudaFree(p->r);
  cudaFree(p->Gp);
  cudaFree(p->bg);
  cudaFree(p->tmp_m);
  cudaFree(p->z);
  cudaFree(p->M);

  /* GPU matrices: release device contents first, then the host-side struct */
  if (p->Ag) {
    SCS(free_gpu_matrix)(p->Ag);
    scs_free(p->Ag);
  }
  if (p->Agt) {
    SCS(free_gpu_matrix)(p->Agt);
    scs_free(p->Agt);
  }

  cusparseDestroy(p->cusparse_handle);
  cublasDestroy(p->cublas_handle);
  /* The device is deliberately not reset (no cudaDeviceReset) because that
   * interferes with other GPU programs. */
  scs_free(p);
}
|
74
|
+
|
75
|
+
/*y = (RHO_X * I + A'A)x */
|
76
|
+
static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
|
77
|
+
ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
78
|
+
/* x and y MUST already be loaded to GPU */
|
79
|
+
scs_float *tmp_m = p->tmp_m; /* temp memory */
|
80
|
+
cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
|
81
|
+
SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
|
82
|
+
cudaMemset(y, 0, A->n * sizeof(scs_float));
|
83
|
+
SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
|
84
|
+
CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
|
85
|
+
}
|
86
|
+
|
87
|
+
/* M = inv ( diag ( RHO_X * I + A'A ) ) */
|
88
|
+
static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
|
89
|
+
ScsLinSysWork *p) {
|
90
|
+
scs_int i;
|
91
|
+
scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
|
92
|
+
|
93
|
+
#if EXTRA_VERBOSE > 0
|
94
|
+
scs_printf("getting pre-conditioner\n");
|
95
|
+
#endif
|
96
|
+
|
97
|
+
for (i = 0; i < A->n; ++i) {
|
98
|
+
M[i] = 1 / (stgs->rho_x +
|
99
|
+
SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
|
100
|
+
/* M[i] = 1; */
|
101
|
+
}
|
102
|
+
cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
103
|
+
scs_free(M);
|
104
|
+
|
105
|
+
#if EXTRA_VERBOSE > 0
|
106
|
+
scs_printf("finished getting pre-conditioner\n");
|
107
|
+
#endif
|
108
|
+
}
|
109
|
+
|
110
|
+
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
111
|
+
const ScsSettings *stgs) {
|
112
|
+
cudaError_t err;
|
113
|
+
ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
114
|
+
ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
115
|
+
|
116
|
+
p->cublas_handle = 0;
|
117
|
+
p->cusparse_handle = 0;
|
118
|
+
|
119
|
+
p->total_solve_time = 0;
|
120
|
+
p->tot_cg_its = 0;
|
121
|
+
|
122
|
+
/* Get handle to the CUBLAS context */
|
123
|
+
cublasCreate(&p->cublas_handle);
|
124
|
+
|
125
|
+
/* Get handle to the CUSPARSE context */
|
126
|
+
cusparseCreate(&p->cusparse_handle);
|
127
|
+
|
128
|
+
Ag->n = A->n;
|
129
|
+
Ag->m = A->m;
|
130
|
+
Ag->Annz = A->p[A->n];
|
131
|
+
Ag->descr = 0;
|
132
|
+
/* Matrix description */
|
133
|
+
cusparseCreateMatDescr(&Ag->descr);
|
134
|
+
cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
135
|
+
cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
|
136
|
+
p->Ag = Ag;
|
137
|
+
p->Agt = SCS_NULL;
|
138
|
+
|
139
|
+
cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
|
140
|
+
cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
|
141
|
+
cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
|
142
|
+
|
143
|
+
cudaMalloc((void **)&p->p, A->n * sizeof(scs_float));
|
144
|
+
cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
|
145
|
+
cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
|
146
|
+
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
|
147
|
+
cudaMalloc((void **)&p->tmp_m,
|
148
|
+
A->m * sizeof(scs_float)); /* intermediate result */
|
149
|
+
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
|
150
|
+
cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
|
151
|
+
|
152
|
+
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
|
153
|
+
cudaMemcpyHostToDevice);
|
154
|
+
cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
|
155
|
+
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
|
156
|
+
cudaMemcpyHostToDevice);
|
157
|
+
|
158
|
+
get_preconditioner(A, stgs, p);
|
159
|
+
|
160
|
+
#if GPU_TRANSPOSE_MAT > 0
|
161
|
+
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
162
|
+
p->Agt->n = A->m;
|
163
|
+
p->Agt->m = A->n;
|
164
|
+
p->Agt->Annz = A->p[A->n];
|
165
|
+
p->Agt->descr = 0;
|
166
|
+
/* Matrix description */
|
167
|
+
cusparseCreateMatDescr(&p->Agt->descr);
|
168
|
+
cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
169
|
+
cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
|
170
|
+
|
171
|
+
cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
|
172
|
+
cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
|
173
|
+
cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
|
174
|
+
/* transpose Ag into Agt for faster multiplies */
|
175
|
+
/* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
|
176
|
+
CUSPARSE(csr2csc)
|
177
|
+
(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
|
178
|
+
p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
|
179
|
+
#endif
|
180
|
+
|
181
|
+
err = cudaGetLastError();
|
182
|
+
if (err != cudaSuccess) {
|
183
|
+
printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
|
184
|
+
cudaGetErrorString(err));
|
185
|
+
SCS(free_lin_sys_work)(p);
|
186
|
+
return SCS_NULL;
|
187
|
+
}
|
188
|
+
return p;
|
189
|
+
}
|
190
|
+
|
191
|
+
/*
 * z = M .* r on the device: r is copied into z, and z is then multiplied in
 * place by M through a banded triangular product (tbmv) with zero off-diagonal
 * bands, i.e. with M treated as a diagonal matrix.
 */
static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
                                  scs_float *z, scs_float *r, scs_int n) {
  const int num_off_diags = 0; /* tbmv bandwidth: diagonal only */
  const int lda = 1;           /* one stored band */
  const int incz = 1;          /* unit stride through z */

  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
  CUBLAS(tbmv)
  (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
   num_off_diags, M, lda, z, incz);
}
|
198
|
+
|
199
|
+
/* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
|
200
|
+
static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
|
201
|
+
ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
202
|
+
scs_int max_its, scs_float tol) {
|
203
|
+
scs_int i, n = A->n;
|
204
|
+
scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
|
205
|
+
scs_float onef = 1.0, neg_onef = -1.0;
|
206
|
+
scs_float *p = pr->p; /* cg direction */
|
207
|
+
scs_float *Gp = pr->Gp; /* updated CG direction */
|
208
|
+
scs_float *r = pr->r; /* cg residual */
|
209
|
+
scs_float *z = pr->z; /* preconditioned */
|
210
|
+
scs_float *M = pr->M; /* preconditioner */
|
211
|
+
cublasHandle_t cublas_handle = pr->cublas_handle;
|
212
|
+
|
213
|
+
if (s == SCS_NULL) {
|
214
|
+
cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
215
|
+
cudaMemset(bg, 0, n * sizeof(scs_float));
|
216
|
+
} else {
|
217
|
+
/* p contains bg temporarily */
|
218
|
+
cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
219
|
+
/* bg contains s */
|
220
|
+
cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
221
|
+
mat_vec(A, stgs, pr, bg, r);
|
222
|
+
CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
|
223
|
+
CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
|
224
|
+
}
|
225
|
+
|
226
|
+
/* for some reason nrm2 is VERY slow */
|
227
|
+
/* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
|
228
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
|
229
|
+
nrm_r = SQRTF(nrm_r);
|
230
|
+
/* check to see if we need to run CG at all */
|
231
|
+
if (nrm_r < MIN(tol, 1e-18)) {
|
232
|
+
return 0;
|
233
|
+
}
|
234
|
+
|
235
|
+
apply_pre_conditioner(cublas_handle, M, z, r, n);
|
236
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
|
237
|
+
/* put z in p, replacing temp mem */
|
238
|
+
cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
239
|
+
|
240
|
+
for (i = 0; i < max_its; ++i) {
|
241
|
+
mat_vec(A, stgs, pr, p, Gp);
|
242
|
+
|
243
|
+
CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
|
244
|
+
|
245
|
+
alpha = ipzr / p_gp;
|
246
|
+
neg_alpha = -alpha;
|
247
|
+
|
248
|
+
CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
|
249
|
+
CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
|
250
|
+
|
251
|
+
/* for some reason nrm2 is VERY slow */
|
252
|
+
/* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
|
253
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
|
254
|
+
nrm_r = SQRTF(nrm_r);
|
255
|
+
if (nrm_r < tol) {
|
256
|
+
i++;
|
257
|
+
break;
|
258
|
+
}
|
259
|
+
ipzr_old = ipzr;
|
260
|
+
apply_pre_conditioner(cublas_handle, M, z, r, n);
|
261
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
|
262
|
+
|
263
|
+
beta = ipzr / ipzr_old;
|
264
|
+
CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
|
265
|
+
CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
|
266
|
+
}
|
267
|
+
#if EXTRA_VERBOSE > 0
|
268
|
+
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
|
269
|
+
#endif
|
270
|
+
return i;
|
271
|
+
}
|
272
|
+
|
273
|
+
scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
|
274
|
+
ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
275
|
+
scs_int iter) {
|
276
|
+
scs_int cg_its;
|
277
|
+
SCS(timer) linsys_timer;
|
278
|
+
scs_float *bg = p->bg;
|
279
|
+
scs_float neg_onef = -1.0;
|
280
|
+
ScsGpuMatrix *Ag = p->Ag;
|
281
|
+
scs_float cg_tol =
|
282
|
+
SCS(norm)(b, Ag->n) *
|
283
|
+
(iter < 0 ? CG_BEST_TOL
|
284
|
+
: CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
|
285
|
+
SCS(tic)(&linsys_timer);
|
286
|
+
/* all on GPU */
|
287
|
+
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
|
288
|
+
SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
|
289
|
+
/* solves (I+A'A)x = b, s warm start, solution stored in b */
|
290
|
+
cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
|
291
|
+
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
|
292
|
+
SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
|
293
|
+
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
294
|
+
|
295
|
+
if (iter >= 0) {
|
296
|
+
p->tot_cg_its += cg_its;
|
297
|
+
}
|
298
|
+
|
299
|
+
p->total_solve_time += SCS(tocq)(&linsys_timer);
|
300
|
+
#if EXTRAVERBOSE > 0
|
301
|
+
scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
|
302
|
+
#endif
|
303
|
+
return 0;
|
304
|
+
}
|