scs 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +12 -0
- data/LICENSE.txt +22 -0
- data/README.md +98 -0
- data/ext/scs/extconf.rb +29 -0
- data/lib/scs.rb +17 -0
- data/lib/scs/ffi.rb +117 -0
- data/lib/scs/solver.rb +173 -0
- data/lib/scs/version.rb +3 -0
- data/vendor/scs/LICENSE.txt +21 -0
- data/vendor/scs/Makefile +164 -0
- data/vendor/scs/README.md +222 -0
- data/vendor/scs/include/aa.h +56 -0
- data/vendor/scs/include/cones.h +46 -0
- data/vendor/scs/include/ctrlc.h +33 -0
- data/vendor/scs/include/glbopts.h +177 -0
- data/vendor/scs/include/linalg.h +26 -0
- data/vendor/scs/include/linsys.h +64 -0
- data/vendor/scs/include/normalize.h +18 -0
- data/vendor/scs/include/rw.h +17 -0
- data/vendor/scs/include/scs.h +161 -0
- data/vendor/scs/include/scs_blas.h +51 -0
- data/vendor/scs/include/util.h +65 -0
- data/vendor/scs/linsys/amatrix.c +305 -0
- data/vendor/scs/linsys/amatrix.h +36 -0
- data/vendor/scs/linsys/amatrix.o +0 -0
- data/vendor/scs/linsys/cpu/direct/private.c +366 -0
- data/vendor/scs/linsys/cpu/direct/private.h +26 -0
- data/vendor/scs/linsys/cpu/direct/private.o +0 -0
- data/vendor/scs/linsys/cpu/indirect/private.c +256 -0
- data/vendor/scs/linsys/cpu/indirect/private.h +31 -0
- data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
- data/vendor/scs/linsys/external/amd/LICENSE.txt +934 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +469 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +254 -0
- data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
- data/vendor/scs/linsys/external/amd/amd.h +400 -0
- data/vendor/scs/linsys/external/amd/amd_1.c +180 -0
- data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_2.c +1842 -0
- data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_aat.c +184 -0
- data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_control.c +64 -0
- data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.c +37 -0
- data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_dump.c +179 -0
- data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_global.c +16 -0
- data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_info.c +119 -0
- data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_internal.h +304 -0
- data/vendor/scs/linsys/external/amd/amd_order.c +199 -0
- data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.c +120 -0
- data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.c +206 -0
- data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.c +118 -0
- data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
- data/vendor/scs/linsys/external/amd/amd_valid.c +92 -0
- data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
- data/vendor/scs/linsys/external/amd/changes +11 -0
- data/vendor/scs/linsys/external/qdldl/LICENSE +201 -0
- data/vendor/scs/linsys/external/qdldl/README.md +120 -0
- data/vendor/scs/linsys/external/qdldl/changes +4 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.c +298 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.h +177 -0
- data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
- data/vendor/scs/linsys/external/qdldl/qdldl_types.h +21 -0
- data/vendor/scs/linsys/gpu/gpu.c +41 -0
- data/vendor/scs/linsys/gpu/gpu.h +85 -0
- data/vendor/scs/linsys/gpu/indirect/private.c +304 -0
- data/vendor/scs/linsys/gpu/indirect/private.h +36 -0
- data/vendor/scs/scs.mk +181 -0
- data/vendor/scs/src/aa.c +224 -0
- data/vendor/scs/src/aa.o +0 -0
- data/vendor/scs/src/cones.c +802 -0
- data/vendor/scs/src/cones.o +0 -0
- data/vendor/scs/src/ctrlc.c +77 -0
- data/vendor/scs/src/ctrlc.o +0 -0
- data/vendor/scs/src/linalg.c +84 -0
- data/vendor/scs/src/linalg.o +0 -0
- data/vendor/scs/src/normalize.c +93 -0
- data/vendor/scs/src/normalize.o +0 -0
- data/vendor/scs/src/rw.c +167 -0
- data/vendor/scs/src/rw.o +0 -0
- data/vendor/scs/src/scs.c +978 -0
- data/vendor/scs/src/scs.o +0 -0
- data/vendor/scs/src/scs_version.c +5 -0
- data/vendor/scs/src/scs_version.o +0 -0
- data/vendor/scs/src/util.c +196 -0
- data/vendor/scs/src/util.o +0 -0
- data/vendor/scs/test/data/small_random_socp +0 -0
- data/vendor/scs/test/minunit.h +13 -0
- data/vendor/scs/test/problem_utils.h +93 -0
- data/vendor/scs/test/problems/rob_gauss_cov_est.h +85 -0
- data/vendor/scs/test/problems/small_lp.h +50 -0
- data/vendor/scs/test/problems/small_random_socp.h +33 -0
- data/vendor/scs/test/random_socp_prob.c +171 -0
- data/vendor/scs/test/run_from_file.c +69 -0
- data/vendor/scs/test/run_tests +2 -0
- data/vendor/scs/test/run_tests.c +32 -0
- metadata +203 -0
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
#ifndef QDLDL_H
|
|
2
|
+
#define QDLDL_H
|
|
3
|
+
|
|
4
|
+
// Include qdldl type options
|
|
5
|
+
#include "qdldl_types.h"
|
|
6
|
+
|
|
7
|
+
# ifdef __cplusplus
|
|
8
|
+
extern "C" {
|
|
9
|
+
# endif // ifdef __cplusplus
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Compute the elimination tree for a quasidefinite matrix
|
|
13
|
+
* in compressed sparse column form, where the input matrix is
|
|
14
|
+
* assumed to contain data for the upper triangular part of A only,
|
|
15
|
+
* and there are no duplicate indices.
|
|
16
|
+
*
|
|
17
|
+
* Returns an elimination tree for the factorization A = LDL^T and a
|
|
18
|
+
* count of the nonzeros in each column of L that are strictly below the
|
|
19
|
+
* diagonal.
|
|
20
|
+
*
|
|
21
|
+
* Does not use MALLOC. It is assumed that the arrays work, Lnz, and
|
|
22
|
+
* etree will be allocated with a number of elements equal to n.
|
|
23
|
+
*
|
|
24
|
+
* The data in (n,Ap,Ai) are from a square matrix A in CSC format, and
|
|
25
|
+
* should include the upper triangular part of A only.
|
|
26
|
+
*
|
|
27
|
+
* This function is only intended for factorisation of QD matrices specified
|
|
28
|
+
* by their upper triangular part. An error is returned if any column has
|
|
29
|
+
* data below the diagonal or is completely empty.
|
|
30
|
+
*
|
|
31
|
+
* For matrices with a non-empty column but a zero on the corresponding diagonal,
|
|
32
|
+
* this function will *not* return an error, as it may still be possible to factor
|
|
33
|
+
* such a matrix in LDL form. No promises are made in this case though...
|
|
34
|
+
*
|
|
35
|
+
* @param n number of columns in CSC matrix A (assumed square)
|
|
36
|
+
* @param Ap column pointers (size n+1) for columns of A
|
|
37
|
+
* @param Ai row indices of A. Has Ap[n] elements
|
|
38
|
+
* @param work work vector (size n) (no meaning on return)
|
|
39
|
+
* @param Lnz count of nonzeros in each column of L (size n) below diagonal
|
|
40
|
+
* @param etree elimination tree (size n)
|
|
41
|
+
* @return total sum of Lnz (i.e. total nonzeros in L below diagonal). Returns
|
|
42
|
+
* -1 if the input does not have triu structure or has an empty
|
|
43
|
+
* column.
|
|
44
|
+
*
|
|
45
|
+
*
|
|
46
|
+
*/
|
|
47
|
+
|
|
48
|
+
QDLDL_int QDLDL_etree(const QDLDL_int n,
|
|
49
|
+
const QDLDL_int* Ap,
|
|
50
|
+
const QDLDL_int* Ai,
|
|
51
|
+
QDLDL_int* work,
|
|
52
|
+
QDLDL_int* Lnz,
|
|
53
|
+
QDLDL_int* etree);
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* Compute an LDL decomposition for a quasidefinite matrix
|
|
57
|
+
* in compressed sparse column form, where the input matrix is
|
|
58
|
+
* assumed to contain data for the upper triangular part of A only,
|
|
59
|
+
* and there are no duplicate indices.
|
|
60
|
+
*
|
|
61
|
+
* Returns factors L, D and Dinv = 1./D.
|
|
62
|
+
*
|
|
63
|
+
* Does not use MALLOC. It is assumed that L will be a compressed
|
|
64
|
+
* sparse column matrix with data (Ln,Lp,Li) with sufficient space
|
|
65
|
+
* allocated, with a number of nonzeros equal to the count given
|
|
66
|
+
* as a return value by QDLDL_etree
|
|
67
|
+
*
|
|
68
|
+
* @param n number of columns in L and A (both square)
|
|
69
|
+
* @param Ap column pointers (size n+1) for columns of A
|
|
70
|
+
* @param Ai row indices of A. Has Ap[n] elements
|
|
71
|
+
* @param Ln number of columns in CSC matrix L
|
|
72
|
+
* @param Lp column pointers (size Ln+1) for columns of L
|
|
73
|
+
* @param Li row indices of L. Has Lp[Ln] elements
|
|
74
|
+
* @param D vectorized factor D. Length is n
|
|
75
|
+
* @param Dinv reciprocal of D. Length is n
|
|
76
|
+
* @param Lnz count of nonzeros in each column of L below diagonal,
|
|
77
|
+
* as given by QDLDL_etree (not modified)
|
|
78
|
+
* @param etree elimination tree as given by QDLDL_etree (not modified)
|
|
79
|
+
* @param bwork working array of bools. Length is n
|
|
80
|
+
* @param iwork working array of integers. Length is 3*n
|
|
81
|
+
* @param fwork working array of floats. Length is n
|
|
82
|
+
* @return Returns a count of the number of positive elements
|
|
83
|
+
* in D. Returns -1 and exits immediately if any element
|
|
84
|
+
* of D evaluates exactly to zero (matrix is not quasidefinite
|
|
85
|
+
* or otherwise LDL factorisable)
|
|
86
|
+
*
|
|
87
|
+
*/
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
QDLDL_int QDLDL_factor(const QDLDL_int n,
|
|
91
|
+
const QDLDL_int* Ap,
|
|
92
|
+
const QDLDL_int* Ai,
|
|
93
|
+
const QDLDL_float* Ax,
|
|
94
|
+
QDLDL_int* Lp,
|
|
95
|
+
QDLDL_int* Li,
|
|
96
|
+
QDLDL_float* Lx,
|
|
97
|
+
QDLDL_float* D,
|
|
98
|
+
QDLDL_float* Dinv,
|
|
99
|
+
const QDLDL_int* Lnz,
|
|
100
|
+
const QDLDL_int* etree,
|
|
101
|
+
QDLDL_bool* bwork,
|
|
102
|
+
QDLDL_int* iwork,
|
|
103
|
+
QDLDL_float* fwork);
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Solves LDL'x = b
|
|
108
|
+
*
|
|
109
|
+
* It is assumed that L will be a compressed
|
|
110
|
+
* sparse column matrix with data (Ln,Lp,Li).
|
|
111
|
+
*
|
|
112
|
+
* @param n number of columns in L (both square)
|
|
113
|
+
* @param Ln number of columns in CSC matrix L
|
|
114
|
+
* @param Lp column pointers (size Ln+1) for columns of L
|
|
115
|
+
* @param Li row indices of L. Has Lp[Ln] elements
|
|
116
|
+
* @param Dinv reciprocal of D. Length is n
|
|
117
|
+
* @param x initialized to b. Equal to x on return
|
|
118
|
+
*
|
|
119
|
+
*
|
|
120
|
+
*/
|
|
121
|
+
void QDLDL_solve(const QDLDL_int n,
|
|
122
|
+
const QDLDL_int* Lp,
|
|
123
|
+
const QDLDL_int* Li,
|
|
124
|
+
const QDLDL_float* Lx,
|
|
125
|
+
const QDLDL_float* Dinv,
|
|
126
|
+
QDLDL_float* x);
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Solves (L+I)x = b
|
|
131
|
+
*
|
|
132
|
+
* It is assumed that L will be a compressed
|
|
133
|
+
* sparse column matrix with data (Ln,Lp,Li).
|
|
134
|
+
*
|
|
135
|
+
* @param n number of columns in L (both square)
|
|
136
|
+
* @param Ln number of columns in CSC matrix L
|
|
137
|
+
* @param Lp column pointers (size Ln+1) for columns of L
|
|
138
|
+
* @param Li row indices of L. Has Lp[Ln] elements
|
|
139
|
+
* @param Dinv reciprocal of D. Length is n
|
|
140
|
+
* @param x initialized to b. Equal to x on return
|
|
141
|
+
*
|
|
142
|
+
*
|
|
143
|
+
*/
|
|
144
|
+
|
|
145
|
+
void QDLDL_Lsolve(const QDLDL_int n,
|
|
146
|
+
const QDLDL_int* Lp,
|
|
147
|
+
const QDLDL_int* Li,
|
|
148
|
+
const QDLDL_float* Lx,
|
|
149
|
+
QDLDL_float* x);
|
|
150
|
+
|
|
151
|
+
/**
|
|
152
|
+
* Solves (L+I)'x = b
|
|
153
|
+
*
|
|
154
|
+
* It is assumed that L will be a compressed
|
|
155
|
+
* sparse column matrix with data (Ln,Lp,Li).
|
|
156
|
+
*
|
|
157
|
+
* @param n number of columns in L (both square)
|
|
158
|
+
* @param Ln number of columns in CSC matrix L
|
|
159
|
+
* @param Lp column pointers (size Ln+1) for columns of L
|
|
160
|
+
* @param Li row indices of L. Has Lp[Ln] elements
|
|
161
|
+
* @param Dinv reciprocal of D. Length is n
|
|
162
|
+
* @param x initialized to b. Equal to x on return
|
|
163
|
+
*
|
|
164
|
+
*
|
|
165
|
+
*/
|
|
166
|
+
|
|
167
|
+
void QDLDL_Ltsolve(const QDLDL_int n,
|
|
168
|
+
const QDLDL_int* Lp,
|
|
169
|
+
const QDLDL_int* Li,
|
|
170
|
+
const QDLDL_float* Lx,
|
|
171
|
+
QDLDL_float* x);
|
|
172
|
+
|
|
173
|
+
# ifdef __cplusplus
|
|
174
|
+
}
|
|
175
|
+
# endif // ifdef __cplusplus
|
|
176
|
+
|
|
177
|
+
#endif // ifndef QDLDL_H
|
|
Binary file
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#ifndef QDLDL_TYPES_H
|
|
2
|
+
# define QDLDL_TYPES_H
|
|
3
|
+
|
|
4
|
+
#include "glbopts.h"
|
|
5
|
+
|
|
6
|
+
# ifdef __cplusplus
|
|
7
|
+
extern "C" {
|
|
8
|
+
# endif /* ifdef __cplusplus */
|
|
9
|
+
|
|
10
|
+
// QDLDL integer and float types
|
|
11
|
+
|
|
12
|
+
#define QDLDL_int scs_int
|
|
13
|
+
#define QDLDL_float scs_float
|
|
14
|
+
#define QDLDL_bool scs_int
|
|
15
|
+
|
|
16
|
+
# ifdef __cplusplus
|
|
17
|
+
}
|
|
18
|
+
# endif /* ifdef __cplusplus */
|
|
19
|
+
|
|
20
|
+
#endif /* ifndef QDLDL_TYPES_H */
|
|
21
|
+
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
#include "gpu.h"
|
|
2
|
+
|
|
3
|
+
void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
|
|
4
|
+
scs_float *y, cusparseHandle_t cusparse_handle) {
|
|
5
|
+
/* y += A'*x
|
|
6
|
+
x and y MUST be on GPU already
|
|
7
|
+
*/
|
|
8
|
+
const scs_float onef = 1.0;
|
|
9
|
+
CUSPARSE(csrmv)
|
|
10
|
+
(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, Ag->n, Ag->m, Ag->Annz,
|
|
11
|
+
&onef, Ag->descr, Ag->x, Ag->p, Ag->i, x, &onef, y);
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
|
|
15
|
+
scs_float *y, cusparseHandle_t cusparse_handle) {
|
|
16
|
+
/* y += A*x
|
|
17
|
+
x and y MUST be on GPU already
|
|
18
|
+
*/
|
|
19
|
+
const scs_float onef = 1.0;
|
|
20
|
+
/* The A matrix idx pointers must be ORDERED */
|
|
21
|
+
CUSPARSE(csrmv)
|
|
22
|
+
(cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, Ag->n, Ag->m, Ag->Annz, &onef,
|
|
23
|
+
Ag->descr, Ag->x, Ag->p, Ag->i, x, &onef, y);
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
void SCS(free_gpu_matrix)(ScsGpuMatrix *A) {
|
|
27
|
+
cudaFree(A->x);
|
|
28
|
+
cudaFree(A->i);
|
|
29
|
+
cudaFree(A->p);
|
|
30
|
+
cusparseDestroyMatDescr(A->descr);
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
void SCS(normalize_a)(ScsMatrix *A, const ScsSettings *stgs, const ScsCone *k,
|
|
34
|
+
ScsScaling *scal) {
|
|
35
|
+
SCS(_normalize_a)(A, stgs, k, scal);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
void SCS(un_normalize_a)(ScsMatrix *A, const ScsSettings *stgs,
|
|
39
|
+
const ScsScaling *scal) {
|
|
40
|
+
SCS(_un_normalize_a)(A, stgs, scal);
|
|
41
|
+
}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
#ifndef SCSGPU_H_GUARD
|
|
2
|
+
#define SCSGPU_H_GUARD
|
|
3
|
+
|
|
4
|
+
#ifdef __cplusplus
|
|
5
|
+
extern "C" {
|
|
6
|
+
#endif
|
|
7
|
+
|
|
8
|
+
#include <cublas_v2.h>
|
|
9
|
+
#include <cuda.h>
|
|
10
|
+
#include <cuda_runtime_api.h>
|
|
11
|
+
#include <cusparse.h>
|
|
12
|
+
|
|
13
|
+
#include "amatrix.h"
|
|
14
|
+
#include "glbopts.h"
|
|
15
|
+
#include "linalg.h"
|
|
16
|
+
#include "linsys.h"
|
|
17
|
+
#include "scs.h"
|
|
18
|
+
#include "util.h"
|
|
19
|
+
|
|
20
|
+
#define CUDA_CHECK_ERR \
|
|
21
|
+
do { \
|
|
22
|
+
cudaError_t err = cudaGetLastError(); \
|
|
23
|
+
if (err != cudaSuccess) { \
|
|
24
|
+
printf("%s:%d:%s\n ERROR_CUDA: %s\n", __FILE__, __LINE__, __func__, \
|
|
25
|
+
cudaGetErrorString(err)); \
|
|
26
|
+
} \
|
|
27
|
+
} while (0)
|
|
28
|
+
|
|
29
|
+
#ifndef EXTRA_VERBOSE
|
|
30
|
+
#ifndef SFLOAT
|
|
31
|
+
#define CUBLAS(x) cublasD##x
|
|
32
|
+
#define CUSPARSE(x) cusparseD##x
|
|
33
|
+
#else
|
|
34
|
+
#define CUBLAS(x) cublasS##x
|
|
35
|
+
#define CUSPARSE(x) cusparseS##x
|
|
36
|
+
#endif
|
|
37
|
+
#else
|
|
38
|
+
#ifndef SFLOAT
|
|
39
|
+
#define CUBLAS(x) \
|
|
40
|
+
CUDA_CHECK_ERR; \
|
|
41
|
+
cublasD##x
|
|
42
|
+
#define CUSPARSE(x) \
|
|
43
|
+
CUDA_CHECK_ERR; \
|
|
44
|
+
cusparseD##x
|
|
45
|
+
#else
|
|
46
|
+
#define CUBLAS(x) \
|
|
47
|
+
CUDA_CHECK_ERR; \
|
|
48
|
+
cublasS##x
|
|
49
|
+
#define CUSPARSE(x) \
|
|
50
|
+
CUDA_CHECK_ERR; \
|
|
51
|
+
cusparseS##x
|
|
52
|
+
#endif
|
|
53
|
+
#endif
|
|
54
|
+
|
|
55
|
+
/*
|
|
56
|
+
CUDA matrix routines only for CSR, not CSC matrices:
|
|
57
|
+
CSC CSR GPU Mult
|
|
58
|
+
A (m x n) A' (n x m) Ag accum_by_atrans_gpu
|
|
59
|
+
A'(n x m) A (m x n) Agt accum_by_a_gpu
|
|
60
|
+
*/
|
|
61
|
+
|
|
62
|
+
/* this struct defines the data matrix A on GPU */
|
|
63
|
+
typedef struct SCS_GPU_A_DATA_MATRIX {
|
|
64
|
+
/* A is supplied in column compressed format */
|
|
65
|
+
scs_float *x; /* A values, size: NNZ A */
|
|
66
|
+
scs_int *i; /* A row index, size: NNZ A */
|
|
67
|
+
scs_int *p; /* A column pointer, size: n+1 */
|
|
68
|
+
scs_int m, n; /* m rows, n cols */
|
|
69
|
+
scs_int Annz; /* num non-zeros in A matrix */
|
|
70
|
+
/* CUDA */
|
|
71
|
+
cusparseMatDescr_t descr;
|
|
72
|
+
} ScsGpuMatrix;
|
|
73
|
+
|
|
74
|
+
void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *A, const scs_float *x,
|
|
75
|
+
scs_float *y, cusparseHandle_t cusparse_handle);
|
|
76
|
+
|
|
77
|
+
void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *A, const scs_float *x,
|
|
78
|
+
scs_float *y, cusparseHandle_t cusparse_handle);
|
|
79
|
+
|
|
80
|
+
void SCS(free_gpu_matrix)(ScsGpuMatrix *A);
|
|
81
|
+
|
|
82
|
+
#ifdef __cplusplus
|
|
83
|
+
}
|
|
84
|
+
#endif
|
|
85
|
+
#endif
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
#include "private.h"
|
|
2
|
+
|
|
3
|
+
#define CG_BEST_TOL 1e-9
|
|
4
|
+
#define CG_MIN_TOL 1e-1
|
|
5
|
+
|
|
6
|
+
/* do not use within pcg, reuses memory */
|
|
7
|
+
void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
|
|
8
|
+
const scs_float *x, scs_float *y) {
|
|
9
|
+
scs_float *v_m = p->tmp_m;
|
|
10
|
+
scs_float *v_n = p->r;
|
|
11
|
+
cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
12
|
+
cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
13
|
+
SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
|
|
14
|
+
cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
/* do not use within pcg, reuses memory */
|
|
18
|
+
void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
|
|
19
|
+
scs_float *y) {
|
|
20
|
+
scs_float *v_m = p->tmp_m;
|
|
21
|
+
scs_float *v_n = p->r;
|
|
22
|
+
cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
23
|
+
cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
24
|
+
#if GPU_TRANSPOSE_MAT > 0
|
|
25
|
+
SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
|
|
26
|
+
#else
|
|
27
|
+
SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
|
|
28
|
+
#endif
|
|
29
|
+
cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
|
|
33
|
+
char *str = (char *)scs_malloc(sizeof(char) * 128);
|
|
34
|
+
sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
|
|
35
|
+
(long)A->p[A->n], stgs->cg_rate);
|
|
36
|
+
return str;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
|
|
40
|
+
char *str = (char *)scs_malloc(sizeof(char) * 128);
|
|
41
|
+
sprintf(str,
|
|
42
|
+
"\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
|
|
43
|
+
(scs_float)p->tot_cg_its / (info->iter + 1),
|
|
44
|
+
p->total_solve_time / (info->iter + 1) / 1e3);
|
|
45
|
+
p->tot_cg_its = 0;
|
|
46
|
+
p->total_solve_time = 0;
|
|
47
|
+
return str;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
|
|
51
|
+
if (p) {
|
|
52
|
+
cudaFree(p->p);
|
|
53
|
+
cudaFree(p->r);
|
|
54
|
+
cudaFree(p->Gp);
|
|
55
|
+
cudaFree(p->bg);
|
|
56
|
+
cudaFree(p->tmp_m);
|
|
57
|
+
cudaFree(p->z);
|
|
58
|
+
cudaFree(p->M);
|
|
59
|
+
if (p->Ag) {
|
|
60
|
+
SCS(free_gpu_matrix)(p->Ag);
|
|
61
|
+
scs_free(p->Ag);
|
|
62
|
+
}
|
|
63
|
+
if (p->Agt) {
|
|
64
|
+
SCS(free_gpu_matrix)(p->Agt);
|
|
65
|
+
scs_free(p->Agt);
|
|
66
|
+
}
|
|
67
|
+
cusparseDestroy(p->cusparse_handle);
|
|
68
|
+
cublasDestroy(p->cublas_handle);
|
|
69
|
+
/* Don't reset because it interferes with other GPU programs. */
|
|
70
|
+
/* cudaDeviceReset(); */
|
|
71
|
+
scs_free(p);
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/*y = (RHO_X * I + A'A)x */
|
|
76
|
+
static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
|
|
77
|
+
ScsLinSysWork *p, const scs_float *x, scs_float *y) {
|
|
78
|
+
/* x and y MUST already be loaded to GPU */
|
|
79
|
+
scs_float *tmp_m = p->tmp_m; /* temp memory */
|
|
80
|
+
cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
|
|
81
|
+
SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
|
|
82
|
+
cudaMemset(y, 0, A->n * sizeof(scs_float));
|
|
83
|
+
SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
|
|
84
|
+
CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/* M = inv ( diag ( RHO_X * I + A'A ) ) */
|
|
88
|
+
static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
|
|
89
|
+
ScsLinSysWork *p) {
|
|
90
|
+
scs_int i;
|
|
91
|
+
scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
|
|
92
|
+
|
|
93
|
+
#if EXTRA_VERBOSE > 0
|
|
94
|
+
scs_printf("getting pre-conditioner\n");
|
|
95
|
+
#endif
|
|
96
|
+
|
|
97
|
+
for (i = 0; i < A->n; ++i) {
|
|
98
|
+
M[i] = 1 / (stgs->rho_x +
|
|
99
|
+
SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
|
|
100
|
+
/* M[i] = 1; */
|
|
101
|
+
}
|
|
102
|
+
cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
103
|
+
scs_free(M);
|
|
104
|
+
|
|
105
|
+
#if EXTRA_VERBOSE > 0
|
|
106
|
+
scs_printf("finished getting pre-conditioner\n");
|
|
107
|
+
#endif
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
|
|
111
|
+
const ScsSettings *stgs) {
|
|
112
|
+
cudaError_t err;
|
|
113
|
+
ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
|
|
114
|
+
ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
|
115
|
+
|
|
116
|
+
p->cublas_handle = 0;
|
|
117
|
+
p->cusparse_handle = 0;
|
|
118
|
+
|
|
119
|
+
p->total_solve_time = 0;
|
|
120
|
+
p->tot_cg_its = 0;
|
|
121
|
+
|
|
122
|
+
/* Get handle to the CUBLAS context */
|
|
123
|
+
cublasCreate(&p->cublas_handle);
|
|
124
|
+
|
|
125
|
+
/* Get handle to the CUSPARSE context */
|
|
126
|
+
cusparseCreate(&p->cusparse_handle);
|
|
127
|
+
|
|
128
|
+
Ag->n = A->n;
|
|
129
|
+
Ag->m = A->m;
|
|
130
|
+
Ag->Annz = A->p[A->n];
|
|
131
|
+
Ag->descr = 0;
|
|
132
|
+
/* Matrix description */
|
|
133
|
+
cusparseCreateMatDescr(&Ag->descr);
|
|
134
|
+
cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
|
135
|
+
cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
|
|
136
|
+
p->Ag = Ag;
|
|
137
|
+
p->Agt = SCS_NULL;
|
|
138
|
+
|
|
139
|
+
cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
|
|
140
|
+
cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
|
|
141
|
+
cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
|
|
142
|
+
|
|
143
|
+
cudaMalloc((void **)&p->p, A->n * sizeof(scs_float));
|
|
144
|
+
cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
|
|
145
|
+
cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
|
|
146
|
+
cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
|
|
147
|
+
cudaMalloc((void **)&p->tmp_m,
|
|
148
|
+
A->m * sizeof(scs_float)); /* intermediate result */
|
|
149
|
+
cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
|
|
150
|
+
cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
|
|
151
|
+
|
|
152
|
+
cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
|
|
153
|
+
cudaMemcpyHostToDevice);
|
|
154
|
+
cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
|
|
155
|
+
cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
|
|
156
|
+
cudaMemcpyHostToDevice);
|
|
157
|
+
|
|
158
|
+
get_preconditioner(A, stgs, p);
|
|
159
|
+
|
|
160
|
+
#if GPU_TRANSPOSE_MAT > 0
|
|
161
|
+
p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
|
|
162
|
+
p->Agt->n = A->m;
|
|
163
|
+
p->Agt->m = A->n;
|
|
164
|
+
p->Agt->Annz = A->p[A->n];
|
|
165
|
+
p->Agt->descr = 0;
|
|
166
|
+
/* Matrix description */
|
|
167
|
+
cusparseCreateMatDescr(&p->Agt->descr);
|
|
168
|
+
cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
|
|
169
|
+
cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
|
|
170
|
+
|
|
171
|
+
cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
|
|
172
|
+
cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
|
|
173
|
+
cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
|
|
174
|
+
/* transpose Ag into Agt for faster multiplies */
|
|
175
|
+
/* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
|
|
176
|
+
CUSPARSE(csr2csc)
|
|
177
|
+
(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
|
|
178
|
+
p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
|
|
179
|
+
#endif
|
|
180
|
+
|
|
181
|
+
err = cudaGetLastError();
|
|
182
|
+
if (err != cudaSuccess) {
|
|
183
|
+
printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
|
|
184
|
+
cudaGetErrorString(err));
|
|
185
|
+
SCS(free_lin_sys_work)(p);
|
|
186
|
+
return SCS_NULL;
|
|
187
|
+
}
|
|
188
|
+
return p;
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
|
|
192
|
+
scs_float *z, scs_float *r, scs_int n) {
|
|
193
|
+
cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
194
|
+
CUBLAS(tbmv)
|
|
195
|
+
(cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
|
|
196
|
+
0, M, 1, z, 1);
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
/* solves (RHO_X * I + A'A)x = b, s warm start, solution stored in bg (on GPU) */
|
|
200
|
+
static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
|
|
201
|
+
ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
|
|
202
|
+
scs_int max_its, scs_float tol) {
|
|
203
|
+
scs_int i, n = A->n;
|
|
204
|
+
scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
|
|
205
|
+
scs_float onef = 1.0, neg_onef = -1.0;
|
|
206
|
+
scs_float *p = pr->p; /* cg direction */
|
|
207
|
+
scs_float *Gp = pr->Gp; /* updated CG direction */
|
|
208
|
+
scs_float *r = pr->r; /* cg residual */
|
|
209
|
+
scs_float *z = pr->z; /* preconditioned */
|
|
210
|
+
scs_float *M = pr->M; /* preconditioner */
|
|
211
|
+
cublasHandle_t cublas_handle = pr->cublas_handle;
|
|
212
|
+
|
|
213
|
+
if (s == SCS_NULL) {
|
|
214
|
+
cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
215
|
+
cudaMemset(bg, 0, n * sizeof(scs_float));
|
|
216
|
+
} else {
|
|
217
|
+
/* p contains bg temporarily */
|
|
218
|
+
cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
219
|
+
/* bg contains s */
|
|
220
|
+
cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
221
|
+
mat_vec(A, stgs, pr, bg, r);
|
|
222
|
+
CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
|
|
223
|
+
CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
/* for some reason nrm2 is VERY slow */
|
|
227
|
+
/* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
|
|
228
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
|
|
229
|
+
nrm_r = SQRTF(nrm_r);
|
|
230
|
+
/* check to see if we need to run CG at all */
|
|
231
|
+
if (nrm_r < MIN(tol, 1e-18)) {
|
|
232
|
+
return 0;
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
apply_pre_conditioner(cublas_handle, M, z, r, n);
|
|
236
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
|
|
237
|
+
/* put z in p, replacing temp mem */
|
|
238
|
+
cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
|
|
239
|
+
|
|
240
|
+
for (i = 0; i < max_its; ++i) {
|
|
241
|
+
mat_vec(A, stgs, pr, p, Gp);
|
|
242
|
+
|
|
243
|
+
CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
|
|
244
|
+
|
|
245
|
+
alpha = ipzr / p_gp;
|
|
246
|
+
neg_alpha = -alpha;
|
|
247
|
+
|
|
248
|
+
CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
|
|
249
|
+
CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
|
|
250
|
+
|
|
251
|
+
/* for some reason nrm2 is VERY slow */
|
|
252
|
+
/* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
|
|
253
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
|
|
254
|
+
nrm_r = SQRTF(nrm_r);
|
|
255
|
+
if (nrm_r < tol) {
|
|
256
|
+
i++;
|
|
257
|
+
break;
|
|
258
|
+
}
|
|
259
|
+
ipzr_old = ipzr;
|
|
260
|
+
apply_pre_conditioner(cublas_handle, M, z, r, n);
|
|
261
|
+
CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
|
|
262
|
+
|
|
263
|
+
beta = ipzr / ipzr_old;
|
|
264
|
+
CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
|
|
265
|
+
CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
|
|
266
|
+
}
|
|
267
|
+
#if EXTRA_VERBOSE > 0
|
|
268
|
+
scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
|
|
269
|
+
#endif
|
|
270
|
+
return i;
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
|
|
274
|
+
ScsLinSysWork *p, scs_float *b, const scs_float *s,
|
|
275
|
+
scs_int iter) {
|
|
276
|
+
scs_int cg_its;
|
|
277
|
+
SCS(timer) linsys_timer;
|
|
278
|
+
scs_float *bg = p->bg;
|
|
279
|
+
scs_float neg_onef = -1.0;
|
|
280
|
+
ScsGpuMatrix *Ag = p->Ag;
|
|
281
|
+
scs_float cg_tol =
|
|
282
|
+
SCS(norm)(b, Ag->n) *
|
|
283
|
+
(iter < 0 ? CG_BEST_TOL
|
|
284
|
+
: CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
|
|
285
|
+
SCS(tic)(&linsys_timer);
|
|
286
|
+
/* all on GPU */
|
|
287
|
+
cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
|
|
288
|
+
SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
|
|
289
|
+
/* solves (I+A'A)x = b, s warm start, solution stored in b */
|
|
290
|
+
cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
|
|
291
|
+
CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
|
|
292
|
+
SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
|
|
293
|
+
cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
|
|
294
|
+
|
|
295
|
+
if (iter >= 0) {
|
|
296
|
+
p->tot_cg_its += cg_its;
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
p->total_solve_time += SCS(tocq)(&linsys_timer);
|
|
300
|
+
#if EXTRAVERBOSE > 0
|
|
301
|
+
scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
|
|
302
|
+
#endif
|
|
303
|
+
return 0;
|
|
304
|
+
}
|