scs 0.2.2

Files changed (106)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +12 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +98 -0
  5. data/ext/scs/extconf.rb +29 -0
  6. data/lib/scs.rb +17 -0
  7. data/lib/scs/ffi.rb +117 -0
  8. data/lib/scs/solver.rb +173 -0
  9. data/lib/scs/version.rb +3 -0
  10. data/vendor/scs/LICENSE.txt +21 -0
  11. data/vendor/scs/Makefile +164 -0
  12. data/vendor/scs/README.md +222 -0
  13. data/vendor/scs/include/aa.h +56 -0
  14. data/vendor/scs/include/cones.h +46 -0
  15. data/vendor/scs/include/ctrlc.h +33 -0
  16. data/vendor/scs/include/glbopts.h +177 -0
  17. data/vendor/scs/include/linalg.h +26 -0
  18. data/vendor/scs/include/linsys.h +64 -0
  19. data/vendor/scs/include/normalize.h +18 -0
  20. data/vendor/scs/include/rw.h +17 -0
  21. data/vendor/scs/include/scs.h +161 -0
  22. data/vendor/scs/include/scs_blas.h +51 -0
  23. data/vendor/scs/include/util.h +65 -0
  24. data/vendor/scs/linsys/amatrix.c +305 -0
  25. data/vendor/scs/linsys/amatrix.h +36 -0
  26. data/vendor/scs/linsys/amatrix.o +0 -0
  27. data/vendor/scs/linsys/cpu/direct/private.c +366 -0
  28. data/vendor/scs/linsys/cpu/direct/private.h +26 -0
  29. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  30. data/vendor/scs/linsys/cpu/indirect/private.c +256 -0
  31. data/vendor/scs/linsys/cpu/indirect/private.h +31 -0
  32. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  33. data/vendor/scs/linsys/external/amd/LICENSE.txt +934 -0
  34. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +469 -0
  35. data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +254 -0
  36. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  37. data/vendor/scs/linsys/external/amd/amd.h +400 -0
  38. data/vendor/scs/linsys/external/amd/amd_1.c +180 -0
  39. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  40. data/vendor/scs/linsys/external/amd/amd_2.c +1842 -0
  41. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  42. data/vendor/scs/linsys/external/amd/amd_aat.c +184 -0
  43. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  44. data/vendor/scs/linsys/external/amd/amd_control.c +64 -0
  45. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  46. data/vendor/scs/linsys/external/amd/amd_defaults.c +37 -0
  47. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  48. data/vendor/scs/linsys/external/amd/amd_dump.c +179 -0
  49. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  50. data/vendor/scs/linsys/external/amd/amd_global.c +16 -0
  51. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  52. data/vendor/scs/linsys/external/amd/amd_info.c +119 -0
  53. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  54. data/vendor/scs/linsys/external/amd/amd_internal.h +304 -0
  55. data/vendor/scs/linsys/external/amd/amd_order.c +199 -0
  56. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  57. data/vendor/scs/linsys/external/amd/amd_post_tree.c +120 -0
  58. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  59. data/vendor/scs/linsys/external/amd/amd_postorder.c +206 -0
  60. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  61. data/vendor/scs/linsys/external/amd/amd_preprocess.c +118 -0
  62. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  63. data/vendor/scs/linsys/external/amd/amd_valid.c +92 -0
  64. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  65. data/vendor/scs/linsys/external/amd/changes +11 -0
  66. data/vendor/scs/linsys/external/qdldl/LICENSE +201 -0
  67. data/vendor/scs/linsys/external/qdldl/README.md +120 -0
  68. data/vendor/scs/linsys/external/qdldl/changes +4 -0
  69. data/vendor/scs/linsys/external/qdldl/qdldl.c +298 -0
  70. data/vendor/scs/linsys/external/qdldl/qdldl.h +177 -0
  71. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  72. data/vendor/scs/linsys/external/qdldl/qdldl_types.h +21 -0
  73. data/vendor/scs/linsys/gpu/gpu.c +41 -0
  74. data/vendor/scs/linsys/gpu/gpu.h +85 -0
  75. data/vendor/scs/linsys/gpu/indirect/private.c +304 -0
  76. data/vendor/scs/linsys/gpu/indirect/private.h +36 -0
  77. data/vendor/scs/scs.mk +181 -0
  78. data/vendor/scs/src/aa.c +224 -0
  79. data/vendor/scs/src/aa.o +0 -0
  80. data/vendor/scs/src/cones.c +802 -0
  81. data/vendor/scs/src/cones.o +0 -0
  82. data/vendor/scs/src/ctrlc.c +77 -0
  83. data/vendor/scs/src/ctrlc.o +0 -0
  84. data/vendor/scs/src/linalg.c +84 -0
  85. data/vendor/scs/src/linalg.o +0 -0
  86. data/vendor/scs/src/normalize.c +93 -0
  87. data/vendor/scs/src/normalize.o +0 -0
  88. data/vendor/scs/src/rw.c +167 -0
  89. data/vendor/scs/src/rw.o +0 -0
  90. data/vendor/scs/src/scs.c +978 -0
  91. data/vendor/scs/src/scs.o +0 -0
  92. data/vendor/scs/src/scs_version.c +5 -0
  93. data/vendor/scs/src/scs_version.o +0 -0
  94. data/vendor/scs/src/util.c +196 -0
  95. data/vendor/scs/src/util.o +0 -0
  96. data/vendor/scs/test/data/small_random_socp +0 -0
  97. data/vendor/scs/test/minunit.h +13 -0
  98. data/vendor/scs/test/problem_utils.h +93 -0
  99. data/vendor/scs/test/problems/rob_gauss_cov_est.h +85 -0
  100. data/vendor/scs/test/problems/small_lp.h +50 -0
  101. data/vendor/scs/test/problems/small_random_socp.h +33 -0
  102. data/vendor/scs/test/random_socp_prob.c +171 -0
  103. data/vendor/scs/test/run_from_file.c +69 -0
  104. data/vendor/scs/test/run_tests +2 -0
  105. data/vendor/scs/test/run_tests.c +32 -0
  106. metadata +203 -0
data/vendor/scs/linsys/external/qdldl/qdldl.h
@@ -0,0 +1,177 @@
+ #ifndef QDLDL_H
+ #define QDLDL_H
+
+ // Include qdldl type options
+ #include "qdldl_types.h"
+
+ # ifdef __cplusplus
+ extern "C" {
+ # endif // ifdef __cplusplus
+
+ /**
+  * Computes the elimination tree for a quasidefinite matrix
+  * in compressed sparse column form, where the input matrix is
+  * assumed to contain data for the upper triangular part of A only,
+  * and there are no duplicate indices.
+  *
+  * Returns an elimination tree for the factorization A = LDL^T and a
+  * count of the nonzeros in each column of L that are strictly below the
+  * diagonal.
+  *
+  * Does not use MALLOC. It is assumed that the arrays work, Lnz, and
+  * etree will be allocated with a number of elements equal to n.
+  *
+  * The data in (n,Ap,Ai) are from a square matrix A in CSC format, and
+  * should include the upper triangular part of A only.
+  *
+  * This function is only intended for factorisation of QD matrices specified
+  * by their upper triangular part. An error is returned if any column has
+  * data below the diagonal or is completely empty.
+  *
+  * For matrices with a non-empty column but a zero on the corresponding
+  * diagonal, this function will *not* return an error, as it may still be
+  * possible to factor such a matrix in LDL form. No promises are made in
+  * this case though...
+  *
+  * @param n      number of columns in CSC matrix A (assumed square)
+  * @param Ap     column pointers (size n+1) for columns of A
+  * @param Ai     row indices of A. Has Ap[n] elements
+  * @param work   work vector (size n) (no meaning on return)
+  * @param Lnz    count of nonzeros in each column of L (size n) below diagonal
+  * @param etree  elimination tree (size n)
+  * @return       total sum of Lnz (i.e. total nonzeros in L below diagonal).
+  *               Returns -1 if the input does not have triu structure or has
+  *               an empty column.
+  */
+ QDLDL_int QDLDL_etree(const QDLDL_int n,
+                       const QDLDL_int* Ap,
+                       const QDLDL_int* Ai,
+                       QDLDL_int* work,
+                       QDLDL_int* Lnz,
+                       QDLDL_int* etree);
+
+ /**
+  * Computes an LDL decomposition for a quasidefinite matrix
+  * in compressed sparse column form, where the input matrix is
+  * assumed to contain data for the upper triangular part of A only,
+  * and there are no duplicate indices.
+  *
+  * Returns factors L, D and Dinv = 1./D.
+  *
+  * Does not use MALLOC. It is assumed that L will be a compressed
+  * sparse column matrix with data (Lp,Li,Lx) with sufficient space
+  * allocated, with a number of nonzeros equal to the count given
+  * as a return value by QDLDL_etree.
+  *
+  * @param n      number of columns in L and A (both square)
+  * @param Ap     column pointers (size n+1) for columns of A
+  * @param Ai     row indices of A. Has Ap[n] elements
+  * @param Ax     data of A. Has Ap[n] elements
+  * @param Lp     column pointers (size n+1) for columns of L
+  * @param Li     row indices of L. Has Lp[n] elements
+  * @param Lx     data of L. Has Lp[n] elements
+  * @param D      vectorized factor D. Length is n
+  * @param Dinv   reciprocal of D. Length is n
+  * @param Lnz    count of nonzeros in each column of L below diagonal,
+  *               as given by QDLDL_etree (not modified)
+  * @param etree  elimination tree as given by QDLDL_etree (not modified)
+  * @param bwork  working array of bools. Length is n
+  * @param iwork  working array of integers. Length is 3*n
+  * @param fwork  working array of floats. Length is n
+  * @return       Returns a count of the number of positive elements
+  *               in D. Returns -1 and exits immediately if any element
+  *               of D evaluates exactly to zero (matrix is not quasidefinite
+  *               or otherwise LDL factorisable)
+  */
+ QDLDL_int QDLDL_factor(const QDLDL_int n,
+                        const QDLDL_int* Ap,
+                        const QDLDL_int* Ai,
+                        const QDLDL_float* Ax,
+                        QDLDL_int* Lp,
+                        QDLDL_int* Li,
+                        QDLDL_float* Lx,
+                        QDLDL_float* D,
+                        QDLDL_float* Dinv,
+                        const QDLDL_int* Lnz,
+                        const QDLDL_int* etree,
+                        QDLDL_bool* bwork,
+                        QDLDL_int* iwork,
+                        QDLDL_float* fwork);
+
+ /**
+  * Solves LDL'x = b
+  *
+  * It is assumed that L will be a compressed
+  * sparse column matrix with data (Lp,Li,Lx).
+  *
+  * @param n     number of columns in L (assumed square)
+  * @param Lp    column pointers (size n+1) for columns of L
+  * @param Li    row indices of L. Has Lp[n] elements
+  * @param Lx    data of L. Has Lp[n] elements
+  * @param Dinv  reciprocal of D. Length is n
+  * @param x     initialized to b. Equal to x on return
+  */
+ void QDLDL_solve(const QDLDL_int n,
+                  const QDLDL_int* Lp,
+                  const QDLDL_int* Li,
+                  const QDLDL_float* Lx,
+                  const QDLDL_float* Dinv,
+                  QDLDL_float* x);
+
+ /**
+  * Solves (L+I)x = b
+  *
+  * It is assumed that L will be a compressed
+  * sparse column matrix with data (Lp,Li,Lx).
+  *
+  * @param n   number of columns in L (assumed square)
+  * @param Lp  column pointers (size n+1) for columns of L
+  * @param Li  row indices of L. Has Lp[n] elements
+  * @param Lx  data of L. Has Lp[n] elements
+  * @param x   initialized to b. Equal to x on return
+  */
+ void QDLDL_Lsolve(const QDLDL_int n,
+                   const QDLDL_int* Lp,
+                   const QDLDL_int* Li,
+                   const QDLDL_float* Lx,
+                   QDLDL_float* x);
+
+ /**
+  * Solves (L+I)'x = b
+  *
+  * It is assumed that L will be a compressed
+  * sparse column matrix with data (Lp,Li,Lx).
+  *
+  * @param n   number of columns in L (assumed square)
+  * @param Lp  column pointers (size n+1) for columns of L
+  * @param Li  row indices of L. Has Lp[n] elements
+  * @param Lx  data of L. Has Lp[n] elements
+  * @param x   initialized to b. Equal to x on return
+  */
+ void QDLDL_Ltsolve(const QDLDL_int n,
+                    const QDLDL_int* Lp,
+                    const QDLDL_int* Li,
+                    const QDLDL_float* Lx,
+                    QDLDL_float* x);
+
+ # ifdef __cplusplus
+ }
+ # endif // ifdef __cplusplus
+
+ #endif // ifndef QDLDL_H
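
Taken together, the three declarations above form a complete factor-solve pipeline: QDLDL_etree sizes L, QDLDL_factor fills it, QDLDL_solve applies it. The following is a minimal CPU usage sketch (written for this review, not shipped in the gem): it factors the 2x2 quasidefinite matrix A = [4 1; 1 -3], passed as its upper triangle in CSC form, then solves A x = b for b = (1, 1). Workspace sizes follow the parameter documentation above; the matrix, main(), and the fixed array sizes are illustrative.

/* Minimal QDLDL usage sketch (illustrative; error handling elided). */
#include <stdio.h>
#include "qdldl.h"

int main(void) {
  const QDLDL_int n = 2;
  QDLDL_int   Ap[] = {0, 1, 3};        /* column pointers (upper triangle) */
  QDLDL_int   Ai[] = {0, 0, 1};        /* row indices */
  QDLDL_float Ax[] = {4.0, 1.0, -3.0}; /* values */

  QDLDL_int work[2], Lnz[2], etree[2];
  QDLDL_int sum_Lnz = QDLDL_etree(n, Ap, Ai, work, Lnz, etree);
  if (sum_Lnz < 0) return 1; /* not triu, or an empty column */

  /* L sized for this particular matrix: sum_Lnz == 1 below-diagonal entry. */
  QDLDL_int   Lp[3], Li[1];
  QDLDL_float Lx[1], D[2], Dinv[2];
  QDLDL_bool  bwork[2];
  QDLDL_int   iwork[6];  /* 3*n */
  QDLDL_float fwork[2];
  QDLDL_factor(n, Ap, Ai, Ax, Lp, Li, Lx, D, Dinv, Lnz, etree,
               bwork, iwork, fwork);

  QDLDL_float x[] = {1.0, 1.0}; /* b on entry, x on return */
  QDLDL_solve(n, Lp, Li, Lx, Dinv, x);
  printf("x = [%g, %g]\n", (double)x[0], (double)x[1]);
  return 0;
}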
data/vendor/scs/linsys/external/qdldl/qdldl_types.h
@@ -0,0 +1,21 @@
+ #ifndef QDLDL_TYPES_H
+ # define QDLDL_TYPES_H
+
+ #include "glbopts.h"
+
+ # ifdef __cplusplus
+ extern "C" {
+ # endif /* ifdef __cplusplus */
+
+ // QDLDL integer and float types
+
+ #define QDLDL_int scs_int
+ #define QDLDL_float scs_float
+ #define QDLDL_bool scs_int
+
+ # ifdef __cplusplus
+ }
+ # endif /* ifdef __cplusplus */
+
+ #endif /* ifndef QDLDL_TYPES_H */
+
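
This header maps QDLDL's types onto SCS's own scs_int/scs_float from glbopts.h, so the vendored QDLDL automatically follows whatever index width and floating-point precision the gem is built with. As a rough sketch of the effect (the typedefs below are an assumption about glbopts.h, shown only to convey the idea; the authoritative definitions live in data/vendor/scs/include/glbopts.h):

/* Illustrative only: approximate effect of the SCS build flags on what
 * QDLDL_int / QDLDL_float resolve to. Not copied from glbopts.h. */
#ifdef DLONG
typedef long scs_int;     /* wide indices for very large problems */
#else
typedef int scs_int;      /* default 32-bit indices */
#endif

#ifndef SFLOAT
typedef double scs_float; /* default double precision */
#else
typedef float scs_float;  /* single precision, e.g. for GPU builds */
#endif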
data/vendor/scs/linsys/gpu/gpu.c
@@ -0,0 +1,41 @@
+ #include "gpu.h"
+
+ void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
+                                scs_float *y, cusparseHandle_t cusparse_handle) {
+   /* y += A'*x
+      x and y MUST be on GPU already
+   */
+   const scs_float onef = 1.0;
+   CUSPARSE(csrmv)
+   (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, Ag->n, Ag->m, Ag->Annz,
+    &onef, Ag->descr, Ag->x, Ag->p, Ag->i, x, &onef, y);
+ }
+
+ void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
+                           scs_float *y, cusparseHandle_t cusparse_handle) {
+   /* y += A*x
+      x and y MUST be on GPU already
+   */
+   const scs_float onef = 1.0;
+   /* The A matrix idx pointers must be ORDERED */
+   CUSPARSE(csrmv)
+   (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, Ag->n, Ag->m, Ag->Annz, &onef,
+    Ag->descr, Ag->x, Ag->p, Ag->i, x, &onef, y);
+ }
+
+ void SCS(free_gpu_matrix)(ScsGpuMatrix *A) {
+   cudaFree(A->x);
+   cudaFree(A->i);
+   cudaFree(A->p);
+   cusparseDestroyMatDescr(A->descr);
+ }
+
+ void SCS(normalize_a)(ScsMatrix *A, const ScsSettings *stgs, const ScsCone *k,
+                       ScsScaling *scal) {
+   SCS(_normalize_a)(A, stgs, k, scal);
+ }
+
+ void SCS(un_normalize_a)(ScsMatrix *A, const ScsSettings *stgs,
+                          const ScsScaling *scal) {
+   SCS(_un_normalize_a)(A, stgs, scal);
+ }
data/vendor/scs/linsys/gpu/gpu.h
@@ -0,0 +1,85 @@
+ #ifndef SCSGPU_H_GUARD
+ #define SCSGPU_H_GUARD
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #include <cublas_v2.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include <cusparse.h>
+
+ #include "amatrix.h"
+ #include "glbopts.h"
+ #include "linalg.h"
+ #include "linsys.h"
+ #include "scs.h"
+ #include "util.h"
+
+ #define CUDA_CHECK_ERR                                                    \
+   do {                                                                    \
+     cudaError_t err = cudaGetLastError();                                 \
+     if (err != cudaSuccess) {                                             \
+       printf("%s:%d:%s\n ERROR_CUDA: %s\n", __FILE__, __LINE__, __func__, \
+              cudaGetErrorString(err));                                    \
+     }                                                                     \
+   } while (0)
+
+ #ifndef EXTRA_VERBOSE
+ #ifndef SFLOAT
+ #define CUBLAS(x) cublasD##x
+ #define CUSPARSE(x) cusparseD##x
+ #else
+ #define CUBLAS(x) cublasS##x
+ #define CUSPARSE(x) cusparseS##x
+ #endif
+ #else
+ #ifndef SFLOAT
+ #define CUBLAS(x) \
+   CUDA_CHECK_ERR; \
+   cublasD##x
+ #define CUSPARSE(x) \
+   CUDA_CHECK_ERR;   \
+   cusparseD##x
+ #else
+ #define CUBLAS(x) \
+   CUDA_CHECK_ERR; \
+   cublasS##x
+ #define CUSPARSE(x) \
+   CUDA_CHECK_ERR;   \
+   cusparseS##x
+ #endif
+ #endif
+
+ /*
+  CUDA matrix routines only for CSR, not CSC matrices:
+     CSC           CSR             GPU     Mult
+     A  (m x n)    A' (n x m)      Ag      accum_by_atrans_gpu
+     A' (n x m)    A  (m x n)      Agt     accum_by_a_gpu
+ */
+
+ /* this struct defines the data matrix A on GPU */
+ typedef struct SCS_GPU_A_DATA_MATRIX {
+   /* A is supplied in column compressed format */
+   scs_float *x; /* A values, size: NNZ A */
+   scs_int *i;   /* A row index, size: NNZ A */
+   scs_int *p;   /* A column pointer, size: n+1 */
+   scs_int m, n; /* m rows, n cols */
+   scs_int Annz; /* num non-zeros in A matrix */
+   /* CUDA */
+   cusparseMatDescr_t descr;
+ } ScsGpuMatrix;
+
+ void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *A, const scs_float *x,
+                                scs_float *y, cusparseHandle_t cusparse_handle);
+
+ void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *A, const scs_float *x,
+                           scs_float *y, cusparseHandle_t cusparse_handle);
+
+ void SCS(free_gpu_matrix)(ScsGpuMatrix *A);
+
+ #ifdef __cplusplus
+ }
+ #endif
+ #endif
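
The table in the header above records the identity the GPU backend relies on: the three arrays of a CSC matrix A are, byte for byte, the CSR representation of A'. Handing the CSC arrays of A to a CSR mat-vec with no transpose therefore accumulates y += A'*x, which is exactly what _accum_by_atrans_gpu does with CUSPARSE_OPERATION_NON_TRANSPOSE. A small CPU sketch of the same identity (illustrative, not part of the gem):

/* Demonstrates that CSC(A) == CSR(A'): reinterpreting the CSC arrays of A
 * as a CSR matrix and multiplying computes A'*u. */
#include <stdio.h>

/* y += M*u for an m-by-n matrix M in CSR form (row ptrs rp, col idx ci,
 * values v). */
static void csr_mv(int m, const int *rp, const int *ci, const double *v,
                   const double *u, double *y) {
  for (int r = 0; r < m; ++r)
    for (int k = rp[r]; k < rp[r + 1]; ++k)
      y[r] += v[k] * u[ci[k]];
}

int main(void) {
  /* A = [1 2; 0 3] (2x2) in CSC: p = col ptrs, i = row idx, x = values. */
  int p[] = {0, 1, 3}, i[] = {0, 0, 1};
  double x[] = {1.0, 2.0, 3.0};

  /* Treat (p,i,x) as CSR of A' (2 rows) and multiply by u. */
  double u[] = {5.0, 7.0}, y[] = {0.0, 0.0};
  csr_mv(2, p, i, x, u, y);
  printf("A'u = [%g, %g]\n", y[0], y[1]); /* expect [5, 31] */
  return 0;
}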
data/vendor/scs/linsys/gpu/indirect/private.c
@@ -0,0 +1,304 @@
+ #include "private.h"
+
+ #define CG_BEST_TOL 1e-9
+ #define CG_MIN_TOL 1e-1
+
+ /* do not use within pcg, reuses memory */
+ void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
+                           const scs_float *x, scs_float *y) {
+   scs_float *v_m = p->tmp_m;
+   scs_float *v_n = p->r;
+   cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
+   cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
+   cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ }
+
+ /* do not use within pcg, reuses memory */
+ void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
+                      scs_float *y) {
+   scs_float *v_m = p->tmp_m;
+   scs_float *v_n = p->r;
+   cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
+ #if GPU_TRANSPOSE_MAT > 0
+   SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
+ #else
+   SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
+ #endif
+   cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ }
+
+ char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
+   char *str = (char *)scs_malloc(sizeof(char) * 128);
+   sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
+           (long)A->p[A->n], stgs->cg_rate);
+   return str;
+ }
+
+ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
+   char *str = (char *)scs_malloc(sizeof(char) * 128);
+   sprintf(str,
+           "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
+           (scs_float)p->tot_cg_its / (info->iter + 1),
+           p->total_solve_time / (info->iter + 1) / 1e3);
+   p->tot_cg_its = 0;
+   p->total_solve_time = 0;
+   return str;
+ }
+
+ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
+   if (p) {
+     cudaFree(p->p);
+     cudaFree(p->r);
+     cudaFree(p->Gp);
+     cudaFree(p->bg);
+     cudaFree(p->tmp_m);
+     cudaFree(p->z);
+     cudaFree(p->M);
+     if (p->Ag) {
+       SCS(free_gpu_matrix)(p->Ag);
+       scs_free(p->Ag);
+     }
+     if (p->Agt) {
+       SCS(free_gpu_matrix)(p->Agt);
+       scs_free(p->Agt);
+     }
+     cusparseDestroy(p->cusparse_handle);
+     cublasDestroy(p->cublas_handle);
+     /* Don't reset because it interferes with other GPU programs. */
+     /* cudaDeviceReset(); */
+     scs_free(p);
+   }
+ }
+
+ /* y = (RHO_X * I + A'A)x */
+ static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
+                     ScsLinSysWork *p, const scs_float *x, scs_float *y) {
+   /* x and y MUST already be loaded to GPU */
+   scs_float *tmp_m = p->tmp_m; /* temp memory */
+   cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
+   SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
+   cudaMemset(y, 0, A->n * sizeof(scs_float));
+   SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
+   CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
+ }
+
+ /* M = inv ( diag ( RHO_X * I + A'A ) ) */
+ static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
+                                ScsLinSysWork *p) {
+   scs_int i;
+   scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
+
+ #if EXTRA_VERBOSE > 0
+   scs_printf("getting pre-conditioner\n");
+ #endif
+
+   for (i = 0; i < A->n; ++i) {
+     M[i] = 1 / (stgs->rho_x +
+                 SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
+     /* M[i] = 1; */
+   }
+   cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   scs_free(M);
+
+ #if EXTRA_VERBOSE > 0
+   scs_printf("finished getting pre-conditioner\n");
+ #endif
+ }
+
+ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
+                                       const ScsSettings *stgs) {
+   cudaError_t err;
+   ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
+   ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+
+   p->cublas_handle = 0;
+   p->cusparse_handle = 0;
+
+   p->total_solve_time = 0;
+   p->tot_cg_its = 0;
+
+   /* Get handle to the CUBLAS context */
+   cublasCreate(&p->cublas_handle);
+
+   /* Get handle to the CUSPARSE context */
+   cusparseCreate(&p->cusparse_handle);
+
+   Ag->n = A->n;
+   Ag->m = A->m;
+   Ag->Annz = A->p[A->n];
+   Ag->descr = 0;
+   /* Matrix description */
+   cusparseCreateMatDescr(&Ag->descr);
+   cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+   cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
+   p->Ag = Ag;
+   p->Agt = SCS_NULL;
+
+   cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
+   cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
+   cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
+
+   cudaMalloc((void **)&p->p, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
+   cudaMalloc((void **)&p->tmp_m,
+              A->m * sizeof(scs_float)); /* intermediate result */
+   cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
+
+   cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
+              cudaMemcpyHostToDevice);
+   cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
+   cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
+              cudaMemcpyHostToDevice);
+
+   get_preconditioner(A, stgs, p);
+
+ #if GPU_TRANSPOSE_MAT > 0
+   p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+   p->Agt->n = A->m;
+   p->Agt->m = A->n;
+   p->Agt->Annz = A->p[A->n];
+   p->Agt->descr = 0;
+   /* Matrix description */
+   cusparseCreateMatDescr(&p->Agt->descr);
+   cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+   cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
+
+   cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
+   cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
+   cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
+   /* transpose Ag into Agt for faster multiplies */
+   /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
+   CUSPARSE(csr2csc)
+   (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
+    p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
+ #endif
+
+   err = cudaGetLastError();
+   if (err != cudaSuccess) {
+     printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
+            cudaGetErrorString(err));
+     SCS(free_lin_sys_work)(p);
+     return SCS_NULL;
+   }
+   return p;
+ }
+
+ static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
+                                   scs_float *z, scs_float *r, scs_int n) {
+   cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+   CUBLAS(tbmv)
+   (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
+    0, M, 1, z, 1);
+ }
+
+ /* solves (RHO_X * I + A'A)x = b, s warm start, solution stored in bg (on GPU) */
+ static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
+                    ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
+                    scs_int max_its, scs_float tol) {
+   scs_int i, n = A->n;
+   scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
+   scs_float onef = 1.0, neg_onef = -1.0;
+   scs_float *p = pr->p;   /* cg direction */
+   scs_float *Gp = pr->Gp; /* updated CG direction */
+   scs_float *r = pr->r;   /* cg residual */
+   scs_float *z = pr->z;   /* preconditioned */
+   scs_float *M = pr->M;   /* preconditioner */
+   cublasHandle_t cublas_handle = pr->cublas_handle;
+
+   if (s == SCS_NULL) {
+     cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+     cudaMemset(bg, 0, n * sizeof(scs_float));
+   } else {
+     /* p contains bg temporarily */
+     cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+     /* bg contains s */
+     cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
+     mat_vec(A, stgs, pr, bg, r);
+     CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
+     CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
+   }
+
+   /* for some reason nrm2 is VERY slow */
+   /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
+   CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
+   nrm_r = SQRTF(nrm_r);
+   /* check to see if we need to run CG at all */
+   if (nrm_r < MIN(tol, 1e-18)) {
+     return 0;
+   }
+
+   apply_pre_conditioner(cublas_handle, M, z, r, n);
+   CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+   /* put z in p, replacing temp mem */
+   cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+
+   for (i = 0; i < max_its; ++i) {
+     mat_vec(A, stgs, pr, p, Gp);
+
+     CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
+
+     alpha = ipzr / p_gp;
+     neg_alpha = -alpha;
+
+     CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
+     CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
+
+     /* for some reason nrm2 is VERY slow */
+     /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
+     CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
+     nrm_r = SQRTF(nrm_r);
+     if (nrm_r < tol) {
+       i++;
+       break;
+     }
+     ipzr_old = ipzr;
+     apply_pre_conditioner(cublas_handle, M, z, r, n);
+     CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+
+     beta = ipzr / ipzr_old;
+     CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
+     CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
+   }
+ #if EXTRA_VERBOSE > 0
+   scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
+ #endif
+   return i;
+ }
+
+ scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
+                            ScsLinSysWork *p, scs_float *b, const scs_float *s,
+                            scs_int iter) {
+   scs_int cg_its;
+   SCS(timer) linsys_timer;
+   scs_float *bg = p->bg;
+   scs_float neg_onef = -1.0;
+   ScsGpuMatrix *Ag = p->Ag;
+   scs_float cg_tol =
+       SCS(norm)(b, Ag->n) *
+       (iter < 0 ? CG_BEST_TOL
+                 : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
+   SCS(tic)(&linsys_timer);
+   /* all on GPU */
+   cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
+              cudaMemcpyHostToDevice);
+   SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
+   /* solves (RHO_X * I + A'A)x = b, s warm start, solution stored in b */
+   cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
+   CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
+   SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
+   cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
+              cudaMemcpyDeviceToHost);
+
+   if (iter >= 0) {
+     p->tot_cg_its += cg_its;
+   }
+
+   p->total_solve_time += SCS(tocq)(&linsys_timer);
+ #if EXTRA_VERBOSE > 0
+   scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
+ #endif
+   return 0;
+ }
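
For reference, the call sequence in SCS(solve_lin_sys) is the usual reduction of SCS's quasidefinite system to normal equations (this is a reading of the code above, not text shipped with the gem). Writing b = (b1, b2) with b1 of length n and b2 of length m, the system

    [RHO_X * I   A'] [x]   [b1]
    [A          -I ] [y] = [b2]

reduces to

    (RHO_X * I + A'A) x = b1 + A'b2,    y = A x - b2.

The code forms b1 + A'b2 in-place with _accum_by_atrans_gpu, solves the reduced system with pcg, then recovers y = A x - b2 via the final scal (negating b2) and _accum_by_a_gpu calls. The Jacobi preconditioner built in get_preconditioner, M[i] = 1 / (rho_x + ||a_i||^2), is exactly inv(diag(RHO_X * I + A'A)), since the i-th diagonal entry of A'A is the squared norm of column i of A.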