scs 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +12 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +98 -0
  5. data/ext/scs/extconf.rb +29 -0
  6. data/lib/scs.rb +17 -0
  7. data/lib/scs/ffi.rb +117 -0
  8. data/lib/scs/solver.rb +173 -0
  9. data/lib/scs/version.rb +3 -0
  10. data/vendor/scs/LICENSE.txt +21 -0
  11. data/vendor/scs/Makefile +164 -0
  12. data/vendor/scs/README.md +222 -0
  13. data/vendor/scs/include/aa.h +56 -0
  14. data/vendor/scs/include/cones.h +46 -0
  15. data/vendor/scs/include/ctrlc.h +33 -0
  16. data/vendor/scs/include/glbopts.h +177 -0
  17. data/vendor/scs/include/linalg.h +26 -0
  18. data/vendor/scs/include/linsys.h +64 -0
  19. data/vendor/scs/include/normalize.h +18 -0
  20. data/vendor/scs/include/rw.h +17 -0
  21. data/vendor/scs/include/scs.h +161 -0
  22. data/vendor/scs/include/scs_blas.h +51 -0
  23. data/vendor/scs/include/util.h +65 -0
  24. data/vendor/scs/linsys/amatrix.c +305 -0
  25. data/vendor/scs/linsys/amatrix.h +36 -0
  26. data/vendor/scs/linsys/amatrix.o +0 -0
  27. data/vendor/scs/linsys/cpu/direct/private.c +366 -0
  28. data/vendor/scs/linsys/cpu/direct/private.h +26 -0
  29. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  30. data/vendor/scs/linsys/cpu/indirect/private.c +256 -0
  31. data/vendor/scs/linsys/cpu/indirect/private.h +31 -0
  32. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  33. data/vendor/scs/linsys/external/amd/LICENSE.txt +934 -0
  34. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +469 -0
  35. data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +254 -0
  36. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  37. data/vendor/scs/linsys/external/amd/amd.h +400 -0
  38. data/vendor/scs/linsys/external/amd/amd_1.c +180 -0
  39. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  40. data/vendor/scs/linsys/external/amd/amd_2.c +1842 -0
  41. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  42. data/vendor/scs/linsys/external/amd/amd_aat.c +184 -0
  43. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  44. data/vendor/scs/linsys/external/amd/amd_control.c +64 -0
  45. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  46. data/vendor/scs/linsys/external/amd/amd_defaults.c +37 -0
  47. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  48. data/vendor/scs/linsys/external/amd/amd_dump.c +179 -0
  49. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  50. data/vendor/scs/linsys/external/amd/amd_global.c +16 -0
  51. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  52. data/vendor/scs/linsys/external/amd/amd_info.c +119 -0
  53. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  54. data/vendor/scs/linsys/external/amd/amd_internal.h +304 -0
  55. data/vendor/scs/linsys/external/amd/amd_order.c +199 -0
  56. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  57. data/vendor/scs/linsys/external/amd/amd_post_tree.c +120 -0
  58. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  59. data/vendor/scs/linsys/external/amd/amd_postorder.c +206 -0
  60. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  61. data/vendor/scs/linsys/external/amd/amd_preprocess.c +118 -0
  62. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  63. data/vendor/scs/linsys/external/amd/amd_valid.c +92 -0
  64. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  65. data/vendor/scs/linsys/external/amd/changes +11 -0
  66. data/vendor/scs/linsys/external/qdldl/LICENSE +201 -0
  67. data/vendor/scs/linsys/external/qdldl/README.md +120 -0
  68. data/vendor/scs/linsys/external/qdldl/changes +4 -0
  69. data/vendor/scs/linsys/external/qdldl/qdldl.c +298 -0
  70. data/vendor/scs/linsys/external/qdldl/qdldl.h +177 -0
  71. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  72. data/vendor/scs/linsys/external/qdldl/qdldl_types.h +21 -0
  73. data/vendor/scs/linsys/gpu/gpu.c +41 -0
  74. data/vendor/scs/linsys/gpu/gpu.h +85 -0
  75. data/vendor/scs/linsys/gpu/indirect/private.c +304 -0
  76. data/vendor/scs/linsys/gpu/indirect/private.h +36 -0
  77. data/vendor/scs/scs.mk +181 -0
  78. data/vendor/scs/src/aa.c +224 -0
  79. data/vendor/scs/src/aa.o +0 -0
  80. data/vendor/scs/src/cones.c +802 -0
  81. data/vendor/scs/src/cones.o +0 -0
  82. data/vendor/scs/src/ctrlc.c +77 -0
  83. data/vendor/scs/src/ctrlc.o +0 -0
  84. data/vendor/scs/src/linalg.c +84 -0
  85. data/vendor/scs/src/linalg.o +0 -0
  86. data/vendor/scs/src/normalize.c +93 -0
  87. data/vendor/scs/src/normalize.o +0 -0
  88. data/vendor/scs/src/rw.c +167 -0
  89. data/vendor/scs/src/rw.o +0 -0
  90. data/vendor/scs/src/scs.c +978 -0
  91. data/vendor/scs/src/scs.o +0 -0
  92. data/vendor/scs/src/scs_version.c +5 -0
  93. data/vendor/scs/src/scs_version.o +0 -0
  94. data/vendor/scs/src/util.c +196 -0
  95. data/vendor/scs/src/util.o +0 -0
  96. data/vendor/scs/test/data/small_random_socp +0 -0
  97. data/vendor/scs/test/minunit.h +13 -0
  98. data/vendor/scs/test/problem_utils.h +93 -0
  99. data/vendor/scs/test/problems/rob_gauss_cov_est.h +85 -0
  100. data/vendor/scs/test/problems/small_lp.h +50 -0
  101. data/vendor/scs/test/problems/small_random_socp.h +33 -0
  102. data/vendor/scs/test/random_socp_prob.c +171 -0
  103. data/vendor/scs/test/run_from_file.c +69 -0
  104. data/vendor/scs/test/run_tests +2 -0
  105. data/vendor/scs/test/run_tests.c +32 -0
  106. metadata +203 -0
data/vendor/scs/linsys/external/qdldl/qdldl.h
@@ -0,0 +1,177 @@
+ #ifndef QDLDL_H
+ #define QDLDL_H
+
+ // Include qdldl type options
+ #include "qdldl_types.h"
+
+ # ifdef __cplusplus
+ extern "C" {
+ # endif // ifdef __cplusplus
+
+ /**
+  * Compute the elimination tree for a quasidefinite matrix
+  * in compressed sparse column form, where the input matrix is
+  * assumed to contain data for the upper triangular part of A only,
+  * and there are no duplicate indices.
+  *
+  * Returns an elimination tree for the factorization A = LDL^T and a
+  * count of the nonzeros in each column of L that are strictly below the
+  * diagonal.
+  *
+  * Does not use MALLOC. It is assumed that the arrays work, Lnz, and
+  * etree will be allocated with a number of elements equal to n.
+  *
+  * The data in (n,Ap,Ai) are from a square matrix A in CSC format, and
+  * should include the upper triangular part of A only.
+  *
+  * This function is only intended for factorisation of QD matrices specified
+  * by their upper triangular part. An error is returned if any column has
+  * data below the diagonal or is completely empty.
+  *
+  * For matrices with a non-empty column but a zero on the corresponding diagonal,
+  * this function will *not* return an error, as it may still be possible to factor
+  * such a matrix in LDL form. No promises are made in this case though...
+  *
+  * @param n      number of columns in CSC matrix A (assumed square)
+  * @param Ap     column pointers (size n+1) for columns of A
+  * @param Ai     row indices of A. Has Ap[n] elements
+  * @param work   work vector (size n) (no meaning on return)
+  * @param Lnz    count of nonzeros in each column of L (size n) below diagonal
+  * @param etree  elimination tree (size n)
+  * @return total sum of Lnz (i.e. total nonzeros in L below diagonal). Returns
+  *         -1 if the input does not have triu structure or has an empty
+  *         column.
+  *
+  */
+
+ QDLDL_int QDLDL_etree(const QDLDL_int n,
+                       const QDLDL_int* Ap,
+                       const QDLDL_int* Ai,
+                       QDLDL_int* work,
+                       QDLDL_int* Lnz,
+                       QDLDL_int* etree);
+
+ /**
+  * Compute an LDL decomposition for a quasidefinite matrix
+  * in compressed sparse column form, where the input matrix is
+  * assumed to contain data for the upper triangular part of A only,
+  * and there are no duplicate indices.
+  *
+  * Returns factors L, D and Dinv = 1./D.
+  *
+  * Does not use MALLOC. It is assumed that L will be a compressed
+  * sparse column matrix with data (Ln,Lp,Li) with sufficient space
+  * allocated, with a number of nonzeros equal to the count given
+  * as a return value by QDLDL_etree
+  *
+  * @param n      number of columns in L and A (both square)
+  * @param Ap     column pointers (size n+1) for columns of A
+  * @param Ai     row indices of A. Has Ap[n] elements
+  * @param Ln     number of columns in CSC matrix L
+  * @param Lp     column pointers (size Ln+1) for columns of L
+  * @param Li     row indices of L. Has Lp[Ln] elements
+  * @param D      vectorized factor D. Length is n
+  * @param Dinv   reciprocal of D. Length is n
+  * @param Lnz    count of nonzeros in each column of L below diagonal,
+  *               as given by QDLDL_etree (not modified)
+  * @param etree  elimination tree as given by QDLDL_etree (not modified)
+  * @param bwork  working array of bools. Length is n
+  * @param iwork  working array of integers. Length is 3*n
+  * @param fwork  working array of floats. Length is n
+  * @return Returns a count of the number of positive elements
+  *         in D. Returns -1 and exits immediately if any element
+  *         of D evaluates exactly to zero (matrix is not quasidefinite
+  *         or otherwise LDL factorisable)
+  *
+  */
+
+ QDLDL_int QDLDL_factor(const QDLDL_int n,
+                        const QDLDL_int* Ap,
+                        const QDLDL_int* Ai,
+                        const QDLDL_float* Ax,
+                        QDLDL_int* Lp,
+                        QDLDL_int* Li,
+                        QDLDL_float* Lx,
+                        QDLDL_float* D,
+                        QDLDL_float* Dinv,
+                        const QDLDL_int* Lnz,
+                        const QDLDL_int* etree,
+                        QDLDL_bool* bwork,
+                        QDLDL_int* iwork,
+                        QDLDL_float* fwork);
+
+ /**
+  * Solves LDL'x = b
+  *
+  * It is assumed that L will be a compressed
+  * sparse column matrix with data (Ln,Lp,Li).
+  *
+  * @param n     number of columns in L (both square)
+  * @param Ln    number of columns in CSC matrix L
+  * @param Lp    column pointers (size Ln+1) for columns of L
+  * @param Li    row indices of L. Has Lp[Ln] elements
+  * @param Dinv  reciprocal of D. Length is n
+  * @param x     initialized to b. Equal to x on return
+  *
+  */
+ void QDLDL_solve(const QDLDL_int n,
+                  const QDLDL_int* Lp,
+                  const QDLDL_int* Li,
+                  const QDLDL_float* Lx,
+                  const QDLDL_float* Dinv,
+                  QDLDL_float* x);
+
+ /**
+  * Solves (L+I)x = b
+  *
+  * It is assumed that L will be a compressed
+  * sparse column matrix with data (Ln,Lp,Li).
+  *
+  * @param n     number of columns in L (both square)
+  * @param Ln    number of columns in CSC matrix L
+  * @param Lp    column pointers (size Ln+1) for columns of L
+  * @param Li    row indices of L. Has Lp[Ln] elements
+  * @param Dinv  reciprocal of D. Length is n
+  * @param x     initialized to b. Equal to x on return
+  *
+  */
+
+ void QDLDL_Lsolve(const QDLDL_int n,
+                   const QDLDL_int* Lp,
+                   const QDLDL_int* Li,
+                   const QDLDL_float* Lx,
+                   QDLDL_float* x);
+
+ /**
+  * Solves (L+I)'x = b
+  *
+  * It is assumed that L will be a compressed
+  * sparse column matrix with data (Ln,Lp,Li).
+  *
+  * @param n     number of columns in L (both square)
+  * @param Ln    number of columns in CSC matrix L
+  * @param Lp    column pointers (size Ln+1) for columns of L
+  * @param Li    row indices of L. Has Lp[Ln] elements
+  * @param Dinv  reciprocal of D. Length is n
+  * @param x     initialized to b. Equal to x on return
+  *
+  */
+
+ void QDLDL_Ltsolve(const QDLDL_int n,
+                    const QDLDL_int* Lp,
+                    const QDLDL_int* Li,
+                    const QDLDL_float* Lx,
+                    QDLDL_float* x);
+
+ # ifdef __cplusplus
+ }
+ # endif // ifdef __cplusplus
+
+ #endif // ifndef QDLDL_H
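The three declarations above form a two-pass pipeline: QDLDL_etree for the symbolic analysis, QDLDL_factor for the numeric factorization, and QDLDL_solve for the triangular solves. A minimal usage sketch, assuming the vendored include path (so QDLDL_int/QDLDL_float resolve to scs_int/scs_float via glbopts.h); the 2x2 matrix and the fixed buffer sizes are illustrative, not part of the package:

/* qdldl_example.c: two-pass QDLDL usage sketch (illustrative only).
   Solves A x = b for the quasidefinite A = [4 1; 1 -3], supplied as its
   upper triangle in CSC form. */
#include <stdio.h>
#include "qdldl.h"

int main(void) {
  const QDLDL_int n = 2;
  QDLDL_int   Ap[] = {0, 1, 3};        /* column pointers (upper triangle) */
  QDLDL_int   Ai[] = {0, 0, 1};        /* row indices */
  QDLDL_float Ax[] = {4.0, 1.0, -3.0}; /* values */

  QDLDL_int etree[2], Lnz[2], iwork[3 * 2];
  QDLDL_bool bwork[2];
  QDLDL_float fwork[2], D[2], Dinv[2];

  /* pass 1: elimination tree; returns nnz(L) below the diagonal, -1 on error */
  QDLDL_int sum_Lnz = QDLDL_etree(n, Ap, Ai, iwork, Lnz, etree);
  if (sum_Lnz < 0) return 1; /* not upper-triangular, or an empty column */

  /* pass 2: numeric factorization A = L D L'; for this matrix sum_Lnz == 1,
     so fixed-size L buffers suffice (in general allocate sum_Lnz entries) */
  QDLDL_int Lp[3], Li[1];
  QDLDL_float Lx[1];
  if (QDLDL_factor(n, Ap, Ai, Ax, Lp, Li, Lx, D, Dinv, Lnz, etree, bwork,
                   iwork, fwork) < 0) return 1; /* zero pivot in D */

  /* in-place solve: x enters holding b and exits holding the solution */
  QDLDL_float x[] = {1.0, 2.0};
  QDLDL_solve(n, Lp, Li, Lx, Dinv, x);
  printf("x = [%g, %g]\n", (double)x[0], (double)x[1]); /* ~[0.3846, -0.5385] */
  return 0;
}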
data/vendor/scs/linsys/external/qdldl/qdldl_types.h
@@ -0,0 +1,21 @@
+ #ifndef QDLDL_TYPES_H
+ # define QDLDL_TYPES_H
+
+ #include "glbopts.h"
+
+ # ifdef __cplusplus
+ extern "C" {
+ # endif /* ifdef __cplusplus */
+
+ // QDLDL integer and float types
+
+ #define QDLDL_int scs_int
+ #define QDLDL_float scs_float
+ #define QDLDL_bool scs_int
+
+ # ifdef __cplusplus
+ }
+ # endif /* ifdef __cplusplus */
+
+ #endif /* ifndef QDLDL_TYPES_H */
+
data/vendor/scs/linsys/gpu/gpu.c
@@ -0,0 +1,41 @@
+ #include "gpu.h"
+
+ void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
+                                scs_float *y, cusparseHandle_t cusparse_handle) {
+   /* y += A'*x
+      x and y MUST be on GPU already
+   */
+   const scs_float onef = 1.0;
+   CUSPARSE(csrmv)
+   (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, Ag->n, Ag->m, Ag->Annz,
+    &onef, Ag->descr, Ag->x, Ag->p, Ag->i, x, &onef, y);
+ }
+
+ void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
+                           scs_float *y, cusparseHandle_t cusparse_handle) {
+   /* y += A*x
+      x and y MUST be on GPU already
+   */
+   const scs_float onef = 1.0;
+   /* The A matrix idx pointers must be ORDERED */
+   CUSPARSE(csrmv)
+   (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, Ag->n, Ag->m, Ag->Annz, &onef,
+    Ag->descr, Ag->x, Ag->p, Ag->i, x, &onef, y);
+ }
+
+ void SCS(free_gpu_matrix)(ScsGpuMatrix *A) {
+   cudaFree(A->x);
+   cudaFree(A->i);
+   cudaFree(A->p);
+   cusparseDestroyMatDescr(A->descr);
+ }
+
+ void SCS(normalize_a)(ScsMatrix *A, const ScsSettings *stgs, const ScsCone *k,
+                       ScsScaling *scal) {
+   SCS(_normalize_a)(A, stgs, k, scal);
+ }
+
+ void SCS(un_normalize_a)(ScsMatrix *A, const ScsSettings *stgs,
+                          const ScsScaling *scal) {
+   SCS(_un_normalize_a)(A, stgs, scal);
+ }
data/vendor/scs/linsys/gpu/gpu.h
@@ -0,0 +1,85 @@
+ #ifndef SCSGPU_H_GUARD
+ #define SCSGPU_H_GUARD
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #include <cublas_v2.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include <cusparse.h>
+
+ #include "amatrix.h"
+ #include "glbopts.h"
+ #include "linalg.h"
+ #include "linsys.h"
+ #include "scs.h"
+ #include "util.h"
+
+ #define CUDA_CHECK_ERR                                                      \
+   do {                                                                      \
+     cudaError_t err = cudaGetLastError();                                   \
+     if (err != cudaSuccess) {                                               \
+       printf("%s:%d:%s\n ERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,   \
+              cudaGetErrorString(err));                                      \
+     }                                                                       \
+   } while (0)
+
+ #ifndef EXTRA_VERBOSE
+ #ifndef SFLOAT
+ #define CUBLAS(x) cublasD##x
+ #define CUSPARSE(x) cusparseD##x
+ #else
+ #define CUBLAS(x) cublasS##x
+ #define CUSPARSE(x) cusparseS##x
+ #endif
+ #else
+ #ifndef SFLOAT
+ #define CUBLAS(x) \
+   CUDA_CHECK_ERR; \
+   cublasD##x
+ #define CUSPARSE(x) \
+   CUDA_CHECK_ERR;   \
+   cusparseD##x
+ #else
+ #define CUBLAS(x) \
+   CUDA_CHECK_ERR; \
+   cublasS##x
+ #define CUSPARSE(x) \
+   CUDA_CHECK_ERR;   \
+   cusparseS##x
+ #endif
+ #endif
+
+ /*
+  CUDA matrix routines only for CSR, not CSC matrices:
+    CSC         CSR         GPU    Mult
+    A  (m x n)  A' (n x m)  Ag     accum_by_a_trans_gpu
+    A' (n x m)  A  (m x n)  Agt    accum_by_a_gpu
+ */
+
+ /* this struct defines the data matrix A on GPU */
+ typedef struct SCS_GPU_A_DATA_MATRIX {
+   /* A is supplied in column compressed format */
+   scs_float *x; /* A values, size: NNZ A */
+   scs_int *i;   /* A row index, size: NNZ A */
+   scs_int *p;   /* A column pointer, size: n+1 */
+   scs_int m, n; /* m rows, n cols */
+   scs_int Annz; /* num non-zeros in A matrix */
+   /* CUDA */
+   cusparseMatDescr_t descr;
+ } ScsGpuMatrix;
+
+ void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *A, const scs_float *x,
+                                scs_float *y, cusparseHandle_t cusparse_handle);
+
+ void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *A, const scs_float *x,
+                           scs_float *y, cusparseHandle_t cusparse_handle);
+
+ void SCS(free_gpu_matrix)(ScsGpuMatrix *A);
+
+ #ifdef __cplusplus
+ }
+ #endif
+ #endif
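The CSC/CSR table above is the central trick of this backend: the CSC arrays of A, handed to cuSPARSE as if they were CSR, describe A' with no conversion at all, so y += A'x needs no transpose op and y += Ax uses CUSPARSE_OPERATION_TRANSPOSE (or the pre-transposed Agt). A standalone sketch of the same duality, assuming the legacy cusparse<t>csrmv API this vendored code targets (removed in newer CUDA toolkits); the 3x2 matrix and all names here are illustrative:

/* csc_trick_example.c: the CSC arrays of A (3x2), read as CSR, describe
   A' (2x3), so a NON_TRANSPOSE csrmv computes y += A'x (illustrative only). */
#include <stdio.h>
#include <cuda_runtime.h>
#include <cusparse.h>

int main(void) {
  /* A = [1 2; 0 3; 4 0], m = 3 rows, n = 2 cols, in CSC form: */
  const int m = 3, n = 2, nnz = 4;
  int    Ap[] = {0, 2, 4};       /* column pointers */
  int    Ai[] = {0, 2, 0, 1};    /* row indices */
  double Ax[] = {1.0, 4.0, 2.0, 3.0};
  double x[]  = {1.0, 1.0, 1.0}; /* length m */
  double y[]  = {0.0, 0.0};      /* length n, accumulates A'x */

  int *d_Ap, *d_Ai;
  double *d_Ax, *d_x, *d_y;
  cudaMalloc((void **)&d_Ap, sizeof(Ap));
  cudaMalloc((void **)&d_Ai, sizeof(Ai));
  cudaMalloc((void **)&d_Ax, sizeof(Ax));
  cudaMalloc((void **)&d_x, sizeof(x));
  cudaMalloc((void **)&d_y, sizeof(y));
  cudaMemcpy(d_Ap, Ap, sizeof(Ap), cudaMemcpyHostToDevice);
  cudaMemcpy(d_Ai, Ai, sizeof(Ai), cudaMemcpyHostToDevice);
  cudaMemcpy(d_Ax, Ax, sizeof(Ax), cudaMemcpyHostToDevice);
  cudaMemcpy(d_x, x, sizeof(x), cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, y, sizeof(y), cudaMemcpyHostToDevice);

  cusparseHandle_t handle;
  cusparseCreate(&handle);
  cusparseMatDescr_t descr;
  cusparseCreateMatDescr(&descr);
  cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

  const double one = 1.0;
  /* the arrays describe A' (n x m) in CSR, so no transpose op is needed */
  cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, m, nnz, &one,
                 descr, d_Ax, d_Ap, d_Ai, d_x, &one, d_y);

  cudaMemcpy(y, d_y, sizeof(y), cudaMemcpyDeviceToHost);
  printf("A'x = [%g, %g]\n", y[0], y[1]); /* expect [5, 5] */

  cusparseDestroyMatDescr(descr);
  cusparseDestroy(handle);
  cudaFree(d_Ap); cudaFree(d_Ai); cudaFree(d_Ax); cudaFree(d_x); cudaFree(d_y);
  return 0;
}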
data/vendor/scs/linsys/gpu/indirect/private.c
@@ -0,0 +1,304 @@
+ #include "private.h"
+
+ #define CG_BEST_TOL 1e-9
+ #define CG_MIN_TOL 1e-1
+
+ /* do not use within pcg, reuses memory */
+ void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
+                           const scs_float *x, scs_float *y) {
+   scs_float *v_m = p->tmp_m;
+   scs_float *v_n = p->r;
+   cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
+   cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
+   cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ }
+
+ /* do not use within pcg, reuses memory */
+ void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
+                      scs_float *y) {
+   scs_float *v_m = p->tmp_m;
+   scs_float *v_n = p->r;
+   cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
+ #if GPU_TRANSPOSE_MAT > 0
+   SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
+ #else
+   SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
+ #endif
+   cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ }
+
+ char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
+   char *str = (char *)scs_malloc(sizeof(char) * 128);
+   sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
+           (long)A->p[A->n], stgs->cg_rate);
+   return str;
+ }
+
+ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
+   char *str = (char *)scs_malloc(sizeof(char) * 128);
+   sprintf(str,
+           "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
+           (scs_float)p->tot_cg_its / (info->iter + 1),
+           p->total_solve_time / (info->iter + 1) / 1e3);
+   p->tot_cg_its = 0;
+   p->total_solve_time = 0;
+   return str;
+ }
+
+ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
+   if (p) {
+     cudaFree(p->p);
+     cudaFree(p->r);
+     cudaFree(p->Gp);
+     cudaFree(p->bg);
+     cudaFree(p->tmp_m);
+     cudaFree(p->z);
+     cudaFree(p->M);
+     if (p->Ag) {
+       SCS(free_gpu_matrix)(p->Ag);
+       scs_free(p->Ag);
+     }
+     if (p->Agt) {
+       SCS(free_gpu_matrix)(p->Agt);
+       scs_free(p->Agt);
+     }
+     cusparseDestroy(p->cusparse_handle);
+     cublasDestroy(p->cublas_handle);
+     /* Don't reset because it interferes with other GPU programs. */
+     /* cudaDeviceReset(); */
+     scs_free(p);
+   }
+ }
+
+ /* y = (RHO_X * I + A'A)x */
+ static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
+                     ScsLinSysWork *p, const scs_float *x, scs_float *y) {
+   /* x and y MUST already be loaded to GPU */
+   scs_float *tmp_m = p->tmp_m; /* temp memory */
+   cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
+   SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
+   cudaMemset(y, 0, A->n * sizeof(scs_float));
+   SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
+   CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
+ }
+
+ /* M = inv ( diag ( RHO_X * I + A'A ) ) */
+ static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
+                                ScsLinSysWork *p) {
+   scs_int i;
+   scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
+
+ #if EXTRA_VERBOSE > 0
+   scs_printf("getting pre-conditioner\n");
+ #endif
+
+   for (i = 0; i < A->n; ++i) {
+     M[i] = 1 / (stgs->rho_x +
+                 SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
+     /* M[i] = 1; */
+   }
+   cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   scs_free(M);
+
+ #if EXTRA_VERBOSE > 0
+   scs_printf("finished getting pre-conditioner\n");
+ #endif
+ }
+
+ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
+                                       const ScsSettings *stgs) {
+   cudaError_t err;
+   ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
+   ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+
+   p->cublas_handle = 0;
+   p->cusparse_handle = 0;
+
+   p->total_solve_time = 0;
+   p->tot_cg_its = 0;
+
+   /* Get handle to the CUBLAS context */
+   cublasCreate(&p->cublas_handle);
+
+   /* Get handle to the CUSPARSE context */
+   cusparseCreate(&p->cusparse_handle);
+
+   Ag->n = A->n;
+   Ag->m = A->m;
+   Ag->Annz = A->p[A->n];
+   Ag->descr = 0;
+   /* Matrix description */
+   cusparseCreateMatDescr(&Ag->descr);
+   cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+   cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
+   p->Ag = Ag;
+   p->Agt = SCS_NULL;
+
+   cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
+   cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
+   cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
+
+   cudaMalloc((void **)&p->p, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
+   cudaMalloc((void **)&p->tmp_m,
+              A->m * sizeof(scs_float)); /* intermediate result */
+   cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
+
+   cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
+              cudaMemcpyHostToDevice);
+   cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
+   cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
+              cudaMemcpyHostToDevice);
+
+   get_preconditioner(A, stgs, p);
+
+ #if GPU_TRANSPOSE_MAT > 0
+   p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+   p->Agt->n = A->m;
+   p->Agt->m = A->n;
+   p->Agt->Annz = A->p[A->n];
+   p->Agt->descr = 0;
+   /* Matrix description */
+   cusparseCreateMatDescr(&p->Agt->descr);
+   cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+   cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
+
+   cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
+   cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
+   cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
+   /* transpose Ag into Agt for faster multiplies */
+   /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
+   CUSPARSE(csr2csc)
+   (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
+    p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
+ #endif
+
+   err = cudaGetLastError();
+   if (err != cudaSuccess) {
+     printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
+            cudaGetErrorString(err));
+     SCS(free_lin_sys_work)(p);
+     return SCS_NULL;
+   }
+   return p;
+ }
+
+ static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
+                                   scs_float *z, scs_float *r, scs_int n) {
+   cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+   CUBLAS(tbmv)
+   (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
+    0, M, 1, z, 1);
+ }
+
+ /* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
+ static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
+                    ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
+                    scs_int max_its, scs_float tol) {
+   scs_int i, n = A->n;
+   scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
+   scs_float onef = 1.0, neg_onef = -1.0;
+   scs_float *p = pr->p;   /* cg direction */
+   scs_float *Gp = pr->Gp; /* updated CG direction */
+   scs_float *r = pr->r;   /* cg residual */
+   scs_float *z = pr->z;   /* preconditioned */
+   scs_float *M = pr->M;   /* preconditioner */
+   cublasHandle_t cublas_handle = pr->cublas_handle;
+
+   if (s == SCS_NULL) {
+     cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+     cudaMemset(bg, 0, n * sizeof(scs_float));
+   } else {
+     /* p contains bg temporarily */
+     cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+     /* bg contains s */
+     cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
+     mat_vec(A, stgs, pr, bg, r);
+     CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
+     CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
+   }
+
+   /* for some reason nrm2 is VERY slow */
+   /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
+   CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
+   nrm_r = SQRTF(nrm_r);
+   /* check to see if we need to run CG at all */
+   if (nrm_r < MIN(tol, 1e-18)) {
+     return 0;
+   }
+
+   apply_pre_conditioner(cublas_handle, M, z, r, n);
+   CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+   /* put z in p, replacing temp mem */
+   cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+
+   for (i = 0; i < max_its; ++i) {
+     mat_vec(A, stgs, pr, p, Gp);
+
+     CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
+
+     alpha = ipzr / p_gp;
+     neg_alpha = -alpha;
+
+     CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
+     CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
+
+     /* for some reason nrm2 is VERY slow */
+     /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
+     CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
+     nrm_r = SQRTF(nrm_r);
+     if (nrm_r < tol) {
+       i++;
+       break;
+     }
+     ipzr_old = ipzr;
+     apply_pre_conditioner(cublas_handle, M, z, r, n);
+     CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+
+     beta = ipzr / ipzr_old;
+     CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
+     CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
+   }
+ #if EXTRA_VERBOSE > 0
+   scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
+ #endif
+   return i;
+ }
+
+ scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
+                            ScsLinSysWork *p, scs_float *b, const scs_float *s,
+                            scs_int iter) {
+   scs_int cg_its;
+   SCS(timer) linsys_timer;
+   scs_float *bg = p->bg;
+   scs_float neg_onef = -1.0;
+   ScsGpuMatrix *Ag = p->Ag;
+   scs_float cg_tol =
+       SCS(norm)(b, Ag->n) *
+       (iter < 0 ? CG_BEST_TOL
+                 : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
+   SCS(tic)(&linsys_timer);
+   /* all on GPU */
+   cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
+   SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
+   /* solves (I+A'A)x = b, s warm start, solution stored in b */
+   cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
+   CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
+   SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
+   cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
+
+   if (iter >= 0) {
+     p->tot_cg_its += cg_its;
+   }
+
+   p->total_solve_time += SCS(tocq)(&linsys_timer);
+ #if EXTRA_VERBOSE > 0
+   scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
+ #endif
+   return 0;
+ }
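Reading solve_lin_sys together with pcg: the routine block-eliminates the indirect system [rho_x * I, A'; A, -I][x; y] = [b1; b2], first forming b1 + A'b2 (the accum_by_atrans call), then CG-solving (rho_x * I + A'A)x = b1 + A'b2 to tolerance cg_tol ~ ||b1|| * CG_MIN_TOL / (iter+1)^cg_rate, and finally recovering y = Ax - b2 (the scal by -1 followed by accum_by_a). Below is a plain-C reference of the same Jacobi-preconditioned CG recurrence, with a dense matvec standing in for the cuSPARSE kernels; pcg_ref, MAX_N, and the 2x2 system are illustrative assumptions, not SCS code:

/* pcg_ref.c: CPU reference for the preconditioned CG loop above. */
#include <math.h>
#include <stdio.h>

#define MAX_N 16 /* sketch-only cap so buffers can live on the stack */

/* Solves G x = b for symmetric positive definite G (dense, row-major),
   with the Jacobi preconditioner M = inv(diag(G)), mirroring pcg(). */
static int pcg_ref(int n, const double *G, const double *b, double *x,
                   int max_its, double tol) {
  double r[MAX_N], z[MAX_N], p[MAX_N], Gp[MAX_N], M[MAX_N];
  double ipzr = 0.0;
  int i, j, k;
  for (i = 0; i < n; ++i) {
    M[i] = 1.0 / G[i * n + i]; /* cf. get_preconditioner() */
    x[i] = 0.0;                /* cold start: r = b - G*0 = b */
    r[i] = b[i];
    z[i] = M[i] * r[i];
    p[i] = z[i];
    ipzr += r[i] * z[i];
  }
  for (k = 0; k < max_its; ++k) {
    double pGp = 0.0, nrm_r = 0.0, alpha, beta, ipzr_old;
    for (i = 0; i < n; ++i) { /* Gp = G * p (mat_vec() on the GPU) */
      Gp[i] = 0.0;
      for (j = 0; j < n; ++j) Gp[i] += G[i * n + j] * p[j];
      pGp += p[i] * Gp[i];
    }
    alpha = ipzr / pGp;
    for (i = 0; i < n; ++i) {
      x[i] += alpha * p[i];  /* axpy onto the iterate */
      r[i] -= alpha * Gp[i]; /* axpy onto the residual */
      nrm_r += r[i] * r[i];
    }
    if (sqrt(nrm_r) < tol) return k + 1;
    ipzr_old = ipzr;
    ipzr = 0.0;
    for (i = 0; i < n; ++i) { /* re-apply the preconditioner */
      z[i] = M[i] * r[i];
      ipzr += r[i] * z[i];
    }
    beta = ipzr / ipzr_old;
    for (i = 0; i < n; ++i) p[i] = z[i] + beta * p[i]; /* scal + axpy */
  }
  return k;
}

int main(void) {
  const double G[] = {4.0, 1.0, 1.0, 3.0}; /* SPD 2x2 */
  const double b[] = {1.0, 2.0};
  double x[2];
  int its = pcg_ref(2, G, b, x, 100, 1e-9);
  printf("its = %d, x = [%g, %g]\n", its, x[0], x[1]); /* ~[1/11, 7/11] */
  return 0;
}

In exact arithmetic CG converges in at most n steps, so this 2x2 system finishes in at most two iterations; the GPU version instead caps iterations at Ag->n and loosens tol early on, since approximate solves suffice for the outer SCS iterations.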