scs 0.2.0 → 0.3.0

Files changed (103)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +17 -0
  3. data/LICENSE.txt +18 -18
  4. data/README.md +28 -9
  5. data/ext/scs/extconf.rb +29 -0
  6. data/lib/scs/ffi.rb +30 -13
  7. data/lib/scs/solver.rb +32 -14
  8. data/lib/scs/version.rb +1 -1
  9. data/vendor/scs/CITATION.cff +39 -0
  10. data/vendor/scs/CMakeLists.txt +272 -0
  11. data/vendor/scs/Makefile +24 -15
  12. data/vendor/scs/README.md +8 -216
  13. data/vendor/scs/include/aa.h +67 -23
  14. data/vendor/scs/include/cones.h +17 -17
  15. data/vendor/scs/include/glbopts.h +98 -32
  16. data/vendor/scs/include/linalg.h +2 -4
  17. data/vendor/scs/include/linsys.h +58 -44
  18. data/vendor/scs/include/normalize.h +3 -3
  19. data/vendor/scs/include/rw.h +8 -2
  20. data/vendor/scs/include/scs.h +293 -133
  21. data/vendor/scs/include/util.h +3 -15
  22. data/vendor/scs/linsys/cpu/direct/private.c +220 -224
  23. data/vendor/scs/linsys/cpu/direct/private.h +13 -7
  24. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  25. data/vendor/scs/linsys/cpu/indirect/private.c +177 -110
  26. data/vendor/scs/linsys/cpu/indirect/private.h +8 -4
  27. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  28. data/vendor/scs/linsys/csparse.c +87 -0
  29. data/vendor/scs/linsys/csparse.h +34 -0
  30. data/vendor/scs/linsys/csparse.o +0 -0
  31. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +1 -1
  32. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  33. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  34. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  35. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  36. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  37. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  38. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  39. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  40. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  41. data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
  42. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  43. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  44. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  45. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  46. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  47. data/vendor/scs/linsys/external/qdldl/changes +2 -0
  48. data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
  49. data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
  50. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  51. data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
  52. data/vendor/scs/linsys/gpu/gpu.c +58 -21
  53. data/vendor/scs/linsys/gpu/gpu.h +66 -28
  54. data/vendor/scs/linsys/gpu/indirect/private.c +368 -154
  55. data/vendor/scs/linsys/gpu/indirect/private.h +26 -12
  56. data/vendor/scs/linsys/scs_matrix.c +498 -0
  57. data/vendor/scs/linsys/scs_matrix.h +70 -0
  58. data/vendor/scs/linsys/scs_matrix.o +0 -0
  59. data/vendor/scs/scs.mk +13 -9
  60. data/vendor/scs/src/aa.c +384 -109
  61. data/vendor/scs/src/aa.o +0 -0
  62. data/vendor/scs/src/cones.c +440 -353
  63. data/vendor/scs/src/cones.o +0 -0
  64. data/vendor/scs/src/ctrlc.c +15 -5
  65. data/vendor/scs/src/ctrlc.o +0 -0
  66. data/vendor/scs/src/linalg.c +84 -28
  67. data/vendor/scs/src/linalg.o +0 -0
  68. data/vendor/scs/src/normalize.c +22 -64
  69. data/vendor/scs/src/normalize.o +0 -0
  70. data/vendor/scs/src/rw.c +161 -22
  71. data/vendor/scs/src/rw.o +0 -0
  72. data/vendor/scs/src/scs.c +768 -561
  73. data/vendor/scs/src/scs.o +0 -0
  74. data/vendor/scs/src/scs_indir.o +0 -0
  75. data/vendor/scs/src/scs_version.c +9 -3
  76. data/vendor/scs/src/scs_version.o +0 -0
  77. data/vendor/scs/src/util.c +37 -106
  78. data/vendor/scs/src/util.o +0 -0
  79. data/vendor/scs/test/minunit.h +17 -8
  80. data/vendor/scs/test/problem_utils.h +176 -14
  81. data/vendor/scs/test/problems/degenerate.h +130 -0
  82. data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
  83. data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
  84. data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
  85. data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
  86. data/vendor/scs/test/problems/random_prob +0 -0
  87. data/vendor/scs/test/problems/random_prob.h +45 -0
  88. data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
  89. data/vendor/scs/test/problems/small_lp.h +13 -14
  90. data/vendor/scs/test/problems/test_fails.h +43 -0
  91. data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
  92. data/vendor/scs/test/random_socp_prob.c +54 -53
  93. data/vendor/scs/test/rng.h +109 -0
  94. data/vendor/scs/test/run_from_file.c +19 -10
  95. data/vendor/scs/test/run_tests.c +27 -3
  96. metadata +30 -73
  97. data/ext/scs/Rakefile +0 -11
  98. data/vendor/scs/linsys/amatrix.c +0 -305
  99. data/vendor/scs/linsys/amatrix.h +0 -36
  100. data/vendor/scs/linsys/amatrix.o +0 -0
  101. data/vendor/scs/test/data/small_random_socp +0 -0
  102. data/vendor/scs/test/problems/small_random_socp.h +0 -33
  103. data/vendor/scs/test/run_tests +0 -2
data/vendor/scs/linsys/gpu/indirect/private.c
@@ -1,54 +1,89 @@
  #include "private.h"
+ #include "linsys.h"
 
- #define CG_BEST_TOL 1e-9
- #define CG_MIN_TOL 1e-1
-
- /* do not use within pcg, reuses memory */
- void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
-                           const scs_float *x, scs_float *y) {
-   scs_float *v_m = p->tmp_m;
-   scs_float *v_n = p->r;
-   cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
-   cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
-   SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
-   cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
- }
+ /* norm to use when deciding convergence */
+ /* should be consistent with CG_NORM in glbopts.h */
+ #define USE_L2_NORM (0)
 
- /* do not use within pcg, reuses memory */
- void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
-                      scs_float *y) {
-   scs_float *v_m = p->tmp_m;
-   scs_float *v_n = p->r;
-   cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
-   cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
- #if GPU_TRANSPOSE_MAT > 0
-   SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
+ static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
+                              scs_int n) {
+ #if USE_L2_NORM > 0
+   scs_float nrm;
+   CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
  #else
-   SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
+   scs_int idx;
+   scs_float nrm;
+   CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
+   /* NOTE: we take idx - 1 here since the routine above returns Fortran idxs */
+   cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
+   nrm = ABS(nrm);
  #endif
-   cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
+   return nrm;
  }
 
- char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
-   char *str = (char *)scs_malloc(sizeof(char) * 128);
-   sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
-           (long)A->p[A->n], stgs->cg_rate);
-   return str;
+ const char *SCS(get_lin_sys_method)() {
+   return "sparse-indirect GPU";
  }
 
+ /*
  char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
    char *str = (char *)scs_malloc(sizeof(char) * 128);
-   sprintf(str,
-           "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
-           (scs_float)p->tot_cg_its / (info->iter + 1),
-           p->total_solve_time / (info->iter + 1) / 1e3);
+   sprintf(str, "lin-sys: avg cg its: %2.2f\n",
+           (scs_float)p->tot_cg_its / (info->iter + 1));
    p->tot_cg_its = 0;
-   p->total_solve_time = 0;
    return str;
  }
+ */
+
+ /* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
+ static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
+   scs_int i, k;
+   const ScsMatrix *A = p->A;
+   const ScsMatrix *P = p->P;
+   scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
+
+ #if VERBOSITY > 0
+   scs_printf("getting pre-conditioner\n");
+ #endif
+
+   for (i = 0; i < A->n; ++i) { /* cols */
+     M[i] = p->rho_x;
+     /* diag(A' R_y^{-1} A) */
+     for (k = A->p[i]; k < A->p[i + 1]; ++k) {
+       /* A->i[k] is row of entry k with value A->x[k] */
+       M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
+     }
+     if (P) {
+       for (k = P->p[i]; k < P->p[i + 1]; k++) {
+         /* diagonal element only */
+         if (P->i[k] == i) { /* row == col */
+           M[i] += P->x[k];
+           break;
+         }
+       }
+     }
+     M[i] = 1. / M[i];
+   }
+   cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   scs_free(M);
+ #if VERBOSITY > 0
+   scs_printf("finished getting pre-conditioner\n");
+ #endif
+ }
+
+ /* rho_y_vec has changed: refresh R_y^{-1} on the GPU and the preconditioner */
+ void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
+   scs_int i;
+   for (i = 0; i < p->m; ++i)
+     p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
+   cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
+              cudaMemcpyHostToDevice);
+   set_preconditioner(p, rho_y_vec);
+ }
 
  void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
    if (p) {
+     scs_free(p->inv_rho_y_vec);
      cudaFree(p->p);
      cudaFree(p->r);
      cudaFree(p->Gp);
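
Editor's note on set_preconditioner above (a restatement of its loop, nothing beyond it): with $(\rho_y)_k$ the k-th entry of rho_y_vec, the diagonal it builds and inverts is

    $M_{ii} = \big(\rho_x + P_{ii} + \textstyle\sum_{k:\,A_{ki}\neq 0} A_{ki}^2/(\rho_y)_k\big)^{-1}$,

i.e. $M = \operatorname{diag}\big(\rho_x I + P + A^\top R_y^{-1} A\big)^{-1}$, exactly the comment on the function; pcg later applies it through scale_by_diag.
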
@@ -56,6 +91,11 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
      cudaFree(p->tmp_m);
      cudaFree(p->z);
      cudaFree(p->M);
+     cudaFree(p->inv_rho_y_vec_gpu);
+     if (p->Pg) {
+       SCS(free_gpu_matrix)(p->Pg);
+       scs_free(p->Pg);
+     }
      if (p->Ag) {
        SCS(free_gpu_matrix)(p->Ag);
        scs_free(p->Ag);
@@ -64,6 +104,12 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
        SCS(free_gpu_matrix)(p->Agt);
        scs_free(p->Agt);
      }
+     if (p->buffer != SCS_NULL) {
+       cudaFree(p->buffer);
+     }
+     cusparseDestroyDnVec(p->dn_vec_m);
+     cusparseDestroyDnVec(p->dn_vec_n);
+     cusparseDestroyDnVec(p->dn_vec_n_p);
      cusparseDestroy(p->cusparse_handle);
      cublasDestroy(p->cublas_handle);
      /* Don't reset because it interferes with other GPU programs. */
@@ -72,53 +118,110 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
    }
  }
 
- /*y = (RHO_X * I + A'A)x */
- static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
-                     ScsLinSysWork *p, const scs_float *x, scs_float *y) {
-   /* x and y MUST already be loaded to GPU */
-   scs_float *tmp_m = p->tmp_m; /* temp memory */
-   cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
-   SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
-   cudaMemset(y, 0, A->n * sizeof(scs_float));
-   SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
-   CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
+ /* z = M * z elementwise in place, assumes M, z on GPU */
+ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
+                           scs_float *z, scs_int n) {
+   CUBLAS(tbmv)
+   (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
+    0, M, 1, z, 1);
  }
 
- /* M = inv ( diag ( RHO_X * I + A'A ) ) */
- static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
-                                ScsLinSysWork *p) {
-   scs_int i;
-   scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
+ /* y = (rho_x * I + P + A' R_y^{-1} A) x */
+ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
+   /* x and y MUST already be loaded to GPU */
+   scs_float *z = p->tmp_m; /* temp memory */
+   cudaMemset(y, 0, p->n * sizeof(scs_float));
+   cudaMemset(z, 0, p->m * sizeof(scs_float));
+
+   cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
+   cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
+   cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
+
+   /* y = rho_x * x */
+   CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
+
+   if (p->Pg) {
+     /* y = rho_x * x + Px */
+     SCS(accum_by_p_gpu)
+     (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
+      &p->buffer);
+   }
 
- #if EXTRA_VERBOSE > 0
-   scs_printf("getting pre-conditioner\n");
+   /* z = Ax */
+ #if GPU_TRANSPOSE_MAT > 0
+   SCS(accum_by_atrans_gpu)
+   (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+    &p->buffer);
+ #else
+   SCS(accum_by_a_gpu)
+   (p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+    &p->buffer);
  #endif
+   /* z = R_y^{-1} A x */
+   scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
 
-   for (i = 0; i < A->n; ++i) {
-     M[i] = 1 / (stgs->rho_x +
-                 SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
-     /* M[i] = 1; */
-   }
-   cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
-   scs_free(M);
+   /* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
+   SCS(accum_by_atrans_gpu)
+   (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
+    &p->buffer);
+ }
 
- #if EXTRA_VERBOSE > 0
-   scs_printf("finished getting pre-conditioner\n");
- #endif
+ /* P comes in upper triangular, expand to full
+  * First compute triplet version of full matrix, then compress to csc
+  */
+ static csc *fill_p_matrix(const ScsMatrix *P) {
+   scs_int i, j, k, kk;
+   scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
+   csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
+   csc *P_full;
+   kk = 0;
+   for (j = 0; j < P->n; j++) { /* cols */
+     for (k = P->p[j]; k < P->p[j + 1]; k++) {
+       i = P->i[k]; /* row */
+       if (i > j) { /* only upper triangular needed */
+         break;
+       }
+       P_tmp->i[kk] = i;
+       P_tmp->p[kk] = j;
+       P_tmp->x[kk] = P->x[k];
+       kk++;
+       if (i == j) { /* diagonal */
+         continue;
+       }
+       P_tmp->i[kk] = j;
+       P_tmp->p[kk] = i;
+       P_tmp->x[kk] = P->x[k];
+       kk++;
+     }
+   }
+   P_tmp->nz = kk; /* set number of nonzeros */
+   P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
+   SCS(cs_spfree)(P_tmp);
+   return P_full;
  }
 
- ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
-                                       const ScsSettings *stgs) {
+ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
+                                       scs_float *rho_y_vec, scs_float rho_x) {
    cudaError_t err;
+   scs_int i;
+   csc *P_full;
    ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
-   ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+   ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+   ScsGpuMatrix *Pg = SCS_NULL;
 
+ #if GPU_TRANSPOSE_MAT > 0
+   size_t new_buffer_size = 0;
+ #endif
+
+   p->rho_x = rho_x;
    p->cublas_handle = 0;
    p->cusparse_handle = 0;
 
-   p->total_solve_time = 0;
    p->tot_cg_its = 0;
 
+   p->buffer_size = 0;
+   p->buffer = SCS_NULL;
+
    /* Get handle to the CUBLAS context */
    cublasCreate(&p->cublas_handle);
 
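Editor's sketch (not part of the gem): for readers without a GPU, this is what mat_vec above computes, written as plain CPU C. It assumes the SCS headers are available, CSC storage in ScsMatrix (column pointers p, row indices i, values x, as used by set_preconditioner), and that P has already been expanded to a full matrix the way fill_p_matrix does; the name ref_mat_vec is ours.

    /* y = (rho_x * I + P + A' * diag(1/rho_y) * A) * v, CPU reference */
    static void ref_mat_vec(const ScsMatrix *A, const ScsMatrix *P, /* P may be SCS_NULL */
                            scs_float rho_x, const scs_float *rho_y_vec,
                            const scs_float *v, scs_float *y,
                            scs_float *z /* scratch, length A->m */) {
      scs_int i, j, k;
      for (j = 0; j < A->n; ++j)
        y[j] = rho_x * v[j];                    /* y = rho_x * v */
      if (P) {                                  /* y += P v (P stored full) */
        for (j = 0; j < P->n; ++j)
          for (k = P->p[j]; k < P->p[j + 1]; ++k)
            y[P->i[k]] += P->x[k] * v[j];
      }
      for (i = 0; i < A->m; ++i)
        z[i] = 0.;
      for (j = 0; j < A->n; ++j)                /* z = A v */
        for (k = A->p[j]; k < A->p[j + 1]; ++k)
          z[A->i[k]] += A->x[k] * v[j];
      for (i = 0; i < A->m; ++i)
        z[i] /= rho_y_vec[i];                   /* z = R_y^{-1} A v */
      for (j = 0; j < A->n; ++j)                /* y += A' z */
        for (k = A->p[j]; k < A->p[j + 1]; ++k)
          y[j] += A->x[k] * z[A->i[k]];
    }
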
@@ -127,15 +230,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
 
    Ag->n = A->n;
    Ag->m = A->m;
-   Ag->Annz = A->p[A->n];
+   Ag->nnz = A->p[A->n];
    Ag->descr = 0;
-   /* Matrix description */
-   cusparseCreateMatDescr(&Ag->descr);
-   cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-   cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
-   p->Ag = Ag;
-   p->Agt = SCS_NULL;
-
    cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
    cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
    cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
@@ -144,10 +240,10 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
    cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
    cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
    cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
-   cudaMalloc((void **)&p->tmp_m,
-              A->m * sizeof(scs_float)); /* intermediate result */
+   cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
    cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
    cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
 
    cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
               cudaMemcpyHostToDevice);
@@ -155,32 +251,94 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
    cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
               cudaMemcpyHostToDevice);
 
-   get_preconditioner(A, stgs, p);
+   p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
+   for (i = 0; i < A->m; ++i)
+     p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
+   cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
+              cudaMemcpyHostToDevice);
+
+   cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
+                     SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+                     CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+
+   if (P) {
+     Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+     P_full = fill_p_matrix(P);
+     Pg->n = P_full->n;
+     Pg->m = P_full->m;
+     Pg->nnz = P_full->p[P_full->n];
+     Pg->descr = 0;
+     cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
+     cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
+     cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
+
+     cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
+                cudaMemcpyHostToDevice);
+     cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
+                cudaMemcpyHostToDevice);
+     cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
+                cudaMemcpyHostToDevice);
+
+     cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
+                       SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+                       CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+
+     SCS(cs_spfree)(P_full);
+   } else {
+     Pg = SCS_NULL;
+   }
+
+   p->Ag = Ag;
+   p->Pg = Pg;
+   p->Agt = SCS_NULL;
+
+   /* we initialize with tmp_m but always overwrite it so it doesn't matter */
+   cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
+   cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
+   cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
+
+   set_preconditioner(p, rho_y_vec);
 
  #if GPU_TRANSPOSE_MAT > 0
    p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
    p->Agt->n = A->m;
    p->Agt->m = A->n;
-   p->Agt->Annz = A->p[A->n];
+   p->Agt->nnz = A->p[A->n];
    p->Agt->descr = 0;
    /* Matrix description */
-   cusparseCreateMatDescr(&p->Agt->descr);
-   cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-   cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
 
    cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
    cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
    cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
    /* transpose Ag into Agt for faster multiplies */
    /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
-   CUSPARSE(csr2csc)
-   (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
-    p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
+   cusparseCsr2cscEx2_bufferSize(
+       p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
+       p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
+       CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);
+
+   if (new_buffer_size > p->buffer_size) {
+     if (p->buffer != SCS_NULL) {
+       cudaFree(p->buffer);
+     }
+     cudaMalloc(&p->buffer, new_buffer_size);
+     p->buffer_size = new_buffer_size;
+   }
+
+   cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
+                      Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
+                      CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
+                      SCS_CSR2CSC_ALG, p->buffer);
+
+   cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
+                     p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
+                     SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
+                     SCS_CUDA_FLOAT);
  #endif
 
    err = cudaGetLastError();
    if (err != cudaSuccess) {
-     printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
+     printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
             cudaGetErrorString(err));
      SCS(free_lin_sys_work)(p);
      return SCS_NULL;
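
Editor's sketch: the accum_by_*_gpu calls threaded through this file are wrappers defined in linsys/gpu/gpu.c (also changed in this release); the snippet below is a rough guess at the cuSPARSE generic-API pattern such a wrapper follows, not the gem's actual code. It assumes CUDA 10.1+ (cusparseSpMV; the alg enum is CUSPARSE_MV_ALG_DEFAULT on older toolkits) and reuses the grow-only scratch-buffer idiom shown above; the helper name spmv_accum is ours.

    /* y += A * x via cusparseSpMV, growing the shared scratch buffer on demand */
    static void spmv_accum(cusparseHandle_t handle, cusparseSpMatDescr_t A,
                           cusparseDnVecDescr_t x, cusparseDnVecDescr_t y,
                           size_t *buffer_size, void **buffer) {
      scs_float onef = 1.0;
      size_t new_size = 0;
      /* phase 1: query required scratch size */
      cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &onef, A,
                              x, &onef, y, SCS_CUDA_FLOAT,
                              CUSPARSE_SPMV_ALG_DEFAULT, &new_size);
      if (new_size > *buffer_size) { /* grow-only, as in init_lin_sys_work */
        if (*buffer) {
          cudaFree(*buffer);
        }
        cudaMalloc(buffer, new_size);
        *buffer_size = new_size;
      }
      /* phase 2: alpha = beta = 1 accumulates into y */
      cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &onef, A, x, &onef,
                   y, SCS_CUDA_FLOAT, CUSPARSE_SPMV_ALG_DEFAULT, *buffer);
    }
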
@@ -188,117 +346,173 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
    return p;
  }
 
- static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
-                                   scs_float *z, scs_float *r, scs_int n) {
-   cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
-   CUBLAS(tbmv)
-   (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
-    0, M, 1, z, 1);
- }
-
- /* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
- static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
-                    ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
+ /* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored
+  * in b */
+ /* on GPU */
+ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
                     scs_int max_its, scs_float tol) {
-   scs_int i, n = A->n;
-   scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
+   scs_int i, n = pr->n;
+   scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
    scs_float onef = 1.0, neg_onef = -1.0;
    scs_float *p = pr->p;   /* cg direction */
    scs_float *Gp = pr->Gp; /* updated CG direction */
    scs_float *r = pr->r;   /* cg residual */
    scs_float *z = pr->z;   /* preconditioned */
-   scs_float *M = pr->M;   /* preconditioner */
    cublasHandle_t cublas_handle = pr->cublas_handle;
 
-   if (s == SCS_NULL) {
+   if (!s) {
+     /* take s = 0 */
+     /* r = b */
      cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+     /* b = 0 */
      cudaMemset(bg, 0, n * sizeof(scs_float));
    } else {
      /* p contains bg temporarily */
      cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
-     /* bg contains s */
+     /* bg = s */
      cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
-     mat_vec(A, stgs, pr, bg, r);
+     /* r = Mat * s */
+     mat_vec(pr, bg, r);
+     /* r = Mat * s - b */
      CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
+     /* r = b - Mat * s */
      CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
    }
 
-   /* for some reason nrm2 is VERY slow */
-   /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
-   CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
-   nrm_r = SQRTF(nrm_r);
    /* check to see if we need to run CG at all */
-   if (nrm_r < MIN(tol, 1e-18)) {
+   if (cg_gpu_norm(cublas_handle, r, n) < tol) {
      return 0;
    }
 
-   apply_pre_conditioner(cublas_handle, M, z, r, n);
-   CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
-   /* put z in p, replacing temp mem */
+   /* z = M r */
+   cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+   scale_by_diag(cublas_handle, pr->M, z, n);
+   /* ztr = z'r */
+   CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
+   /* p = z */
    cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
 
    for (i = 0; i < max_its; ++i) {
-     mat_vec(A, stgs, pr, p, Gp);
-
-     CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
-
-     alpha = ipzr / p_gp;
+     /* Gp = Mat * p */
+     mat_vec(pr, p, Gp);
+     /* ptGp = p'Gp */
+     CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
+     /* alpha = z'r / p'G p */
+     alpha = ztr / ptGp;
      neg_alpha = -alpha;
-
+     /* b += alpha * p */
      CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
+     /* r -= alpha * G p */
      CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
 
-     /* for some reason nrm2 is VERY slow */
-     /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
-     CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
-     nrm_r = SQRTF(nrm_r);
-     if (nrm_r < tol) {
-       i++;
-       break;
-     }
-     ipzr_old = ipzr;
-     apply_pre_conditioner(cublas_handle, M, z, r, n);
-     CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+ #if VERBOSITY > 3
+     scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
+                cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
+ #endif
 
-     beta = ipzr / ipzr_old;
+     if (cg_gpu_norm(cublas_handle, r, n) < tol) {
+       return i + 1;
+     }
+     /* z = M r */
+     cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+     scale_by_diag(cublas_handle, pr->M, z, n);
+     ztr_prev = ztr;
+     /* ztr = z'r */
+     CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
+     beta = ztr / ztr_prev;
+     /* p = beta * p, where beta = ztr / ztr_prev */
      CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
+     /* p = z + beta * p */
      CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
    }
- #if EXTRA_VERBOSE > 0
-   scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
- #endif
    return i;
  }
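
Editor's note: the rewritten pcg is textbook preconditioned conjugate gradient applied to $G = \rho_x I + P + A^\top R_y^{-1} A$ with the diagonal preconditioner $M$ built above; one pass of the loop, in the notation of the inline comments:

    $\alpha_k = \dfrac{z_k^\top r_k}{p_k^\top G p_k}, \qquad x_{k+1} = x_k + \alpha_k p_k, \qquad r_{k+1} = r_k - \alpha_k G p_k,$
    $z_{k+1} = M r_{k+1}, \qquad \beta_k = \dfrac{z_{k+1}^\top r_{k+1}}{z_k^\top r_k}, \qquad p_{k+1} = z_{k+1} + \beta_k p_k.$

Convergence is now judged by cg_gpu_norm ($\ell_\infty$ by default, $\ell_2$ if USE_L2_NORM is set) against the caller-supplied tol, replacing the old per-iteration CG_MIN_TOL / iter^cg_rate schedule.
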
 
- scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
-                            ScsLinSysWork *p, scs_float *b, const scs_float *s,
-                            scs_int iter) {
-   scs_int cg_its;
-   SCS(timer) linsys_timer;
-   scs_float *bg = p->bg;
+ /* solves Mx = b, for x but stores result in b */
+ /* s contains warm-start (if available) */
+ /*
+  * [x] = [rho_x I + P     A'  ]^{-1} [rx]
+  * [y]   [     A         -R_y ]      [ry]
+  *
+  * R_y = diag(rho_y_vec)
+  *
+  * becomes:
+  *
+  * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
+  * y = R_y^{-1} (Ax - ry)
+  *
+  */
+ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
+                            scs_float tol) {
+   scs_int cg_its, max_iters;
    scs_float neg_onef = -1.0;
+
+   /* these are on GPU */
+   scs_float *bg = p->bg;
+   scs_float *tmp_m = p->tmp_m;
    ScsGpuMatrix *Ag = p->Ag;
-   scs_float cg_tol =
-       SCS(norm)(b, Ag->n) *
-       (iter < 0 ? CG_BEST_TOL
-                 : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
-   SCS(tic)(&linsys_timer);
-   /* all on GPU */
-   cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
-   SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
-   /* solves (I+A'A)x = b, s warm start, solution stored in b */
-   cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
-   CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
-   SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
-   cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
+   ScsGpuMatrix *Pg = p->Pg;
 
-   if (iter >= 0) {
-     p->tot_cg_its += cg_its;
+   if (CG_NORM(b, p->n + p->m) <= 1e-12) {
+     memset(b, 0, (p->n + p->m) * sizeof(scs_float));
+     return 0;
+   }
+
+   if (tol <= 0.) {
+     scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
+                "INDIRECT flag.\n",
+                tol);
    }
 
-   p->total_solve_time += SCS(tocq)(&linsys_timer);
- #if EXTRAVERBOSE > 0
-   scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
+   /* bg = b = [rx; ry] */
+   cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
+              cudaMemcpyHostToDevice);
+   /* tmp = ry */
+   cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
+              cudaMemcpyDeviceToDevice);
+   /* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
+   scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
+
+   cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R_y^{-1} ry */
+   cusparseDnVecSetValues(p->dn_vec_n, (void *)bg);    /* rx */
+   /* bg[:n] = rx + A' R_y^{-1} ry */
+   SCS(accum_by_atrans_gpu)
+   (Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
+    &p->buffer);
+
+   /* set max_iters to 10 * n (though in theory n is enough for any tol) */
+   max_iters = 10 * Ag->n;
+
+   /* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
+    * in bg */
+   cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
+
+   /* bg[n:] = -ry */
+   CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
+   cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
+   cusparseDnVecSetValues(p->dn_vec_n, (void *)bg);           /* x */
+
+   /* b[n:] = Ax - ry */
+ #if GPU_TRANSPOSE_MAT > 0
+   SCS(accum_by_atrans_gpu)
+   (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+    &p->buffer);
+ #else
+   SCS(accum_by_a_gpu)
+   (Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+    &p->buffer);
+ #endif
+
+   /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
+   scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
+
+   /* copy bg = [x; y] back to b */
+   cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
+              cudaMemcpyDeviceToHost);
+   p->tot_cg_its += cg_its;
+ #if VERBOSITY > 1
+   scs_printf("tol %.3e\n", tol);
+   scs_printf("cg_its %i\n", (int)cg_its);
  #endif
    return 0;
  }
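
Editor's sketch: taken together, the new linear-system interface this file implements is driven roughly as follows. The signatures come straight from the diff above; the driver function solve_once and the constant values are ours, and constructing A, P, b, and rho_y_vec is elided.

    /* minimal driver sketch; all SCS(...) names are from the diff above */
    void solve_once(const ScsMatrix *A, const ScsMatrix *P /* may be SCS_NULL */,
                    scs_float *rho_y_vec,
                    scs_float *b /* [rx; ry] on entry, [x; y] on exit */) {
      scs_float rho_x = 1e-6; /* illustrative value */
      ScsLinSysWork *w = SCS(init_lin_sys_work)(A, P, rho_y_vec, rho_x);
      if (!w) {
        return; /* init prints its own CUDA error */
      }
      /* SCS_NULL = no warm start; tol is the CG tolerance */
      SCS(solve_lin_sys)(w, b, SCS_NULL, 1e-7);
      /* if the solver rescales rho_y_vec, refresh R_y^{-1} + preconditioner */
      SCS(update_lin_sys_rho_y_vec)(w, rho_y_vec);
      SCS(free_lin_sys_work)(w);
    }
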