scs 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +17 -0
  3. data/LICENSE.txt +18 -18
  4. data/README.md +28 -9
  5. data/ext/scs/extconf.rb +29 -0
  6. data/lib/scs/ffi.rb +30 -13
  7. data/lib/scs/solver.rb +32 -14
  8. data/lib/scs/version.rb +1 -1
  9. data/vendor/scs/CITATION.cff +39 -0
  10. data/vendor/scs/CMakeLists.txt +272 -0
  11. data/vendor/scs/Makefile +24 -15
  12. data/vendor/scs/README.md +8 -216
  13. data/vendor/scs/include/aa.h +67 -23
  14. data/vendor/scs/include/cones.h +17 -17
  15. data/vendor/scs/include/glbopts.h +98 -32
  16. data/vendor/scs/include/linalg.h +2 -4
  17. data/vendor/scs/include/linsys.h +58 -44
  18. data/vendor/scs/include/normalize.h +3 -3
  19. data/vendor/scs/include/rw.h +8 -2
  20. data/vendor/scs/include/scs.h +293 -133
  21. data/vendor/scs/include/util.h +3 -15
  22. data/vendor/scs/linsys/cpu/direct/private.c +220 -224
  23. data/vendor/scs/linsys/cpu/direct/private.h +13 -7
  24. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  25. data/vendor/scs/linsys/cpu/indirect/private.c +177 -110
  26. data/vendor/scs/linsys/cpu/indirect/private.h +8 -4
  27. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  28. data/vendor/scs/linsys/csparse.c +87 -0
  29. data/vendor/scs/linsys/csparse.h +34 -0
  30. data/vendor/scs/linsys/csparse.o +0 -0
  31. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +1 -1
  32. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  33. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  34. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  35. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  36. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  37. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  38. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  39. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  40. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  41. data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
  42. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  43. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  44. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  45. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  46. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  47. data/vendor/scs/linsys/external/qdldl/changes +2 -0
  48. data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
  49. data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
  50. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  51. data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
  52. data/vendor/scs/linsys/gpu/gpu.c +58 -21
  53. data/vendor/scs/linsys/gpu/gpu.h +66 -28
  54. data/vendor/scs/linsys/gpu/indirect/private.c +368 -154
  55. data/vendor/scs/linsys/gpu/indirect/private.h +26 -12
  56. data/vendor/scs/linsys/scs_matrix.c +498 -0
  57. data/vendor/scs/linsys/scs_matrix.h +70 -0
  58. data/vendor/scs/linsys/scs_matrix.o +0 -0
  59. data/vendor/scs/scs.mk +13 -9
  60. data/vendor/scs/src/aa.c +384 -109
  61. data/vendor/scs/src/aa.o +0 -0
  62. data/vendor/scs/src/cones.c +440 -353
  63. data/vendor/scs/src/cones.o +0 -0
  64. data/vendor/scs/src/ctrlc.c +15 -5
  65. data/vendor/scs/src/ctrlc.o +0 -0
  66. data/vendor/scs/src/linalg.c +84 -28
  67. data/vendor/scs/src/linalg.o +0 -0
  68. data/vendor/scs/src/normalize.c +22 -64
  69. data/vendor/scs/src/normalize.o +0 -0
  70. data/vendor/scs/src/rw.c +161 -22
  71. data/vendor/scs/src/rw.o +0 -0
  72. data/vendor/scs/src/scs.c +768 -561
  73. data/vendor/scs/src/scs.o +0 -0
  74. data/vendor/scs/src/scs_indir.o +0 -0
  75. data/vendor/scs/src/scs_version.c +9 -3
  76. data/vendor/scs/src/scs_version.o +0 -0
  77. data/vendor/scs/src/util.c +37 -106
  78. data/vendor/scs/src/util.o +0 -0
  79. data/vendor/scs/test/minunit.h +17 -8
  80. data/vendor/scs/test/problem_utils.h +176 -14
  81. data/vendor/scs/test/problems/degenerate.h +130 -0
  82. data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
  83. data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
  84. data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
  85. data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
  86. data/vendor/scs/test/problems/random_prob +0 -0
  87. data/vendor/scs/test/problems/random_prob.h +45 -0
  88. data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
  89. data/vendor/scs/test/problems/small_lp.h +13 -14
  90. data/vendor/scs/test/problems/test_fails.h +43 -0
  91. data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
  92. data/vendor/scs/test/random_socp_prob.c +54 -53
  93. data/vendor/scs/test/rng.h +109 -0
  94. data/vendor/scs/test/run_from_file.c +19 -10
  95. data/vendor/scs/test/run_tests.c +27 -3
  96. metadata +30 -73
  97. data/ext/scs/Rakefile +0 -11
  98. data/vendor/scs/linsys/amatrix.c +0 -305
  99. data/vendor/scs/linsys/amatrix.h +0 -36
  100. data/vendor/scs/linsys/amatrix.o +0 -0
  101. data/vendor/scs/test/data/small_random_socp +0 -0
  102. data/vendor/scs/test/problems/small_random_socp.h +0 -33
  103. data/vendor/scs/test/run_tests +0 -2
@@ -1,54 +1,89 @@
1
1
  #include "private.h"
2
+ #include "linsys.h"
2
3
 
3
- #define CG_BEST_TOL 1e-9
4
- #define CG_MIN_TOL 1e-1
5
-
6
- /* do not use within pcg, reuses memory */
7
- void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
8
- const scs_float *x, scs_float *y) {
9
- scs_float *v_m = p->tmp_m;
10
- scs_float *v_n = p->r;
11
- cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
12
- cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
13
- SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
14
- cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
15
- }
4
+ /* norm to use when deciding convergence */
5
+ /* should be consistent with CG_NORM in glbopts.h */
6
+ #define USE_L2_NORM (0)
16
7
 
17
- /* do not use within pcg, reuses memory */
18
- void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
19
- scs_float *y) {
20
- scs_float *v_m = p->tmp_m;
21
- scs_float *v_n = p->r;
22
- cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
23
- cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
24
- #if GPU_TRANSPOSE_MAT > 0
25
- SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
8
+ static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
9
+ scs_int n) {
10
+ #if USE_L2_NORM > 0
11
+ scs_float nrm;
12
+ CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
26
13
  #else
27
- SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
14
+ scs_int idx;
15
+ scs_float nrm;
16
+ CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
17
+ /* NOTE: we take idx -1 here since the routine above returns Fortran idxs */
18
+ cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
19
+ nrm = ABS(nrm);
28
20
  #endif
29
- cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
21
+ return nrm;
30
22
  }
31
23
 
32
- char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
33
- char *str = (char *)scs_malloc(sizeof(char) * 128);
34
- sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
35
- (long)A->p[A->n], stgs->cg_rate);
36
- return str;
24
+ const char *SCS(get_lin_sys_method)() {
25
+ return "sparse-indirect GPU";
37
26
  }
38
27
 
28
+ /*
39
29
  char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
40
30
  char *str = (char *)scs_malloc(sizeof(char) * 128);
41
- sprintf(str,
42
- "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
43
- (scs_float)p->tot_cg_its / (info->iter + 1),
44
- p->total_solve_time / (info->iter + 1) / 1e3);
31
+ sprintf(str, "lin-sys: avg cg its: %2.2f\n",
32
+ (scs_float)p->tot_cg_its / (info->iter + 1));
45
33
  p->tot_cg_its = 0;
46
- p->total_solve_time = 0;
47
34
  return str;
48
35
  }
36
+ */
37
+
38
+ /* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
39
+ static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
40
+ scs_int i, k;
41
+ const ScsMatrix *A = p->A;
42
+ const ScsMatrix *P = p->P;
43
+ scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
44
+
45
+ #if VERBOSITY > 0
46
+ scs_printf("getting pre-conditioner\n");
47
+ #endif
48
+
49
+ for (i = 0; i < A->n; ++i) { /* cols */
50
+ M[i] = p->rho_x;
51
+ /* diag(A' R_y^{-1} A) */
52
+ for (k = A->p[i]; k < A->p[i + 1]; ++k) {
53
+ /* A->i[k] is row of entry k with value A->x[k] */
54
+ M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
55
+ }
56
+ if (P) {
57
+ for (k = P->p[i]; k < P->p[i + 1]; k++) {
58
+ /* diagonal element only */
59
+ if (P->i[k] == i) { /* row == col */
60
+ M[i] += P->x[k];
61
+ break;
62
+ }
63
+ }
64
+ }
65
+ M[i] = 1. / M[i];
66
+ }
67
+ cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
68
+ scs_free(M);
69
+ #if VERBOSITY > 0
70
+ scs_printf("finished getting pre-conditioner\n");
71
+ #endif
72
+ }
73
+
74
+ /* no need to update anything in this case */
75
+ void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
76
+ scs_int i;
77
+ for (i = 0; i < p->m; ++i)
78
+ p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
79
+ cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
80
+ cudaMemcpyHostToDevice);
81
+ set_preconditioner(p, rho_y_vec);
82
+ }
49
83
 
50
84
  void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
51
85
  if (p) {
86
+ scs_free(p->inv_rho_y_vec);
52
87
  cudaFree(p->p);
53
88
  cudaFree(p->r);
54
89
  cudaFree(p->Gp);
@@ -56,6 +91,11 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
56
91
  cudaFree(p->tmp_m);
57
92
  cudaFree(p->z);
58
93
  cudaFree(p->M);
94
+ cudaFree(p->inv_rho_y_vec_gpu);
95
+ if (p->Pg) {
96
+ SCS(free_gpu_matrix)(p->Pg);
97
+ scs_free(p->Pg);
98
+ }
59
99
  if (p->Ag) {
60
100
  SCS(free_gpu_matrix)(p->Ag);
61
101
  scs_free(p->Ag);
@@ -64,6 +104,12 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
64
104
  SCS(free_gpu_matrix)(p->Agt);
65
105
  scs_free(p->Agt);
66
106
  }
107
+ if (p->buffer != SCS_NULL) {
108
+ cudaFree(p->buffer);
109
+ }
110
+ cusparseDestroyDnVec(p->dn_vec_m);
111
+ cusparseDestroyDnVec(p->dn_vec_n);
112
+ cusparseDestroyDnVec(p->dn_vec_n_p);
67
113
  cusparseDestroy(p->cusparse_handle);
68
114
  cublasDestroy(p->cublas_handle);
69
115
  /* Don't reset because it interferes with other GPU programs. */
@@ -72,53 +118,110 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
72
118
  }
73
119
  }
74
120
 
75
- /*y = (RHO_X * I + A'A)x */
76
- static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
77
- ScsLinSysWork *p, const scs_float *x, scs_float *y) {
78
- /* x and y MUST already be loaded to GPU */
79
- scs_float *tmp_m = p->tmp_m; /* temp memory */
80
- cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
81
- SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
82
- cudaMemset(y, 0, A->n * sizeof(scs_float));
83
- SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
84
- CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
121
+ /* z = M * z elementwise in place, assumes M, z on GPU */
122
+ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
123
+ scs_float *z, scs_int n) {
124
+ CUBLAS(tbmv)
125
+ (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
126
+ 0, M, 1, z, 1);
85
127
  }
86
128
 
87
- /* M = inv ( diag ( RHO_X * I + A'A ) ) */
88
- static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
89
- ScsLinSysWork *p) {
90
- scs_int i;
91
- scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
129
+ /* y = (rho_x * I + P + A' R_y^{-1} A) x */
130
+ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
131
+ /* x and y MUST already be loaded to GPU */
132
+ scs_float *z = p->tmp_m; /* temp memory */
133
+ cudaMemset(y, 0, p->n * sizeof(scs_float));
134
+ cudaMemset(z, 0, p->m * sizeof(scs_float));
135
+
136
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
137
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
138
+ cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
139
+
140
+ /* y = rho_x * x */
141
+ CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
142
+
143
+ if (p->Pg) {
144
+ /* y = rho_x * x + Px */
145
+ SCS(accum_by_p_gpu)
146
+ (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
147
+ &p->buffer);
148
+ }
92
149
 
93
- #if EXTRA_VERBOSE > 0
94
- scs_printf("getting pre-conditioner\n");
150
+ /* z = Ax */
151
+ #if GPU_TRANSPOSE_MAT > 0
152
+ SCS(accum_by_atrans_gpu)
153
+ (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
154
+ &p->buffer);
155
+ #else
156
+ SCS(accum_by_a_gpu)
157
+ (p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
158
+ &p->buffer);
95
159
  #endif
160
+ /* z = R_y^{-1} A x */
161
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
96
162
 
97
- for (i = 0; i < A->n; ++i) {
98
- M[i] = 1 / (stgs->rho_x +
99
- SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
100
- /* M[i] = 1; */
101
- }
102
- cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
103
- scs_free(M);
163
+ /* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
164
+ SCS(accum_by_atrans_gpu)
165
+ (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
166
+ &p->buffer);
167
+ }
104
168
 
105
- #if EXTRA_VERBOSE > 0
106
- scs_printf("finished getting pre-conditioner\n");
107
- #endif
169
+ /* P comes in upper triangular, expand to full
170
+ * First compute triplet version of full matrix, then compress to csc
171
+ * */
172
+ static csc *fill_p_matrix(const ScsMatrix *P) {
173
+ scs_int i, j, k, kk;
174
+ scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
175
+ csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
176
+ csc *P_full;
177
+ kk = 0;
178
+ for (j = 0; j < P->n; j++) { /* cols */
179
+ for (k = P->p[j]; k < P->p[j + 1]; k++) {
180
+ i = P->i[k]; /* row */
181
+ if (i > j) { /* only upper triangular needed */
182
+ break;
183
+ }
184
+ P_tmp->i[kk] = i;
185
+ P_tmp->p[kk] = j;
186
+ P_tmp->x[kk] = P->x[k];
187
+ kk++;
188
+ if (i == j) { /* diagonal */
189
+ continue;
190
+ }
191
+ P_tmp->i[kk] = j;
192
+ P_tmp->p[kk] = i;
193
+ P_tmp->x[kk] = P->x[k];
194
+ kk++;
195
+ }
196
+ }
197
+ P_tmp->nz = kk; /* set number of nonzeros */
198
+ P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
199
+ SCS(cs_spfree)(P_tmp);
200
+ return P_full;
108
201
  }
109
202
 
110
- ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
111
- const ScsSettings *stgs) {
203
+ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
204
+ scs_float *rho_y_vec, scs_float rho_x) {
112
205
  cudaError_t err;
206
+ scs_int i;
207
+ csc *P_full;
113
208
  ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
114
- ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
209
+ ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
210
+ ScsGpuMatrix *Pg = SCS_NULL;
115
211
 
212
+ #if GPU_TRANSPOSE_MAT > 0
213
+ size_t new_buffer_size = 0;
214
+ #endif
215
+
216
+ p->rho_x = rho_x;
116
217
  p->cublas_handle = 0;
117
218
  p->cusparse_handle = 0;
118
219
 
119
- p->total_solve_time = 0;
120
220
  p->tot_cg_its = 0;
121
221
 
222
+ p->buffer_size = 0;
223
+ p->buffer = SCS_NULL;
224
+
122
225
  /* Get handle to the CUBLAS context */
123
226
  cublasCreate(&p->cublas_handle);
124
227
 
@@ -127,15 +230,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
127
230
 
128
231
  Ag->n = A->n;
129
232
  Ag->m = A->m;
130
- Ag->Annz = A->p[A->n];
233
+ Ag->nnz = A->p[A->n];
131
234
  Ag->descr = 0;
132
- /* Matrix description */
133
- cusparseCreateMatDescr(&Ag->descr);
134
- cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
135
- cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
136
- p->Ag = Ag;
137
- p->Agt = SCS_NULL;
138
-
139
235
  cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
140
236
  cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
141
237
  cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
@@ -144,10 +240,10 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
144
240
  cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
145
241
  cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
146
242
  cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
147
- cudaMalloc((void **)&p->tmp_m,
148
- A->m * sizeof(scs_float)); /* intermediate result */
243
+ cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
149
244
  cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
150
245
  cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
246
+ cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
151
247
 
152
248
  cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
153
249
  cudaMemcpyHostToDevice);
@@ -155,32 +251,94 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
155
251
  cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
156
252
  cudaMemcpyHostToDevice);
157
253
 
158
- get_preconditioner(A, stgs, p);
254
+ p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
255
+ for (i = 0; i < A->m; ++i)
256
+ p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
257
+ cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
258
+ cudaMemcpyHostToDevice);
259
+
260
+ cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
261
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
262
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
263
+
264
+ if (P) {
265
+ Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
266
+ P_full = fill_p_matrix(P);
267
+ Pg->n = P_full->n;
268
+ Pg->m = P_full->m;
269
+ Pg->nnz = P_full->p[P_full->n];
270
+ Pg->descr = 0;
271
+ cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
272
+ cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
273
+ cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
274
+
275
+ cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
276
+ cudaMemcpyHostToDevice);
277
+ cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
278
+ cudaMemcpyHostToDevice);
279
+ cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
280
+ cudaMemcpyHostToDevice);
281
+
282
+ cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
283
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
284
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
285
+
286
+ SCS(cs_spfree)(P_full);
287
+ } else {
288
+ Pg = SCS_NULL;
289
+ }
290
+
291
+ p->Ag = Ag;
292
+ p->Pg = Pg;
293
+ p->Agt = SCS_NULL;
294
+
295
+ /* we initialize with tmp_m but always overwrite it so it doesn't matter */
296
+ cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
297
+ cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
298
+ cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
299
+
300
+ set_preconditioner(p, rho_y_vec);
159
301
 
160
302
  #if GPU_TRANSPOSE_MAT > 0
161
303
  p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
162
304
  p->Agt->n = A->m;
163
305
  p->Agt->m = A->n;
164
- p->Agt->Annz = A->p[A->n];
306
+ p->Agt->nnz = A->p[A->n];
165
307
  p->Agt->descr = 0;
166
308
  /* Matrix description */
167
- cusparseCreateMatDescr(&p->Agt->descr);
168
- cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
169
- cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
170
309
 
171
310
  cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
172
311
  cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
173
312
  cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
174
313
  /* transpose Ag into Agt for faster multiplies */
175
314
  /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
176
- CUSPARSE(csr2csc)
177
- (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
178
- p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
315
+ cusparseCsr2cscEx2_bufferSize(
316
+ p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
317
+ p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
318
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);
319
+
320
+ if (new_buffer_size > p->buffer_size) {
321
+ if (p->buffer != SCS_NULL) {
322
+ cudaFree(p->buffer);
323
+ }
324
+ cudaMalloc(&p->buffer, new_buffer_size);
325
+ p->buffer_size = new_buffer_size;
326
+ }
327
+
328
+ cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
329
+ Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
330
+ CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
331
+ SCS_CSR2CSC_ALG, p->buffer);
332
+
333
+ cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
334
+ p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
335
+ SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
336
+ SCS_CUDA_FLOAT);
179
337
  #endif
180
338
 
181
339
  err = cudaGetLastError();
182
340
  if (err != cudaSuccess) {
183
- printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
341
+ printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
184
342
  cudaGetErrorString(err));
185
343
  SCS(free_lin_sys_work)(p);
186
344
  return SCS_NULL;
@@ -188,117 +346,173 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
188
346
  return p;
189
347
  }
190
348
 
191
- static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
192
- scs_float *z, scs_float *r, scs_int n) {
193
- cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
194
- CUBLAS(tbmv)
195
- (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
196
- 0, M, 1, z, 1);
197
- }
198
-
199
- /* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
200
- static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
201
- ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
349
+ /* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
350
+ * b */
351
+ /* on GPU */
352
+ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
202
353
  scs_int max_its, scs_float tol) {
203
- scs_int i, n = A->n;
204
- scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
354
+ scs_int i, n = pr->n;
355
+ scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
205
356
  scs_float onef = 1.0, neg_onef = -1.0;
206
357
  scs_float *p = pr->p; /* cg direction */
207
358
  scs_float *Gp = pr->Gp; /* updated CG direction */
208
359
  scs_float *r = pr->r; /* cg residual */
209
360
  scs_float *z = pr->z; /* preconditioned */
210
- scs_float *M = pr->M; /* preconditioner */
211
361
  cublasHandle_t cublas_handle = pr->cublas_handle;
212
362
 
213
- if (s == SCS_NULL) {
363
+ if (!s) {
364
+ /* take s = 0 */
365
+ /* r = b */
214
366
  cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
367
+ /* b = 0 */
215
368
  cudaMemset(bg, 0, n * sizeof(scs_float));
216
369
  } else {
217
370
  /* p contains bg temporarily */
218
371
  cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
219
- /* bg contains s */
372
+ /* bg = s */
220
373
  cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
221
- mat_vec(A, stgs, pr, bg, r);
374
+ /* r = Mat * s */
375
+ mat_vec(pr, bg, r);
376
+ /* r = Mat * s - b */
222
377
  CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
378
+ /* r = b - Mat * s */
223
379
  CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
224
380
  }
225
381
 
226
- /* for some reason nrm2 is VERY slow */
227
- /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
228
- CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
229
- nrm_r = SQRTF(nrm_r);
230
382
  /* check to see if we need to run CG at all */
231
- if (nrm_r < MIN(tol, 1e-18)) {
383
+ if (cg_gpu_norm(cublas_handle, r, n) < tol) {
232
384
  return 0;
233
385
  }
234
386
 
235
- apply_pre_conditioner(cublas_handle, M, z, r, n);
236
- CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
237
- /* put z in p, replacing temp mem */
387
+ /* z = M r */
388
+ cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
389
+ scale_by_diag(cublas_handle, pr->M, z, n);
390
+ /* ztr = z'r */
391
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
392
+ /* p = z */
238
393
  cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
239
394
 
240
395
  for (i = 0; i < max_its; ++i) {
241
- mat_vec(A, stgs, pr, p, Gp);
242
-
243
- CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
244
-
245
- alpha = ipzr / p_gp;
396
+ /* Gp = Mat * p */
397
+ mat_vec(pr, p, Gp);
398
+ /* ptGp = p'Gp */
399
+ CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
400
+ /* alpha = z'r / p'G p */
401
+ alpha = ztr / ptGp;
246
402
  neg_alpha = -alpha;
247
-
403
+ /* b += alpha * p */
248
404
  CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
405
+ /* r -= alpha * G p */
249
406
  CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
250
407
 
251
- /* for some reason nrm2 is VERY slow */
252
- /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
253
- CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
254
- nrm_r = SQRTF(nrm_r);
255
- if (nrm_r < tol) {
256
- i++;
257
- break;
258
- }
259
- ipzr_old = ipzr;
260
- apply_pre_conditioner(cublas_handle, M, z, r, n);
261
- CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
408
+ #if VERBOSITY > 3
409
+ scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
410
+ cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
411
+ #endif
262
412
 
263
- beta = ipzr / ipzr_old;
413
+ if (cg_gpu_norm(cublas_handle, r, n) < tol) {
414
+ return i + 1;
415
+ }
416
+ /* z = M r */
417
+ cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
418
+ scale_by_diag(cublas_handle, pr->M, z, n);
419
+ ztr_prev = ztr;
420
+ /* ztr = z'r */
421
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
422
+ beta = ztr / ztr_prev;
423
+ /* p = beta * p, where beta = ztr / ztr_prev */
264
424
  CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
425
+ /* p = z + beta * p */
265
426
  CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
266
427
  }
267
- #if EXTRA_VERBOSE > 0
268
- scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
269
- #endif
270
428
  return i;
271
429
  }
272
430
 
273
- scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
274
- ScsLinSysWork *p, scs_float *b, const scs_float *s,
275
- scs_int iter) {
276
- scs_int cg_its;
277
- SCS(timer) linsys_timer;
278
- scs_float *bg = p->bg;
431
+ /* solves Mx = b, for x but stores result in b */
432
+ /* s contains warm-start (if available) */
433
+ /*
434
+ * [x] = [rho_x I + P A' ]^{-1} [rx]
435
+ * [y] [ A -R_y ] [ry]
436
+ *
437
+ * R_y = diag(rho_y_vec)
438
+ *
439
+ * becomes:
440
+ *
441
+ * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
442
+ * y = R_y^{-1} (Ax - ry)
443
+ *
444
+ */
445
+ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
446
+ scs_float tol) {
447
+ scs_int cg_its, max_iters;
279
448
  scs_float neg_onef = -1.0;
449
+
450
+ /* these are on GPU */
451
+ scs_float *bg = p->bg;
452
+ scs_float *tmp_m = p->tmp_m;
280
453
  ScsGpuMatrix *Ag = p->Ag;
281
- scs_float cg_tol =
282
- SCS(norm)(b, Ag->n) *
283
- (iter < 0 ? CG_BEST_TOL
284
- : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
285
- SCS(tic)(&linsys_timer);
286
- /* all on GPU */
287
- cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
288
- SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
289
- /* solves (I+A'A)x = b, s warm start, solution stored in b */
290
- cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
291
- CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
292
- SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
293
- cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
454
+ ScsGpuMatrix *Pg = p->Pg;
294
455
 
295
- if (iter >= 0) {
296
- p->tot_cg_its += cg_its;
456
+ if (CG_NORM(b, p->n + p->m) <= 1e-12) {
457
+ memset(b, 0, (p->n + p->m) * sizeof(scs_float));
458
+ return 0;
459
+ }
460
+
461
+ if (tol <= 0.) {
462
+ scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
463
+ "INDIRECT flag.\n",
464
+ tol);
297
465
  }
298
466
 
299
- p->total_solve_time += SCS(tocq)(&linsys_timer);
300
- #if EXTRAVERBOSE > 0
301
- scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
467
+ /* bg = b = [rx; ry] */
468
+ cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
469
+ cudaMemcpyHostToDevice);
470
+ /* tmp = ry */
471
+ cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
472
+ cudaMemcpyDeviceToDevice);
473
+ /* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
474
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
475
+
476
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
477
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
478
+ /* bg[:n] = rx + A' R ry */
479
+ SCS(accum_by_atrans_gpu)
480
+ (Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
481
+ &p->buffer);
482
+
483
+ /* set max_iters to 10 * n (though in theory n is enough for any tol) */
484
+ max_iters = 10 * Ag->n;
485
+
486
+ /* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
487
+ * in bg */
488
+ cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
489
+
490
+ /* bg[n:] = -ry */
491
+ CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
492
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
493
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* x */
494
+
495
+ /* b[n:] = Ax - ry */
496
+ #if GPU_TRANSPOSE_MAT > 0
497
+ SCS(accum_by_atrans_gpu)
498
+ (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
499
+ &p->buffer);
500
+ #else
501
+ SCS(accum_by_a_gpu)
502
+ (Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
503
+ &p->buffer);
504
+ #endif
505
+
506
+ /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
507
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
508
+
509
+ /* copy bg = [x; y] back to b */
510
+ cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
511
+ cudaMemcpyDeviceToHost);
512
+ p->tot_cg_its += cg_its;
513
+ #if VERBOSITY > 1
514
+ scs_printf("tol %.3e\n", tol);
515
+ scs_printf("cg_its %i\n", (int)cg_its);
302
516
  #endif
303
517
  return 0;
304
518
  }