scs 0.2.3 → 0.3.0

Files changed (100)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +11 -6
  4. data/lib/scs/ffi.rb +30 -13
  5. data/lib/scs/solver.rb +32 -9
  6. data/lib/scs/version.rb +1 -1
  7. data/vendor/scs/CITATION.cff +39 -0
  8. data/vendor/scs/CMakeLists.txt +7 -8
  9. data/vendor/scs/Makefile +24 -15
  10. data/vendor/scs/README.md +5 -263
  11. data/vendor/scs/include/aa.h +67 -23
  12. data/vendor/scs/include/cones.h +17 -17
  13. data/vendor/scs/include/glbopts.h +98 -32
  14. data/vendor/scs/include/linalg.h +2 -4
  15. data/vendor/scs/include/linsys.h +58 -44
  16. data/vendor/scs/include/normalize.h +3 -3
  17. data/vendor/scs/include/rw.h +8 -2
  18. data/vendor/scs/include/scs.h +293 -133
  19. data/vendor/scs/include/util.h +3 -15
  20. data/vendor/scs/linsys/cpu/direct/private.c +220 -224
  21. data/vendor/scs/linsys/cpu/direct/private.h +13 -7
  22. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  23. data/vendor/scs/linsys/cpu/indirect/private.c +177 -110
  24. data/vendor/scs/linsys/cpu/indirect/private.h +8 -4
  25. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  26. data/vendor/scs/linsys/csparse.c +87 -0
  27. data/vendor/scs/linsys/csparse.h +34 -0
  28. data/vendor/scs/linsys/csparse.o +0 -0
  29. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +1 -1
  30. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  31. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  32. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  33. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  34. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  35. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  36. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  37. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  38. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  39. data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
  40. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  41. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  42. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  43. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  44. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  45. data/vendor/scs/linsys/external/qdldl/changes +2 -0
  46. data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
  47. data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
  48. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  49. data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
  50. data/vendor/scs/linsys/gpu/gpu.c +31 -33
  51. data/vendor/scs/linsys/gpu/gpu.h +48 -31
  52. data/vendor/scs/linsys/gpu/indirect/private.c +338 -232
  53. data/vendor/scs/linsys/gpu/indirect/private.h +23 -14
  54. data/vendor/scs/linsys/scs_matrix.c +498 -0
  55. data/vendor/scs/linsys/scs_matrix.h +70 -0
  56. data/vendor/scs/linsys/scs_matrix.o +0 -0
  57. data/vendor/scs/scs.mk +13 -9
  58. data/vendor/scs/src/aa.c +384 -109
  59. data/vendor/scs/src/aa.o +0 -0
  60. data/vendor/scs/src/cones.c +440 -353
  61. data/vendor/scs/src/cones.o +0 -0
  62. data/vendor/scs/src/ctrlc.c +15 -5
  63. data/vendor/scs/src/ctrlc.o +0 -0
  64. data/vendor/scs/src/linalg.c +84 -28
  65. data/vendor/scs/src/linalg.o +0 -0
  66. data/vendor/scs/src/normalize.c +22 -64
  67. data/vendor/scs/src/normalize.o +0 -0
  68. data/vendor/scs/src/rw.c +160 -21
  69. data/vendor/scs/src/rw.o +0 -0
  70. data/vendor/scs/src/scs.c +767 -563
  71. data/vendor/scs/src/scs.o +0 -0
  72. data/vendor/scs/src/scs_indir.o +0 -0
  73. data/vendor/scs/src/scs_version.c +9 -3
  74. data/vendor/scs/src/scs_version.o +0 -0
  75. data/vendor/scs/src/util.c +37 -106
  76. data/vendor/scs/src/util.o +0 -0
  77. data/vendor/scs/test/minunit.h +17 -8
  78. data/vendor/scs/test/problem_utils.h +176 -14
  79. data/vendor/scs/test/problems/degenerate.h +130 -0
  80. data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
  81. data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
  82. data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
  83. data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
  84. data/vendor/scs/test/problems/random_prob +0 -0
  85. data/vendor/scs/test/problems/random_prob.h +45 -0
  86. data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
  87. data/vendor/scs/test/problems/small_lp.h +13 -14
  88. data/vendor/scs/test/problems/test_fails.h +43 -0
  89. data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
  90. data/vendor/scs/test/random_socp_prob.c +54 -53
  91. data/vendor/scs/test/rng.h +109 -0
  92. data/vendor/scs/test/run_from_file.c +19 -10
  93. data/vendor/scs/test/run_tests.c +27 -3
  94. metadata +20 -8
  95. data/vendor/scs/linsys/amatrix.c +0 -305
  96. data/vendor/scs/linsys/amatrix.h +0 -36
  97. data/vendor/scs/linsys/amatrix.o +0 -0
  98. data/vendor/scs/test/data/small_random_socp +0 -0
  99. data/vendor/scs/test/problems/small_random_socp.h +0 -33
  100. data/vendor/scs/test/run_tests +0 -2
data/vendor/scs/linsys/gpu/indirect/private.c
@@ -1,71 +1,89 @@
  #include "private.h"
+ #include "linsys.h"

- #define CG_BEST_TOL 1e-9
- #define CG_MIN_TOL 1e-1
-
- /* do not use within pcg, reuses memory */
- void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
- const scs_float *x, scs_float *y) {
- scs_float *v_m = p->tmp_m;
- scs_float *v_n = p->r;
- cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
- cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
-
- cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
- cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
- SCS(_accum_by_atrans_gpu)(
- p->Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
-
- cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
- }
-
- /* do not use within pcg, reuses memory */
- void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
- scs_float *y) {
- scs_float *v_m = p->tmp_m;
- scs_float *v_n = p->r;
- cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
- cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
+ /* norm to use when deciding convergence */
+ /* should be consistent with CG_NORM in glbopts.h */
+ #define USE_L2_NORM (0)

- cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
- cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
- #if GPU_TRANSPOSE_MAT > 0
- SCS(_accum_by_atrans_gpu)(
- p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
+ static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
+ scs_int n) {
+ #if USE_L2_NORM > 0
+ scs_float nrm;
+ CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
  #else
- SCS(_accum_by_a_gpu)(
- p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
+ scs_int idx;
+ scs_float nrm;
+ CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
+ /* NOTE: we take idx -1 here since the routine above returns Fortran idxs */
+ cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
+ nrm = ABS(nrm);
  #endif
-
- cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ return nrm;
  }

- char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
- char *str = (char *)scs_malloc(sizeof(char) * 128);
- sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
- (long)A->p[A->n], stgs->cg_rate);
- return str;
+ const char *SCS(get_lin_sys_method)() {
+ return "sparse-indirect GPU";
  }

+ /*
  char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
  char *str = (char *)scs_malloc(sizeof(char) * 128);
- sprintf(str,
- "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
- (scs_float)p->tot_cg_its / (info->iter + 1),
- p->total_solve_time / (info->iter + 1) / 1e3);
+ sprintf(str, "lin-sys: avg cg its: %2.2f\n",
+ (scs_float)p->tot_cg_its / (info->iter + 1));
  p->tot_cg_its = 0;
- p->total_solve_time = 0;
  return str;
  }
+ */
+
+ /* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
+ static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
+ scs_int i, k;
+ const ScsMatrix *A = p->A;
+ const ScsMatrix *P = p->P;
+ scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
+
+ #if VERBOSITY > 0
+ scs_printf("getting pre-conditioner\n");
+ #endif
+
+ for (i = 0; i < A->n; ++i) { /* cols */
+ M[i] = p->rho_x;
+ /* diag(A' R_y^{-1} A) */
+ for (k = A->p[i]; k < A->p[i + 1]; ++k) {
+ /* A->i[k] is row of entry k with value A->x[k] */
+ M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
+ }
+ if (P) {
+ for (k = P->p[i]; k < P->p[i + 1]; k++) {
+ /* diagonal element only */
+ if (P->i[k] == i) { /* row == col */
+ M[i] += P->x[k];
+ break;
+ }
+ }
+ }
+ M[i] = 1. / M[i];
+ }
+ cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+ scs_free(M);
+ #if VERBOSITY > 0
+ scs_printf("finished getting pre-conditioner\n");
+ #endif
+ }
+
+ /* no need to update anything in this case */
+ void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
+ scs_int i;
+ for (i = 0; i < p->m; ++i)
+ p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
+ cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
+ cudaMemcpyHostToDevice);
+ set_preconditioner(p, rho_y_vec);
+ }

  void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
  if (p) {
+ scs_free(p->inv_rho_y_vec);
  cudaFree(p->p);
  cudaFree(p->r);
  cudaFree(p->Gp);
@@ -73,6 +91,11 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
  cudaFree(p->tmp_m);
  cudaFree(p->z);
  cudaFree(p->M);
+ cudaFree(p->inv_rho_y_vec_gpu);
+ if (p->Pg) {
+ SCS(free_gpu_matrix)(p->Pg);
+ scs_free(p->Pg);
+ }
  if (p->Ag) {
  SCS(free_gpu_matrix)(p->Ag);
  scs_free(p->Ag);
@@ -86,6 +109,7 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
  }
  cusparseDestroyDnVec(p->dn_vec_m);
  cusparseDestroyDnVec(p->dn_vec_n);
+ cusparseDestroyDnVec(p->dn_vec_n_p);
  cusparseDestroy(p->cusparse_handle);
  cublasDestroy(p->cublas_handle);
  /* Don't reset because it interferes with other GPU programs. */
@@ -94,80 +118,105 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
  }
  }

- /*y = (RHO_X * I + A'A)x */
- static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
- ScsLinSysWork *p, const scs_float *x, scs_float *y) {
+ /* z = M * z elementwise in place, assumes M, z on GPU */
+ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
+ scs_float *z, scs_int n) {
+ CUBLAS(tbmv)
+ (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
+ 0, M, 1, z, 1);
+ }
+
+ /* y = (rho_x * I + P + A' R_y^{-1} A) x */
+ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
  /* x and y MUST already be loaded to GPU */
- scs_float *tmp_m = p->tmp_m; /* temp memory */
- cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
+ scs_float *z = p->tmp_m; /* temp memory */
+ cudaMemset(y, 0, p->n * sizeof(scs_float));
+ cudaMemset(z, 0, p->m * sizeof(scs_float));
+
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
+ cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
+
+ /* y = rho_x * x */
+ CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
+
+ if (p->Pg) {
+ /* y = rho_x * x + Px */
+ SCS(accum_by_p_gpu)
+ (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
+ }

- cusparseDnVecSetValues(p->dn_vec_m, (void *) tmp_m);
- cusparseDnVecSetValues(p->dn_vec_n, (void *) x);
+ /* z = Ax */
  #if GPU_TRANSPOSE_MAT > 0
- SCS(_accum_by_atrans_gpu)(
- p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
+ SCS(accum_by_atrans_gpu)
+ (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
  #else
- SCS(_accum_by_a_gpu)(
- A, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
+ SCS(accum_by_a_gpu)
+ (p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
  #endif
+ /* z = R_y^{-1} A x */
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);

- cudaMemset(y, 0, A->n * sizeof(scs_float));
-
- cusparseDnVecSetValues(p->dn_vec_m, (void *) tmp_m);
- cusparseDnVecSetValues(p->dn_vec_n, (void *) y);
- SCS(_accum_by_atrans_gpu)(
- A, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
- &p->buffer_size, &p->buffer
- );
-
- CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
+ /* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
+ SCS(accum_by_atrans_gpu)
+ (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
  }

- /* M = inv ( diag ( RHO_X * I + A'A ) ) */
- static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
- ScsLinSysWork *p) {
- scs_int i;
- scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
-
- #if EXTRA_VERBOSE > 0
- scs_printf("getting pre-conditioner\n");
- #endif
-
- for (i = 0; i < A->n; ++i) {
- M[i] = 1 / (stgs->rho_x +
- SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
- /* M[i] = 1; */
+ /* P comes in upper triangular, expand to full
+ * First compute triplet version of full matrix, then compress to csc
+ * */
+ static csc *fill_p_matrix(const ScsMatrix *P) {
+ scs_int i, j, k, kk;
+ scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
+ csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
+ csc *P_full;
+ kk = 0;
+ for (j = 0; j < P->n; j++) { /* cols */
+ for (k = P->p[j]; k < P->p[j + 1]; k++) {
+ i = P->i[k]; /* row */
+ if (i > j) { /* only upper triangular needed */
+ break;
+ }
+ P_tmp->i[kk] = i;
+ P_tmp->p[kk] = j;
+ P_tmp->x[kk] = P->x[k];
+ kk++;
+ if (i == j) { /* diagonal */
+ continue;
+ }
+ P_tmp->i[kk] = j;
+ P_tmp->p[kk] = i;
+ P_tmp->x[kk] = P->x[k];
+ kk++;
+ }
  }
- cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
- scs_free(M);
-
- #if EXTRA_VERBOSE > 0
- scs_printf("finished getting pre-conditioner\n");
- #endif
+ P_tmp->nz = kk; /* set number of nonzeros */
+ P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
+ SCS(cs_spfree)(P_tmp);
+ return P_full;
  }

- ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
- const ScsSettings *stgs) {
+ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
+ scs_float *rho_y_vec, scs_float rho_x) {
  cudaError_t err;
+ scs_int i;
+ csc *P_full;
  ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
- ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
-
- /* Used for initializing dense vectors */
- scs_float *tmp_null_n = SCS_NULL;
- scs_float *tmp_null_m = SCS_NULL;
+ ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+ ScsGpuMatrix *Pg = SCS_NULL;

  #if GPU_TRANSPOSE_MAT > 0
  size_t new_buffer_size = 0;
  #endif

+ p->rho_x = rho_x;
  p->cublas_handle = 0;
  p->cusparse_handle = 0;

- p->total_solve_time = 0;
  p->tot_cg_its = 0;

  p->buffer_size = 0;
@@ -181,13 +230,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,

  Ag->n = A->n;
  Ag->m = A->m;
- Ag->Annz = A->p[A->n];
+ Ag->nnz = A->p[A->n];
  Ag->descr = 0;
- /* Matrix description */
-
- p->Ag = Ag;
- p->Agt = SCS_NULL;
-
  cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
  cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
  cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
@@ -196,10 +240,10 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
  cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
  cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
  cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
- cudaMalloc((void **)&p->tmp_m,
- A->m * sizeof(scs_float)); /* intermediate result */
+ cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
  cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
  cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
+ cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));

  cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
  cudaMemcpyHostToDevice);
@@ -207,25 +251,59 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
  cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
  cudaMemcpyHostToDevice);

- cusparseCreateCsr
- (&Ag->descr, Ag->n, Ag->m, Ag->Annz, Ag->p, Ag->i, Ag->x,
- SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
- CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+ p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
+ for (i = 0; i < A->m; ++i)
+ p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
+ cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
+ cudaMemcpyHostToDevice);

- cudaMalloc((void **)&tmp_null_n, A->n * sizeof(scs_float));
- cudaMalloc((void **)&tmp_null_m, A->m * sizeof(scs_float));
- cusparseCreateDnVec(&p->dn_vec_n, Ag->n, tmp_null_n, SCS_CUDA_FLOAT);
- cusparseCreateDnVec(&p->dn_vec_m, Ag->m, tmp_null_m, SCS_CUDA_FLOAT);
- cudaFree(tmp_null_n);
- cudaFree(tmp_null_m);
+ cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+
+ if (P) {
+ Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+ P_full = fill_p_matrix(P);
+ Pg->n = P_full->n;
+ Pg->m = P_full->m;
+ Pg->nnz = P_full->p[P_full->n];
+ Pg->descr = 0;
+ cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
+ cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
+ cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
+
+ cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
+ cudaMemcpyHostToDevice);
+ cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
+ cudaMemcpyHostToDevice);
+ cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
+ cudaMemcpyHostToDevice);
+
+ cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+
+ SCS(cs_spfree)(P_full);
+ } else {
+ Pg = SCS_NULL;
+ }

- get_preconditioner(A, stgs, p);
+ p->Ag = Ag;
+ p->Pg = Pg;
+ p->Agt = SCS_NULL;
+
+ /* we initialize with tmp_m but always overwrite it so it doesn't matter */
+ cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
+ cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
+ cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
+
+ set_preconditioner(p, rho_y_vec);

  #if GPU_TRANSPOSE_MAT > 0
  p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
  p->Agt->n = A->m;
  p->Agt->m = A->n;
- p->Agt->Annz = A->p[A->n];
+ p->Agt->nnz = A->p[A->n];
  p->Agt->descr = 0;
  /* Matrix description */

@@ -234,13 +312,10 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
  cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
  /* transpose Ag into Agt for faster multiplies */
  /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
- cusparseCsr2cscEx2_bufferSize
- (p->cusparse_handle, A->n, A->m, A->p[A->n],
- Ag->x, Ag->p, Ag->i,
- p->Agt->x, p->Agt->p, p->Agt->i,
- SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
- CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
- &new_buffer_size);
+ cusparseCsr2cscEx2_bufferSize(
+ p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
+ p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);

  if (new_buffer_size > p->buffer_size) {
  if (p->buffer != SCS_NULL) {
@@ -250,24 +325,20 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
  p->buffer_size = new_buffer_size;
  }

- cusparseCsr2cscEx2
- (p->cusparse_handle, A->n, A->m, A->p[A->n],
- Ag->x, Ag->p, Ag->i,
- p->Agt->x, p->Agt->p, p->Agt->i,
- SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
- CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
- p->buffer);
-
- cusparseCreateCsr
- (&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->Annz,
- p->Agt->p, p->Agt->i, p->Agt->x,
- SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
- CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+ cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
+ Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
+ CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
+ SCS_CSR2CSC_ALG, p->buffer);
+
+ cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
+ p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
+ SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
+ SCS_CUDA_FLOAT);
  #endif

  err = cudaGetLastError();
  if (err != cudaSuccess) {
- printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
+ printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
  cudaGetErrorString(err));
  SCS(free_lin_sys_work)(p);
  return SCS_NULL;
  return SCS_NULL;
@@ -275,138 +346,173 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
275
346
  return p;
276
347
  }
277
348
 
278
- static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
279
- scs_float *z, scs_float *r, scs_int n) {
280
- cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
281
- CUBLAS(tbmv)
282
- (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
283
- 0, M, 1, z, 1);
284
- }
285
-
286
- /* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
287
- static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
288
- ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
349
+ /* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
350
+ * b */
351
+ /* on GPU */
352
+ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
289
353
  scs_int max_its, scs_float tol) {
290
- scs_int i, n = A->n;
291
- scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
354
+ scs_int i, n = pr->n;
355
+ scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
292
356
  scs_float onef = 1.0, neg_onef = -1.0;
293
357
  scs_float *p = pr->p; /* cg direction */
294
358
  scs_float *Gp = pr->Gp; /* updated CG direction */
295
359
  scs_float *r = pr->r; /* cg residual */
296
360
  scs_float *z = pr->z; /* preconditioned */
297
- scs_float *M = pr->M; /* preconditioner */
298
361
  cublasHandle_t cublas_handle = pr->cublas_handle;
299
362
 
300
- if (s == SCS_NULL) {
363
+ if (!s) {
364
+ /* take s = 0 */
365
+ /* r = b */
301
366
  cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
367
+ /* b = 0 */
302
368
  cudaMemset(bg, 0, n * sizeof(scs_float));
303
369
  } else {
304
370
  /* p contains bg temporarily */
305
371
  cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
306
- /* bg contains s */
372
+ /* bg = s */
307
373
  cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
308
- mat_vec(A, stgs, pr, bg, r);
374
+ /* r = Mat * s */
375
+ mat_vec(pr, bg, r);
376
+ /* r = Mat * s - b */
309
377
  CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
378
+ /* r = b - Mat * s */
310
379
  CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
311
380
  }
312
381
 
313
- /* for some reason nrm2 is VERY slow */
314
- /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
315
- CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
316
- nrm_r = SQRTF(nrm_r);
317
382
  /* check to see if we need to run CG at all */
318
- if (nrm_r < MIN(tol, 1e-18)) {
383
+ if (cg_gpu_norm(cublas_handle, r, n) < tol) {
319
384
  return 0;
320
385
  }
321
386
 
322
- apply_pre_conditioner(cublas_handle, M, z, r, n);
323
- CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
324
- /* put z in p, replacing temp mem */
387
+ /* z = M r */
388
+ cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
389
+ scale_by_diag(cublas_handle, pr->M, z, n);
390
+ /* ztr = z'r */
391
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
392
+ /* p = z */
325
393
  cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
326
394
 
327
395
  for (i = 0; i < max_its; ++i) {
328
- mat_vec(A, stgs, pr, p, Gp);
329
-
330
- CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
331
-
332
- alpha = ipzr / p_gp;
396
+ /* Gp = Mat * p */
397
+ mat_vec(pr, p, Gp);
398
+ /* ptGp = p'Gp */
399
+ CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
400
+ /* alpha = z'r / p'G p */
401
+ alpha = ztr / ptGp;
333
402
  neg_alpha = -alpha;
334
-
403
+ /* b += alpha * p */
335
404
  CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
405
+ /* r -= alpha * G p */
336
406
  CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
337
407
 
338
- /* for some reason nrm2 is VERY slow */
339
- /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
340
- CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
341
- nrm_r = SQRTF(nrm_r);
342
- if (nrm_r < tol) {
343
- i++;
344
- break;
345
- }
346
- ipzr_old = ipzr;
347
- apply_pre_conditioner(cublas_handle, M, z, r, n);
348
- CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
408
+ #if VERBOSITY > 3
409
+ scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
410
+ cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
411
+ #endif
349
412
 
350
- beta = ipzr / ipzr_old;
413
+ if (cg_gpu_norm(cublas_handle, r, n) < tol) {
414
+ return i + 1;
415
+ }
416
+ /* z = M r */
417
+ cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
418
+ scale_by_diag(cublas_handle, pr->M, z, n);
419
+ ztr_prev = ztr;
420
+ /* ztr = z'r */
421
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
422
+ beta = ztr / ztr_prev;
423
+ /* p = beta * p, where beta = ztr / ztr_prev */
351
424
  CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
425
+ /* p = z + beta * p */
352
426
  CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
353
427
  }
354
- #if EXTRA_VERBOSE > 0
355
- scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
356
- #endif
357
428
  return i;
358
429
  }
359
430
 
360
- scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
361
- ScsLinSysWork *p, scs_float *b, const scs_float *s,
362
- scs_int iter) {
363
- scs_int cg_its;
364
- SCS(timer) linsys_timer;
365
- scs_float *bg = p->bg;
431
+ /* solves Mx = b, for x but stores result in b */
432
+ /* s contains warm-start (if available) */
433
+ /*
434
+ * [x] = [rho_x I + P A' ]^{-1} [rx]
435
+ * [y] [ A -R_y ] [ry]
436
+ *
437
+ * R_y = diag(rho_y_vec)
438
+ *
439
+ * becomes:
440
+ *
441
+ * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
442
+ * y = R_y^{-1} (Ax - ry)
443
+ *
444
+ */
445
+ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
446
+ scs_float tol) {
447
+ scs_int cg_its, max_iters;
366
448
  scs_float neg_onef = -1.0;
449
+
450
+ /* these are on GPU */
451
+ scs_float *bg = p->bg;
452
+ scs_float *tmp_m = p->tmp_m;
367
453
  ScsGpuMatrix *Ag = p->Ag;
368
- scs_float cg_tol =
369
- SCS(norm)(b, Ag->n) *
370
- (iter < 0 ? CG_BEST_TOL
371
- : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
372
- SCS(tic)(&linsys_timer);
373
- /* all on GPU */
374
- cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
375
-
376
- cusparseDnVecSetValues(p->dn_vec_m, (void *) &(bg[Ag->n]));
377
- cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
378
- SCS(_accum_by_atrans_gpu)(
379
- Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
380
- &p->buffer_size, &p->buffer
381
- );
382
-
383
- /* solves (I+A'A)x = b, s warm start, solution stored in b */
384
- cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
454
+ ScsGpuMatrix *Pg = p->Pg;
455
+
456
+ if (CG_NORM(b, p->n + p->m) <= 1e-12) {
457
+ memset(b, 0, (p->n + p->m) * sizeof(scs_float));
458
+ return 0;
459
+ }
460
+
461
+ if (tol <= 0.) {
462
+ scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
463
+ "INDIRECT flag.\n",
464
+ tol);
465
+ }
466
+
467
+ /* bg = b = [rx; ry] */
468
+ cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
469
+ cudaMemcpyHostToDevice);
470
+ /* tmp = ry */
471
+ cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
472
+ cudaMemcpyDeviceToDevice);
473
+ /* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
474
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
475
+
476
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
477
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
478
+ /* bg[:n] = rx + A' R ry */
479
+ SCS(accum_by_atrans_gpu)
480
+ (Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
481
+ &p->buffer);
482
+
483
+ /* set max_iters to 10 * n (though in theory n is enough for any tol) */
484
+ max_iters = 10 * Ag->n;
485
+
486
+ /* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
487
+ * in bg */
488
+ cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
489
+
490
+ /* bg[n:] = -ry */
385
491
  CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
492
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
493
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* x */
386
494
 
387
- cusparseDnVecSetValues(p->dn_vec_m, (void *) &(bg[Ag->n]));
388
- cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
495
+ /* b[n:] = Ax - ry */
389
496
  #if GPU_TRANSPOSE_MAT > 0
390
- SCS(_accum_by_atrans_gpu)(
391
- p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
392
- &p->buffer_size, &p->buffer
393
- );
497
+ SCS(accum_by_atrans_gpu)
498
+ (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
499
+ &p->buffer);
394
500
  #else
395
- SCS(_accum_by_a_gpu)(
396
- Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
397
- &p->buffer_size, &p->buffer
398
- );
501
+ SCS(accum_by_a_gpu)
502
+ (Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
503
+ &p->buffer);
399
504
  #endif
400
505
 
401
- cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
402
-
403
- if (iter >= 0) {
404
- p->tot_cg_its += cg_its;
405
- }
506
+ /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
507
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
406
508
 
407
- p->total_solve_time += SCS(tocq)(&linsys_timer);
408
- #if EXTRAVERBOSE > 0
409
- scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
509
+ /* copy bg = [x; y] back to b */
510
+ cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
511
+ cudaMemcpyDeviceToHost);
512
+ p->tot_cg_its += cg_its;
513
+ #if VERBOSITY > 1
514
+ scs_printf("tol %.3e\n", tol);
515
+ scs_printf("cg_its %i\n", (int)cg_its);
410
516
  #endif
411
517
  return 0;
412
518
  }
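
A few reference notes on the rewritten GPU indirect solver, for readers comparing the hunks above. First, the comment block added to SCS(solve_lin_sys) records a block-elimination identity; spelled out (same notation as the comment, restated here rather than quoted from the source):

\[
\begin{bmatrix} \rho_x I + P & A^\top \\ A & -R_y \end{bmatrix}
\begin{bmatrix} x \\ y \end{bmatrix}
=
\begin{bmatrix} r_x \\ r_y \end{bmatrix},
\qquad R_y = \operatorname{diag}(\rho_y).
\]

The second block row gives \( y = R_y^{-1}(Ax - r_y) \); substituting into the first,

\[
(\rho_x I + P)x + A^\top R_y^{-1} A\,x - A^\top R_y^{-1} r_y = r_x
\quad\Longrightarrow\quad
(\rho_x I + P + A^\top R_y^{-1} A)\,x = r_x + A^\top R_y^{-1} r_y,
\]

which is exactly the system handed to pcg, and the matrix whose inverted diagonal set_preconditioner loads into p->M.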
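Second, that diagonal can be read straight off the CSC arrays, as set_preconditioner does. Below is a minimal CPU sketch of the same computation in plain C; the function name, array names, and the 2x2 example are illustrative, not taken from the gem:

#include <stdio.h>

/* Sketch: diagonal of rho_x*I + P + A' * diag(rho_y)^{-1} * A for an
 * m x n matrix A and an upper-triangular n x n matrix P, both in CSC
 * layout (Ap/Pp column pointers, Ai/Pi row indices, Ax/Px values).
 * Mirrors the loop structure of set_preconditioner; hypothetical names. */
static void diag_precond(int n, const int *Ap, const int *Ai, const double *Ax,
                         const int *Pp, const int *Pi, const double *Px,
                         const double *rho_y, double rho_x, double *M) {
  for (int i = 0; i < n; ++i) { /* column i */
    double d = rho_x;
    for (int k = Ap[i]; k < Ap[i + 1]; ++k)
      d += Ax[k] * Ax[k] / rho_y[Ai[k]]; /* (A' R_y^{-1} A)_{ii} */
    for (int k = Pp[i]; k < Pp[i + 1]; ++k)
      if (Pi[k] == i) { d += Px[k]; break; } /* P_{ii} only */
    M[i] = 1.0 / d; /* store the inverted diagonal */
  }
}

int main(void) {
  /* A = [1 2; 0 3], P = diag(4, 5), rho_y = (1, 2), rho_x = 0.1 */
  int Ap[] = {0, 1, 3}, Ai[] = {0, 0, 1};
  double Ax[] = {1, 2, 3};
  int Pp[] = {0, 1, 2}, Pi[] = {0, 1};
  double Px[] = {4, 5};
  double rho_y[] = {1, 2}, M[2];
  diag_precond(2, Ap, Ai, Ax, Pp, Pi, Px, rho_y, 0.1, M);
  printf("M = [%g, %g]\n", M[0], M[1]); /* expect 1/5.1 and 1/13.6 */
  return 0;
}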
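Finally, the rewritten pcg is textbook preconditioned conjugate gradients with a diagonal preconditioner. Here is a dense CPU sketch with the same update order (z = Mr, ztr, alpha, beta) under that reading; illustrative names only, no cuBLAS:

#include <math.h>
#include <stdio.h>
#include <string.h>

#define N 2

/* y = G * x for a small dense SPD G; stands in for mat_vec */
static void mat_vec_dense(double G[N][N], const double *x, double *y) {
  for (int i = 0; i < N; ++i) {
    y[i] = 0;
    for (int j = 0; j < N; ++j) y[i] += G[i][j] * x[j];
  }
}

static double dot(const double *a, const double *b) {
  double s = 0;
  for (int i = 0; i < N; ++i) s += a[i] * b[i];
  return s;
}

/* Preconditioned CG, Minv holding the inverted diagonal. As in the
 * diff, b enters as the RHS and leaves as the solution (zero warm
 * start); returns the iteration count. Hypothetical names. */
static int pcg_sketch(double G[N][N], const double *Minv, double *b,
                      int max_its, double tol) {
  double r[N], z[N], p[N], Gp[N], ztr, ztr_prev;
  memcpy(r, b, sizeof r); /* r = b */
  memset(b, 0, sizeof r); /* x = 0, stored in b */
  for (int i = 0; i < N; ++i) z[i] = Minv[i] * r[i]; /* z = M r */
  ztr = dot(z, r);
  memcpy(p, z, sizeof p); /* p = z */
  for (int it = 0; it < max_its; ++it) {
    mat_vec_dense(G, p, Gp);
    double alpha = ztr / dot(p, Gp);
    for (int i = 0; i < N; ++i) { b[i] += alpha * p[i]; r[i] -= alpha * Gp[i]; }
    if (sqrt(dot(r, r)) < tol) return it + 1;
    for (int i = 0; i < N; ++i) z[i] = Minv[i] * r[i]; /* z = M r */
    ztr_prev = ztr;
    ztr = dot(z, r);
    double beta = ztr / ztr_prev;
    for (int i = 0; i < N; ++i) p[i] = z[i] + beta * p[i];
  }
  return max_its;
}

int main(void) {
  double G[N][N] = {{4, 1}, {1, 3}};   /* SPD test matrix */
  double Minv[N] = {1.0 / 4, 1.0 / 3}; /* inverted diagonal of G */
  double b[N] = {1, 2};                /* RHS, overwritten with x */
  int its = pcg_sketch(G, Minv, b, 100, 1e-10);
  printf("its = %d, x = [%g, %g]\n", its, b[0], b[1]); /* x = [1/11, 7/11] */
  return 0;
}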