scs 0.2.2 → 0.3.2

Files changed (103)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE.txt +18 -18
  4. data/README.md +19 -14
  5. data/lib/scs/ffi.rb +31 -20
  6. data/lib/scs/solver.rb +32 -9
  7. data/lib/scs/version.rb +1 -1
  8. data/vendor/scs/CITATION.cff +39 -0
  9. data/vendor/scs/CMakeLists.txt +320 -0
  10. data/vendor/scs/Makefile +32 -23
  11. data/vendor/scs/README.md +9 -218
  12. data/vendor/scs/include/aa.h +67 -23
  13. data/vendor/scs/include/cones.h +22 -19
  14. data/vendor/scs/include/glbopts.h +107 -79
  15. data/vendor/scs/include/linalg.h +3 -4
  16. data/vendor/scs/include/linsys.h +58 -44
  17. data/vendor/scs/include/normalize.h +6 -5
  18. data/vendor/scs/include/rw.h +8 -2
  19. data/vendor/scs/include/scs.h +257 -141
  20. data/vendor/scs/include/scs_types.h +34 -0
  21. data/vendor/scs/include/scs_work.h +83 -0
  22. data/vendor/scs/include/util.h +3 -15
  23. data/vendor/scs/linsys/cpu/direct/private.c +241 -232
  24. data/vendor/scs/linsys/cpu/direct/private.h +13 -7
  25. data/vendor/scs/linsys/cpu/indirect/private.c +194 -118
  26. data/vendor/scs/linsys/cpu/indirect/private.h +7 -4
  27. data/vendor/scs/linsys/csparse.c +87 -0
  28. data/vendor/scs/linsys/csparse.h +34 -0
  29. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +6 -6
  30. data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +6 -1
  31. data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
  32. data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
  33. data/vendor/scs/linsys/external/qdldl/changes +2 -0
  34. data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
  35. data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
  36. data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
  37. data/vendor/scs/linsys/gpu/gpu.c +58 -21
  38. data/vendor/scs/linsys/gpu/gpu.h +70 -35
  39. data/vendor/scs/linsys/gpu/indirect/private.c +394 -157
  40. data/vendor/scs/linsys/gpu/indirect/private.h +27 -12
  41. data/vendor/scs/linsys/scs_matrix.c +478 -0
  42. data/vendor/scs/linsys/scs_matrix.h +70 -0
  43. data/vendor/scs/scs.mk +14 -10
  44. data/vendor/scs/src/aa.c +394 -110
  45. data/vendor/scs/src/cones.c +497 -359
  46. data/vendor/scs/src/ctrlc.c +15 -5
  47. data/vendor/scs/src/linalg.c +107 -26
  48. data/vendor/scs/src/normalize.c +30 -72
  49. data/vendor/scs/src/rw.c +202 -27
  50. data/vendor/scs/src/scs.c +769 -571
  51. data/vendor/scs/src/scs_version.c +11 -3
  52. data/vendor/scs/src/util.c +37 -106
  53. data/vendor/scs/test/minunit.h +22 -8
  54. data/vendor/scs/test/problem_utils.h +180 -25
  55. data/vendor/scs/test/problems/degenerate.h +130 -0
  56. data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
  57. data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
  58. data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
  59. data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
  60. data/vendor/scs/test/problems/random_prob +0 -0
  61. data/vendor/scs/test/problems/random_prob.h +45 -0
  62. data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
  63. data/vendor/scs/test/problems/small_lp.h +14 -13
  64. data/vendor/scs/test/problems/small_qp.h +352 -0
  65. data/vendor/scs/test/problems/test_validation.h +43 -0
  66. data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
  67. data/vendor/scs/test/random_socp_prob.c +54 -53
  68. data/vendor/scs/test/rng.h +109 -0
  69. data/vendor/scs/test/run_from_file.c +20 -11
  70. data/vendor/scs/test/run_tests.c +35 -2
  71. metadata +29 -98
  72. data/vendor/scs/linsys/amatrix.c +0 -305
  73. data/vendor/scs/linsys/amatrix.h +0 -36
  74. data/vendor/scs/linsys/amatrix.o +0 -0
  75. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  76. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  77. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  78. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  79. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  80. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  81. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  82. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  83. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  84. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  85. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  86. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  87. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  88. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  89. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  90. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  91. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  92. data/vendor/scs/src/aa.o +0 -0
  93. data/vendor/scs/src/cones.o +0 -0
  94. data/vendor/scs/src/ctrlc.o +0 -0
  95. data/vendor/scs/src/linalg.o +0 -0
  96. data/vendor/scs/src/normalize.o +0 -0
  97. data/vendor/scs/src/rw.o +0 -0
  98. data/vendor/scs/src/scs.o +0 -0
  99. data/vendor/scs/src/scs_version.o +0 -0
  100. data/vendor/scs/src/util.o +0 -0
  101. data/vendor/scs/test/data/small_random_socp +0 -0
  102. data/vendor/scs/test/problems/small_random_socp.h +0 -33
  103. data/vendor/scs/test/run_tests +0 -2
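The diff reproduced below is for `data/vendor/scs/linsys/gpu/indirect/private.c` (entry 39 above, +394 -157). It ports the GPU indirect linear solver to the SCS 3.x linear-system interface: `SCS(solve_lin_sys)` now receives the CG tolerance from the caller instead of deriving it from `cg_rate`, and the solve gains a quadratic term `P` and a diagonal scaling `diag_r`. As a reading aid, the reduction the new `SCS(solve_lin_sys)` performs (restated in LaTeX from its own comment block further down) is:

$$
\begin{bmatrix} R_x + P & A^\top \\ A & -R_y \end{bmatrix}
\begin{bmatrix} x \\ y \end{bmatrix}
=
\begin{bmatrix} r_x \\ r_y \end{bmatrix}
\quad\Longrightarrow\quad
\begin{aligned}
x &= \left(R_x + P + A^\top R_y^{-1} A\right)^{-1}\left(r_x + A^\top R_y^{-1} r_y\right) \\
y &= R_y^{-1}\left(A x - r_y\right)
\end{aligned}
$$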
data/vendor/scs/linsys/gpu/indirect/private.c CHANGED
@@ -1,61 +1,115 @@
 #include "private.h"
+#include "linsys.h"
 
-#define CG_BEST_TOL 1e-9
-#define CG_MIN_TOL 1e-1
-
-/* do not use within pcg, reuses memory */
-void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
-                          const scs_float *x, scs_float *y) {
-  scs_float *v_m = p->tmp_m;
-  scs_float *v_n = p->r;
-  cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
-  cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
-  SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
-  cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
-}
+/* norm to use when deciding convergence */
+/* should be consistent with CG_NORM in glbopts.h */
+#define USE_L2_NORM (0)
 
-/* do not use within pcg, reuses memory */
-void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
-                     scs_float *y) {
-  scs_float *v_m = p->tmp_m;
-  scs_float *v_n = p->r;
-  cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
-  cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
-#if GPU_TRANSPOSE_MAT > 0
-  SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
+static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
+                             scs_int n) {
+#if USE_L2_NORM > 0
+  scs_float nrm;
+  CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
 #else
-  SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
+  scs_int idx;
+  scs_float nrm;
+  CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
+  /* NOTE: we take idx -1 here since the routine above returns Fortran idxs */
+  cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
+  nrm = ABS(nrm);
 #endif
-  cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
+  return nrm;
 }
 
-char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
-  char *str = (char *)scs_malloc(sizeof(char) * 128);
-  sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
-          (long)A->p[A->n], stgs->cg_rate);
-  return str;
+const char *SCS(get_lin_sys_method)() {
+  return "sparse-indirect GPU";
 }
 
+/*
 char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
   char *str = (char *)scs_malloc(sizeof(char) * 128);
-  sprintf(str,
-          "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
-          (scs_float)p->tot_cg_its / (info->iter + 1),
-          p->total_solve_time / (info->iter + 1) / 1e3);
+  sprintf(str, "lin-sys: avg cg its: %2.2f\n",
+          (scs_float)p->tot_cg_its / (info->iter + 1));
   p->tot_cg_its = 0;
-  p->total_solve_time = 0;
   return str;
 }
+*/
+
+/* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
+/* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
+static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
+  scs_int i, k;
+  const ScsMatrix *A = p->A;
+  const ScsMatrix *P = p->P;
+  scs_float *M = p->M;
+
+#if VERBOSITY > 0
+  scs_printf("getting pre-conditioner\n");
+#endif
+
+  /* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
+  for (i = 0; i < A->n; ++i) { /* cols */
+    /* M_ii = (R_x)_i */
+    M[i] = diag_r[i];
+    /* M_ii += a_i' (R_y)^-1 a_i */
+    for (k = A->p[i]; k < A->p[i + 1]; ++k) {
+      /* A->i[k] is row of entry k with value A->x[k] */
+      M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
+    }
+    if (P) {
+      for (k = P->p[i]; k < P->p[i + 1]; k++) {
+        /* diagonal element only */
+        if (P->i[k] == i) { /* row == col */
+          /* M_ii += P_ii */
+          M[i] += P->x[k];
+          break;
+        }
+      }
+    }
+    /* finally invert for pre-conditioner */
+    M[i] = 1. / M[i];
+  }
+  cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+#if VERBOSITY > 0
+  scs_printf("finished getting pre-conditioner\n");
+#endif
+}
+
+/* no need to update anything in this case */
+void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
+  scs_int i;
+
+  /* R_x to gpu */
+  cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
+             cudaMemcpyHostToDevice);
+
+  /* 1/R_y to gpu */
+  for (i = 0; i < p->m; ++i)
+    p->inv_r_y[i] = 1. / diag_r[p->n + i];
+  cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
+             cudaMemcpyHostToDevice);
+
+  /* set preconditioner M on gpu */
+  set_preconditioner(p, diag_r);
+}
 
 void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
   if (p) {
+    scs_free(p->M);
+    scs_free(p->inv_r_y);
     cudaFree(p->p);
     cudaFree(p->r);
     cudaFree(p->Gp);
     cudaFree(p->bg);
     cudaFree(p->tmp_m);
     cudaFree(p->z);
-    cudaFree(p->M);
+    cudaFree(p->M_gpu);
+    cudaFree(p->r_x_gpu);
+    cudaFree(p->inv_r_y_gpu);
+    if (p->Pg) {
+      SCS(free_gpu_matrix)(p->Pg);
+      scs_free(p->Pg);
+    }
     if (p->Ag) {
       SCS(free_gpu_matrix)(p->Ag);
       scs_free(p->Ag);
@@ -64,6 +118,12 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
       SCS(free_gpu_matrix)(p->Agt);
       scs_free(p->Agt);
     }
+    if (p->buffer != SCS_NULL) {
+      cudaFree(p->buffer);
+    }
+    cusparseDestroyDnVec(p->dn_vec_m);
+    cusparseDestroyDnVec(p->dn_vec_n);
+    cusparseDestroyDnVec(p->dn_vec_n_p);
     cusparseDestroy(p->cusparse_handle);
     cublasDestroy(p->cublas_handle);
     /* Don't reset because it interferes with other GPU programs. */
@@ -72,53 +132,127 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
   }
 }
 
-/*y = (RHO_X * I + A'A)x */
-static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
-                    ScsLinSysWork *p, const scs_float *x, scs_float *y) {
-  /* x and y MUST already be loaded to GPU */
-  scs_float *tmp_m = p->tmp_m; /* temp memory */
-  cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
-  SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
-  cudaMemset(y, 0, A->n * sizeof(scs_float));
-  SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
-  CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
+/* z = M * z elementwise in place, assumes M, z on GPU */
+static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
+                          scs_float *z, scs_int n) {
+  CUBLAS(tbmv)
+  (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
+   0, M, 1, z, 1);
 }
 
-/* M = inv ( diag ( RHO_X * I + A'A ) ) */
-static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
-                               ScsLinSysWork *p) {
-  scs_int i;
-  scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
+/* y = (R_x + P + A' R_y^{-1} A) x */
+static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
+  /* x and y MUST already be loaded to GPU */
+  scs_float *z = p->tmp_m; /* temp memory */
+  cudaMemset(z, 0, p->m * sizeof(scs_float));
+
+  cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
+  cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
+  cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
+
+  /* y = x */
+  cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+  /* y = R_x * x */
+  scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
+
+  if (p->Pg) {
+    /* y = R_x * x + P x */
+    SCS(accum_by_p_gpu)
+    (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
+     &p->buffer);
+  }
 
-#if EXTRA_VERBOSE > 0
-  scs_printf("getting pre-conditioner\n");
+  /* z = Ax */
+#if GPU_TRANSPOSE_MAT > 0
+  SCS(accum_by_atrans_gpu)
+  (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+   &p->buffer);
+#else
+  SCS(accum_by_a_gpu)
+  (p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+   &p->buffer);
 #endif
+  /* z = R_y^{-1} A x */
+  scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);
 
-  for (i = 0; i < A->n; ++i) {
-    M[i] = 1 / (stgs->rho_x +
-                SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
-    /* M[i] = 1; */
-  }
-  cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
-  scs_free(M);
+  /* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
+  SCS(accum_by_atrans_gpu)
+  (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
+   &p->buffer);
+}
 
-#if EXTRA_VERBOSE > 0
-  scs_printf("finished getting pre-conditioner\n");
-#endif
+/* P comes in upper triangular, expand to full
+ * First compute triplet version of full matrix, then compress to csc
+ * */
+static csc *fill_p_matrix(const ScsMatrix *P) {
+  scs_int i, j, k, kk;
+  scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
+  csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
+  csc *P_full;
+  kk = 0;
+  for (j = 0; j < P->n; j++) { /* cols */
+    for (k = P->p[j]; k < P->p[j + 1]; k++) {
+      i = P->i[k]; /* row */
+      if (i > j) { /* only upper triangular needed */
+        break;
+      }
+      P_tmp->i[kk] = i;
+      P_tmp->p[kk] = j;
+      P_tmp->x[kk] = P->x[k];
+      kk++;
+      if (i == j) { /* diagonal */
+        continue;
+      }
+      P_tmp->i[kk] = j;
+      P_tmp->p[kk] = i;
+      P_tmp->x[kk] = P->x[k];
+      kk++;
+    }
+  }
+  P_tmp->nz = kk; /* set number of nonzeros */
+  P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
+  SCS(cs_spfree)(P_tmp);
+  return P_full;
 }
 
-ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
-                                      const ScsSettings *stgs) {
+ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
                                      const scs_float *diag_r) {
   cudaError_t err;
-  ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
-  ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+  csc *P_full;
+  ScsLinSysWork *p = SCS_NULL;
+  ScsGpuMatrix *Ag = SCS_NULL;
+  ScsGpuMatrix *Pg = SCS_NULL;
+  int device_count;
+
+  err = cudaGetDeviceCount(&device_count);
+  if (err > 0) {
+    scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
+    return SCS_NULL;
+  }
+
+  p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
+  Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+
+  p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
+  p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
+
+  p->A = A;
+  p->P = P;
+  p->m = A->m;
+  p->n = A->n;
+
+#if GPU_TRANSPOSE_MAT > 0
+  size_t new_buffer_size = 0;
+#endif
 
   p->cublas_handle = 0;
   p->cusparse_handle = 0;
 
-  p->total_solve_time = 0;
   p->tot_cg_its = 0;
 
+  p->buffer_size = 0;
+  p->buffer = SCS_NULL;
+
   /* Get handle to the CUBLAS context */
   cublasCreate(&p->cublas_handle);
 
@@ -127,15 +261,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
 
   Ag->n = A->n;
   Ag->m = A->m;
-  Ag->Annz = A->p[A->n];
+  Ag->nnz = A->p[A->n];
   Ag->descr = 0;
-  /* Matrix description */
-  cusparseCreateMatDescr(&Ag->descr);
-  cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-  cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
-  p->Ag = Ag;
-  p->Agt = SCS_NULL;
-
   cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
   cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
   cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
@@ -144,10 +271,11 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
   cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
   cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
   cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
-  cudaMalloc((void **)&p->tmp_m,
-             A->m * sizeof(scs_float)); /* intermediate result */
+  cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
   cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
-  cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
+  cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
+  cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
+  cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));
 
   cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
              cudaMemcpyHostToDevice);
@@ -155,32 +283,89 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
   cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
              cudaMemcpyHostToDevice);
 
-  get_preconditioner(A, stgs, p);
+  cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
+                    SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+                    CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+
+  if (P) {
+    Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+    P_full = fill_p_matrix(P);
+    Pg->n = P_full->n;
+    Pg->m = P_full->m;
+    Pg->nnz = P_full->p[P_full->n];
+    Pg->descr = 0;
+    cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
+    cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
+    cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
+
+    cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
+               cudaMemcpyHostToDevice);
+    cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
+               cudaMemcpyHostToDevice);
+    cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
+               cudaMemcpyHostToDevice);
+
+    cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
+                      SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+                      CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+
+    SCS(cs_spfree)(P_full);
+  } else {
+    Pg = SCS_NULL;
+  }
+
+  p->Ag = Ag;
+  p->Pg = Pg;
+  p->Agt = SCS_NULL;
+
+  /* we initialize with tmp_m but always overwrite it so it doesn't matter */
+  cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
+  cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
+  cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
+
+  /* Form preconditioner and copy R_x, 1/R_y to gpu */
+  SCS(update_lin_sys_diag_r)(p, diag_r);
 
 #if GPU_TRANSPOSE_MAT > 0
   p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
   p->Agt->n = A->m;
   p->Agt->m = A->n;
-  p->Agt->Annz = A->p[A->n];
+  p->Agt->nnz = A->p[A->n];
   p->Agt->descr = 0;
   /* Matrix description */
-  cusparseCreateMatDescr(&p->Agt->descr);
-  cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-  cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
 
   cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
   cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
   cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
   /* transpose Ag into Agt for faster multiplies */
   /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
-  CUSPARSE(csr2csc)
-  (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
-   p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
+  cusparseCsr2cscEx2_bufferSize(
+      p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
+      p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
+      CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);
+
+  if (new_buffer_size > p->buffer_size) {
+    if (p->buffer != SCS_NULL) {
+      cudaFree(p->buffer);
+    }
+    cudaMalloc(&p->buffer, new_buffer_size);
+    p->buffer_size = new_buffer_size;
+  }
+
+  cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
+                     Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
+                     CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
+                     SCS_CSR2CSC_ALG, p->buffer);
+
+  cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
+                    p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
+                    SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
+                    SCS_CUDA_FLOAT);
 #endif
 
   err = cudaGetLastError();
   if (err != cudaSuccess) {
-    printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
+    printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
            cudaGetErrorString(err));
     SCS(free_lin_sys_work)(p);
     return SCS_NULL;
@@ -188,117 +373,169 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
   return p;
 }
 
-static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
-                                  scs_float *z, scs_float *r, scs_int n) {
-  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
-  CUBLAS(tbmv)
-  (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
-   0, M, 1, z, 1);
-}
-
-/* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
-static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
-                   ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
+/* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
+ * b, on GPU */
+static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
                    scs_int max_its, scs_float tol) {
-  scs_int i, n = A->n;
-  scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
+  scs_int i, n = pr->n;
+  scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
   scs_float onef = 1.0, neg_onef = -1.0;
   scs_float *p = pr->p;   /* cg direction */
   scs_float *Gp = pr->Gp; /* updated CG direction */
   scs_float *r = pr->r;   /* cg residual */
   scs_float *z = pr->z;   /* preconditioned */
-  scs_float *M = pr->M;   /* preconditioner */
   cublasHandle_t cublas_handle = pr->cublas_handle;
 
-  if (s == SCS_NULL) {
+  if (!s) {
+    /* take s = 0 */
+    /* r = b */
     cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+    /* b = 0 */
     cudaMemset(bg, 0, n * sizeof(scs_float));
   } else {
     /* p contains bg temporarily */
     cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
-    /* bg contains s */
+    /* bg = s */
     cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
-    mat_vec(A, stgs, pr, bg, r);
+    /* r = Mat * s */
+    mat_vec(pr, bg, r);
+    /* r = Mat * s - b */
     CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
+    /* r = b - Mat * s */
     CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
   }
 
-  /* for some reason nrm2 is VERY slow */
-  /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
-  CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
-  nrm_r = SQRTF(nrm_r);
   /* check to see if we need to run CG at all */
-  if (nrm_r < MIN(tol, 1e-18)) {
+  if (cg_gpu_norm(cublas_handle, r, n) < tol) {
     return 0;
   }
 
-  apply_pre_conditioner(cublas_handle, M, z, r, n);
-  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
-  /* put z in p, replacing temp mem */
+  /* z = M r */
+  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+  scale_by_diag(cublas_handle, pr->M_gpu, z, n);
+  /* ztr = z'r */
+  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
+  /* p = z */
   cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
 
   for (i = 0; i < max_its; ++i) {
-    mat_vec(A, stgs, pr, p, Gp);
-
-    CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
-
-    alpha = ipzr / p_gp;
+    /* Gp = Mat * p */
+    mat_vec(pr, p, Gp);
+    /* ptGp = p'Gp */
+    CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
+    /* alpha = z'r / p'G p */
+    alpha = ztr / ptGp;
     neg_alpha = -alpha;
-
+    /* b += alpha * p */
     CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
+    /* r -= alpha * G p */
     CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
 
-    /* for some reason nrm2 is VERY slow */
-    /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
-    CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
-    nrm_r = SQRTF(nrm_r);
-    if (nrm_r < tol) {
-      i++;
-      break;
-    }
-    ipzr_old = ipzr;
-    apply_pre_conditioner(cublas_handle, M, z, r, n);
-    CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+#if VERBOSITY > 3
+    scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
+               cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
+#endif
 
-    beta = ipzr / ipzr_old;
+    if (cg_gpu_norm(cublas_handle, r, n) < tol) {
+      return i + 1;
+    }
+    /* z = M r */
+    cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+    scale_by_diag(cublas_handle, pr->M_gpu, z, n);
+    ztr_prev = ztr;
+    /* ztr = z'r */
+    CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
+    beta = ztr / ztr_prev;
+    /* p = beta * p, where beta = ztr / ztr_prev */
     CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
+    /* p = z + beta * p */
     CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
   }
-#if EXTRA_VERBOSE > 0
-  scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
-#endif
   return i;
 }
 
-scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
-                           ScsLinSysWork *p, scs_float *b, const scs_float *s,
-                           scs_int iter) {
-  scs_int cg_its;
-  SCS(timer) linsys_timer;
-  scs_float *bg = p->bg;
+/* solves Mx = b, for x but stores result in b */
+/* s contains warm-start (if available) */
+/*
+ * [x] = [R_x + P     A' ]^{-1} [rx]
+ * [y]   [ A         -R_y]      [ry]
+ *
+ * becomes:
+ *
+ * x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
+ * y = R_y^{-1} (Ax - ry)
+ *
+ */
+scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
+                           scs_float tol) {
+  scs_int cg_its, max_iters;
   scs_float neg_onef = -1.0;
+
+  /* these are on GPU */
+  scs_float *bg = p->bg;
+  scs_float *tmp_m = p->tmp_m;
   ScsGpuMatrix *Ag = p->Ag;
-  scs_float cg_tol =
-      SCS(norm)(b, Ag->n) *
-      (iter < 0 ? CG_BEST_TOL
-                : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
-  SCS(tic)(&linsys_timer);
-  /* all on GPU */
-  cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
-  SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
-  /* solves (I+A'A)x = b, s warm start, solution stored in b */
-  cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
-  CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
-  SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
-  cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
 
-  if (iter >= 0) {
-    p->tot_cg_its += cg_its;
+  if (CG_NORM(b, p->n + p->m) <= 1e-12) {
+    memset(b, 0, (p->n + p->m) * sizeof(scs_float));
+    return 0;
+  }
+
+  if (tol <= 0.) {
+    scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
+               "INDIRECT flag.\n",
+               tol);
   }
 
-  p->total_solve_time += SCS(tocq)(&linsys_timer);
-#if EXTRAVERBOSE > 0
-  scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
+  /* bg = b = [rx; ry] */
+  cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
+             cudaMemcpyHostToDevice);
+  /* tmp = ry */
+  cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
+             cudaMemcpyDeviceToDevice);
+  /* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
+  scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
+
+  cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
+  cusparseDnVecSetValues(p->dn_vec_n, (void *)bg);    /* rx */
+  /* bg[:n] = rx + A' R ry */
+  SCS(accum_by_atrans_gpu)
+  (Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
+   &p->buffer);
+
+  /* set max_iters to 10 * n (though in theory n is enough for any tol) */
+  max_iters = 10 * Ag->n;
+
+  /* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
+   * in bg */
+  cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
+
+  /* bg[n:] = -ry */
+  CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
+  cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
+  cusparseDnVecSetValues(p->dn_vec_n, (void *)bg);           /* x */
+
+  /* b[n:] = Ax - ry */
+#if GPU_TRANSPOSE_MAT > 0
+  SCS(accum_by_atrans_gpu)
+  (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+   &p->buffer);
+#else
+  SCS(accum_by_a_gpu)
+  (Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+   &p->buffer);
+#endif
+
+  /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
+  scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
+
+  /* copy bg = [x; y] back to b */
+  cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
+             cudaMemcpyDeviceToHost);
+  p->tot_cg_its += cg_its;
+#if VERBOSITY > 1
+  scs_printf("tol %.3e\n", tol);
+  scs_printf("cg_its %i\n", (int)cg_its);
 #endif
   return 0;
 }
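To make the preconditioner concrete, here is a minimal host-side sketch of the same Jacobi (diagonal) preconditioner that `set_preconditioner` builds in the diff above, M = inv(diag(R_x + P + A' R_y^{-1} A)). The `Csc` struct, `jacobi_precond`, and the `main` driver are illustrative stand-ins rather than the SCS API; it assumes CSC storage (column pointers `p`, row indices `i`, values `x`) and `P` stored upper-triangular, as in `fill_p_matrix`.

#include <stdio.h>

typedef struct {
  int m, n;   /* rows, cols */
  int *p, *i; /* column pointers (n+1), row indices (nnz) */
  double *x;  /* nonzero values (nnz) */
} Csc;

/* diag_r holds R_x (first n entries) then R_y (next m entries) */
static void jacobi_precond(const Csc *A, const Csc *P, const double *diag_r,
                           double *M) {
  for (int j = 0; j < A->n; ++j) {
    double d = diag_r[j]; /* (R_x)_j */
    /* a_j' R_y^{-1} a_j: walk the nonzeros of column j of A */
    for (int k = A->p[j]; k < A->p[j + 1]; ++k)
      d += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
    if (P) { /* add the diagonal entry P_jj, if present */
      for (int k = P->p[j]; k < P->p[j + 1]; ++k)
        if (P->i[k] == j) { d += P->x[k]; break; }
    }
    M[j] = 1.0 / d; /* invert for the preconditioner */
  }
}

int main(void) {
  /* A = 2x2 identity, no P, R_x = R_y = 1 */
  int p[] = {0, 1, 2}, i[] = {0, 1};
  double x[] = {1.0, 1.0};
  Csc A = {2, 2, p, i, x};
  double diag_r[] = {1.0, 1.0, 1.0, 1.0}, M[2];
  jacobi_precond(&A, NULL, diag_r, M);
  printf("M = [%g, %g]\n", M[0], M[1]);
  return 0;
}

With A the 2x2 identity, no P, and R_x = R_y = 1, each diagonal entry is 1 + 1 = 2, so the sketch prints M = [0.5, 0.5]. Scaling each PCG residual by this diagonal is exactly what `scale_by_diag` does on the GPU via `CUBLAS(tbmv)` with bandwidth zero.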