scs 0.2.2 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE.txt +18 -18
  4. data/README.md +19 -14
  5. data/lib/scs/ffi.rb +31 -20
  6. data/lib/scs/solver.rb +32 -9
  7. data/lib/scs/version.rb +1 -1
  8. data/vendor/scs/CITATION.cff +39 -0
  9. data/vendor/scs/CMakeLists.txt +320 -0
  10. data/vendor/scs/Makefile +32 -23
  11. data/vendor/scs/README.md +9 -218
  12. data/vendor/scs/include/aa.h +67 -23
  13. data/vendor/scs/include/cones.h +22 -19
  14. data/vendor/scs/include/glbopts.h +107 -79
  15. data/vendor/scs/include/linalg.h +3 -4
  16. data/vendor/scs/include/linsys.h +58 -44
  17. data/vendor/scs/include/normalize.h +6 -5
  18. data/vendor/scs/include/rw.h +8 -2
  19. data/vendor/scs/include/scs.h +257 -141
  20. data/vendor/scs/include/scs_types.h +34 -0
  21. data/vendor/scs/include/scs_work.h +83 -0
  22. data/vendor/scs/include/util.h +3 -15
  23. data/vendor/scs/linsys/cpu/direct/private.c +241 -232
  24. data/vendor/scs/linsys/cpu/direct/private.h +13 -7
  25. data/vendor/scs/linsys/cpu/indirect/private.c +194 -118
  26. data/vendor/scs/linsys/cpu/indirect/private.h +7 -4
  27. data/vendor/scs/linsys/csparse.c +87 -0
  28. data/vendor/scs/linsys/csparse.h +34 -0
  29. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +6 -6
  30. data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +6 -1
  31. data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
  32. data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
  33. data/vendor/scs/linsys/external/qdldl/changes +2 -0
  34. data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
  35. data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
  36. data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
  37. data/vendor/scs/linsys/gpu/gpu.c +58 -21
  38. data/vendor/scs/linsys/gpu/gpu.h +70 -35
  39. data/vendor/scs/linsys/gpu/indirect/private.c +394 -157
  40. data/vendor/scs/linsys/gpu/indirect/private.h +27 -12
  41. data/vendor/scs/linsys/scs_matrix.c +478 -0
  42. data/vendor/scs/linsys/scs_matrix.h +70 -0
  43. data/vendor/scs/scs.mk +14 -10
  44. data/vendor/scs/src/aa.c +394 -110
  45. data/vendor/scs/src/cones.c +497 -359
  46. data/vendor/scs/src/ctrlc.c +15 -5
  47. data/vendor/scs/src/linalg.c +107 -26
  48. data/vendor/scs/src/normalize.c +30 -72
  49. data/vendor/scs/src/rw.c +202 -27
  50. data/vendor/scs/src/scs.c +769 -571
  51. data/vendor/scs/src/scs_version.c +11 -3
  52. data/vendor/scs/src/util.c +37 -106
  53. data/vendor/scs/test/minunit.h +22 -8
  54. data/vendor/scs/test/problem_utils.h +180 -25
  55. data/vendor/scs/test/problems/degenerate.h +130 -0
  56. data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
  57. data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
  58. data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
  59. data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
  60. data/vendor/scs/test/problems/random_prob +0 -0
  61. data/vendor/scs/test/problems/random_prob.h +45 -0
  62. data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
  63. data/vendor/scs/test/problems/small_lp.h +14 -13
  64. data/vendor/scs/test/problems/small_qp.h +352 -0
  65. data/vendor/scs/test/problems/test_validation.h +43 -0
  66. data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
  67. data/vendor/scs/test/random_socp_prob.c +54 -53
  68. data/vendor/scs/test/rng.h +109 -0
  69. data/vendor/scs/test/run_from_file.c +20 -11
  70. data/vendor/scs/test/run_tests.c +35 -2
  71. metadata +29 -98
  72. data/vendor/scs/linsys/amatrix.c +0 -305
  73. data/vendor/scs/linsys/amatrix.h +0 -36
  74. data/vendor/scs/linsys/amatrix.o +0 -0
  75. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  76. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  77. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  78. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  79. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  80. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  81. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  82. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  83. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  84. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  85. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  86. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  87. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  88. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  89. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  90. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  91. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  92. data/vendor/scs/src/aa.o +0 -0
  93. data/vendor/scs/src/cones.o +0 -0
  94. data/vendor/scs/src/ctrlc.o +0 -0
  95. data/vendor/scs/src/linalg.o +0 -0
  96. data/vendor/scs/src/normalize.o +0 -0
  97. data/vendor/scs/src/rw.o +0 -0
  98. data/vendor/scs/src/scs.o +0 -0
  99. data/vendor/scs/src/scs_version.o +0 -0
  100. data/vendor/scs/src/util.o +0 -0
  101. data/vendor/scs/test/data/small_random_socp +0 -0
  102. data/vendor/scs/test/problems/small_random_socp.h +0 -33
  103. data/vendor/scs/test/run_tests +0 -2
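The hunks below appear to come from the GPU indirect linear-system backend (entry 39 above): the old normal-equations solve against (RHO_X * I + A'A) is replaced by a diagonally preconditioned CG applied to R_x + P + A' R_y^{-1} A, using the cuSPARSE generic API (cusparseCreateCsr / cusparseCreateDnVec). As a rough CPU-side sketch of the Jacobi preconditioner that the new set_preconditioner builds — names such as build_diag_precond, Ap, Ai, Ax and p_diag are illustrative only, not part of the SCS API:

/* Sketch: M = inv(diag(R_x + P + A' R_y^{-1} A)) for an m x n matrix A in
 * compressed sparse column (CSC) form. Ap has n+1 entries, Ai/Ax have Ap[n]
 * entries; diag_r holds R_x (n entries) followed by R_y (m entries);
 * p_diag holds the diagonal of P, or is NULL when there is no P term. */
static void build_diag_precond(int n, const int *Ap, const int *Ai,
                               const double *Ax, const double *diag_r,
                               const double *p_diag, double *M) {
  int i, k;
  for (i = 0; i < n; ++i) {                   /* loop over columns a_i of A */
    double mii = diag_r[i];                   /* (R_x)_i */
    if (p_diag) {
      mii += p_diag[i];                       /* + P_ii */
    }
    for (k = Ap[i]; k < Ap[i + 1]; ++k) {
      double aik = Ax[k];                     /* entry in row Ai[k] of column i */
      mii += aik * aik / diag_r[n + Ai[k]];   /* + a_i' R_y^{-1} a_i */
    }
    M[i] = 1.0 / mii;                         /* store the inverted diagonal */
  }
}

The vendored code does the same column loop over the CSC storage of A, picks P_ii out of the upper-triangular P, and copies the inverted diagonal to the GPU with cudaMemcpy.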
@@ -1,61 +1,115 @@
  #include "private.h"
+ #include "linsys.h"

- #define CG_BEST_TOL 1e-9
- #define CG_MIN_TOL 1e-1
-
- /* do not use within pcg, reuses memory */
- void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
- const scs_float *x, scs_float *y) {
- scs_float *v_m = p->tmp_m;
- scs_float *v_n = p->r;
- cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
- cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
- SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
- cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
- }
+ /* norm to use when deciding convergence */
+ /* should be consistent with CG_NORM in glbopts.h */
+ #define USE_L2_NORM (0)

- /* do not use within pcg, reuses memory */
- void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
- scs_float *y) {
- scs_float *v_m = p->tmp_m;
- scs_float *v_n = p->r;
- cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
- cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
- #if GPU_TRANSPOSE_MAT > 0
- SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
+ static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
+ scs_int n) {
+ #if USE_L2_NORM > 0
+ scs_float nrm;
+ CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
  #else
- SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
+ scs_int idx;
+ scs_float nrm;
+ CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
+ /* NOTE: we take idx -1 here since the routine above returns Fortran idxs */
+ cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
+ nrm = ABS(nrm);
  #endif
- cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ return nrm;
  }

- char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
- char *str = (char *)scs_malloc(sizeof(char) * 128);
- sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
- (long)A->p[A->n], stgs->cg_rate);
- return str;
+ const char *SCS(get_lin_sys_method)() {
+ return "sparse-indirect GPU";
  }

+ /*
  char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
  char *str = (char *)scs_malloc(sizeof(char) * 128);
- sprintf(str,
- "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
- (scs_float)p->tot_cg_its / (info->iter + 1),
- p->total_solve_time / (info->iter + 1) / 1e3);
+ sprintf(str, "lin-sys: avg cg its: %2.2f\n",
+ (scs_float)p->tot_cg_its / (info->iter + 1));
  p->tot_cg_its = 0;
- p->total_solve_time = 0;
  return str;
  }
+ */
+
+ /* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
+ /* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
+ static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
+ scs_int i, k;
+ const ScsMatrix *A = p->A;
+ const ScsMatrix *P = p->P;
+ scs_float *M = p->M;
+
+ #if VERBOSITY > 0
+ scs_printf("getting pre-conditioner\n");
+ #endif
+
+ /* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
+ for (i = 0; i < A->n; ++i) { /* cols */
+ /* M_ii = (R_x)_i */
+ M[i] = diag_r[i];
+ /* M_ii += a_i' (R_y)^-1 a_i */
+ for (k = A->p[i]; k < A->p[i + 1]; ++k) {
+ /* A->i[k] is row of entry k with value A->x[k] */
+ M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
+ }
+ if (P) {
+ for (k = P->p[i]; k < P->p[i + 1]; k++) {
+ /* diagonal element only */
+ if (P->i[k] == i) { /* row == col */
+ /* M_ii += P_ii */
+ M[i] += P->x[k];
+ break;
+ }
+ }
+ }
+ /* finally invert for pre-conditioner */
+ M[i] = 1. / M[i];
+ }
+ cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+ #if VERBOSITY > 0
+ scs_printf("finished getting pre-conditioner\n");
+ #endif
+ }
+
+ /* no need to update anything in this case */
+ void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
+ scs_int i;
+
+ /* R_x to gpu */
+ cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
+ cudaMemcpyHostToDevice);
+
+ /* 1/R_y to gpu */
+ for (i = 0; i < p->m; ++i)
+ p->inv_r_y[i] = 1. / diag_r[p->n + i];
+ cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
+ cudaMemcpyHostToDevice);
+
+ /* set preconditioner M on gpu */
+ set_preconditioner(p, diag_r);
+ }

  void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
  if (p) {
+ scs_free(p->M);
+ scs_free(p->inv_r_y);
  cudaFree(p->p);
  cudaFree(p->r);
  cudaFree(p->Gp);
  cudaFree(p->bg);
  cudaFree(p->tmp_m);
  cudaFree(p->z);
- cudaFree(p->M);
+ cudaFree(p->M_gpu);
+ cudaFree(p->r_x_gpu);
+ cudaFree(p->inv_r_y_gpu);
+ if (p->Pg) {
+ SCS(free_gpu_matrix)(p->Pg);
+ scs_free(p->Pg);
+ }
  if (p->Ag) {
  SCS(free_gpu_matrix)(p->Ag);
  scs_free(p->Ag);
@@ -64,6 +118,12 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
  SCS(free_gpu_matrix)(p->Agt);
  scs_free(p->Agt);
  }
+ if (p->buffer != SCS_NULL) {
+ cudaFree(p->buffer);
+ }
+ cusparseDestroyDnVec(p->dn_vec_m);
+ cusparseDestroyDnVec(p->dn_vec_n);
+ cusparseDestroyDnVec(p->dn_vec_n_p);
  cusparseDestroy(p->cusparse_handle);
  cublasDestroy(p->cublas_handle);
  /* Don't reset because it interferes with other GPU programs. */
@@ -72,53 +132,127 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
  }
  }

- /*y = (RHO_X * I + A'A)x */
- static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
- ScsLinSysWork *p, const scs_float *x, scs_float *y) {
- /* x and y MUST already be loaded to GPU */
- scs_float *tmp_m = p->tmp_m; /* temp memory */
- cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
- SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
- cudaMemset(y, 0, A->n * sizeof(scs_float));
- SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
- CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
+ /* z = M * z elementwise in place, assumes M, z on GPU */
+ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
+ scs_float *z, scs_int n) {
+ CUBLAS(tbmv)
+ (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
+ 0, M, 1, z, 1);
  }

- /* M = inv ( diag ( RHO_X * I + A'A ) ) */
- static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
- ScsLinSysWork *p) {
- scs_int i;
- scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
+ /* y = (R_x + P + A' R_y^{-1} A) x */
+ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
+ /* x and y MUST already be loaded to GPU */
+ scs_float *z = p->tmp_m; /* temp memory */
+ cudaMemset(z, 0, p->m * sizeof(scs_float));
+
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
+ cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
+
+ /* y = x */
+ cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+ /* y = R_x * x */
+ scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
+
+ if (p->Pg) {
+ /* y = R_x * x + P x */
+ SCS(accum_by_p_gpu)
+ (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
+ }

- #if EXTRA_VERBOSE > 0
- scs_printf("getting pre-conditioner\n");
+ /* z = Ax */
+ #if GPU_TRANSPOSE_MAT > 0
+ SCS(accum_by_atrans_gpu)
+ (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
+ #else
+ SCS(accum_by_a_gpu)
+ (p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
  #endif
+ /* z = R_y^{-1} A x */
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);

- for (i = 0; i < A->n; ++i) {
- M[i] = 1 / (stgs->rho_x +
- SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
- /* M[i] = 1; */
- }
- cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
- scs_free(M);
+ /* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
+ SCS(accum_by_atrans_gpu)
+ (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
+ }

- #if EXTRA_VERBOSE > 0
- scs_printf("finished getting pre-conditioner\n");
- #endif
+ /* P comes in upper triangular, expand to full
+ * First compute triplet version of full matrix, then compress to csc
+ * */
+ static csc *fill_p_matrix(const ScsMatrix *P) {
+ scs_int i, j, k, kk;
+ scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
+ csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
+ csc *P_full;
+ kk = 0;
+ for (j = 0; j < P->n; j++) { /* cols */
+ for (k = P->p[j]; k < P->p[j + 1]; k++) {
+ i = P->i[k]; /* row */
+ if (i > j) { /* only upper triangular needed */
+ break;
+ }
+ P_tmp->i[kk] = i;
+ P_tmp->p[kk] = j;
+ P_tmp->x[kk] = P->x[k];
+ kk++;
+ if (i == j) { /* diagonal */
+ continue;
+ }
+ P_tmp->i[kk] = j;
+ P_tmp->p[kk] = i;
+ P_tmp->x[kk] = P->x[k];
+ kk++;
+ }
+ }
+ P_tmp->nz = kk; /* set number of nonzeros */
+ P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
+ SCS(cs_spfree)(P_tmp);
+ return P_full;
  }

- ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
- const ScsSettings *stgs) {
+ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
+ const scs_float *diag_r) {
  cudaError_t err;
- ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
- ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+ csc *P_full;
+ ScsLinSysWork *p = SCS_NULL;
+ ScsGpuMatrix *Ag = SCS_NULL;
+ ScsGpuMatrix *Pg = SCS_NULL;
+ int device_count;
+
+ err = cudaGetDeviceCount(&device_count);
+ if (err > 0) {
+ scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
+ return SCS_NULL;
+ }
+
+ p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
+ Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+
+ p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
+ p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
+
+ p->A = A;
+ p->P = P;
+ p->m = A->m;
+ p->n = A->n;
+
+ #if GPU_TRANSPOSE_MAT > 0
+ size_t new_buffer_size = 0;
+ #endif

  p->cublas_handle = 0;
  p->cusparse_handle = 0;

- p->total_solve_time = 0;
  p->tot_cg_its = 0;

+ p->buffer_size = 0;
+ p->buffer = SCS_NULL;
+
  /* Get handle to the CUBLAS context */
  cublasCreate(&p->cublas_handle);

@@ -127,15 +261,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,

  Ag->n = A->n;
  Ag->m = A->m;
- Ag->Annz = A->p[A->n];
+ Ag->nnz = A->p[A->n];
  Ag->descr = 0;
- /* Matrix description */
- cusparseCreateMatDescr(&Ag->descr);
- cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
- cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
- p->Ag = Ag;
- p->Agt = SCS_NULL;
-
  cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
  cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
  cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
@@ -144,10 +271,11 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
  cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
  cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
  cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
- cudaMalloc((void **)&p->tmp_m,
- A->m * sizeof(scs_float)); /* intermediate result */
+ cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
  cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
- cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
+ cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
+ cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
+ cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));

  cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
  cudaMemcpyHostToDevice);
@@ -155,32 +283,89 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
  cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
  cudaMemcpyHostToDevice);

- get_preconditioner(A, stgs, p);
+ cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+
+ if (P) {
+ Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
+ P_full = fill_p_matrix(P);
+ Pg->n = P_full->n;
+ Pg->m = P_full->m;
+ Pg->nnz = P_full->p[P_full->n];
+ Pg->descr = 0;
+ cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
+ cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
+ cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
+
+ cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
+ cudaMemcpyHostToDevice);
+ cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
+ cudaMemcpyHostToDevice);
+ cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
+ cudaMemcpyHostToDevice);
+
+ cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
+
+ SCS(cs_spfree)(P_full);
+ } else {
+ Pg = SCS_NULL;
+ }
+
+ p->Ag = Ag;
+ p->Pg = Pg;
+ p->Agt = SCS_NULL;
+
+ /* we initialize with tmp_m but always overwrite it so it doesn't matter */
+ cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
+ cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
+ cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
+
+ /* Form preconditioner and copy R_x, 1/R_y to gpu */
+ SCS(update_lin_sys_diag_r)(p, diag_r);

  #if GPU_TRANSPOSE_MAT > 0
  p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
  p->Agt->n = A->m;
  p->Agt->m = A->n;
- p->Agt->Annz = A->p[A->n];
+ p->Agt->nnz = A->p[A->n];
  p->Agt->descr = 0;
  /* Matrix description */
- cusparseCreateMatDescr(&p->Agt->descr);
- cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
- cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);

  cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
  cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
  cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
  /* transpose Ag into Agt for faster multiplies */
  /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
- CUSPARSE(csr2csc)
- (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
- p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
+ cusparseCsr2cscEx2_bufferSize(
+ p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
+ p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);
+
+ if (new_buffer_size > p->buffer_size) {
+ if (p->buffer != SCS_NULL) {
+ cudaFree(p->buffer);
+ }
+ cudaMalloc(&p->buffer, new_buffer_size);
+ p->buffer_size = new_buffer_size;
+ }
+
+ cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
+ Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
+ CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
+ SCS_CSR2CSC_ALG, p->buffer);
+
+ cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
+ p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
+ SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
+ SCS_CUDA_FLOAT);
  #endif

  err = cudaGetLastError();
  if (err != cudaSuccess) {
- printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
+ printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
  cudaGetErrorString(err));
  SCS(free_lin_sys_work)(p);
  return SCS_NULL;
@@ -188,117 +373,169 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
  return p;
  }

- static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
- scs_float *z, scs_float *r, scs_int n) {
- cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
- CUBLAS(tbmv)
- (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
- 0, M, 1, z, 1);
- }
-
- /* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
- static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
- ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
+ /* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
+ * b, on GPU */
+ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
  scs_int max_its, scs_float tol) {
- scs_int i, n = A->n;
- scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
+ scs_int i, n = pr->n;
+ scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
  scs_float onef = 1.0, neg_onef = -1.0;
  scs_float *p = pr->p; /* cg direction */
  scs_float *Gp = pr->Gp; /* updated CG direction */
  scs_float *r = pr->r; /* cg residual */
  scs_float *z = pr->z; /* preconditioned */
- scs_float *M = pr->M; /* preconditioner */
  cublasHandle_t cublas_handle = pr->cublas_handle;

- if (s == SCS_NULL) {
+ if (!s) {
+ /* take s = 0 */
+ /* r = b */
  cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+ /* b = 0 */
  cudaMemset(bg, 0, n * sizeof(scs_float));
  } else {
  /* p contains bg temporarily */
  cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
- /* bg contains s */
+ /* bg = s */
  cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
- mat_vec(A, stgs, pr, bg, r);
+ /* r = Mat * s */
+ mat_vec(pr, bg, r);
+ /* r = Mat * s - b */
  CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
+ /* r = b - Mat * s */
  CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
  }

- /* for some reason nrm2 is VERY slow */
- /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
- CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
- nrm_r = SQRTF(nrm_r);
  /* check to see if we need to run CG at all */
- if (nrm_r < MIN(tol, 1e-18)) {
+ if (cg_gpu_norm(cublas_handle, r, n) < tol) {
  return 0;
  }

- apply_pre_conditioner(cublas_handle, M, z, r, n);
- CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
- /* put z in p, replacing temp mem */
+ /* z = M r */
+ cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
+ /* ztr = z'r */
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
+ /* p = z */
  cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);

  for (i = 0; i < max_its; ++i) {
- mat_vec(A, stgs, pr, p, Gp);
-
- CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
-
- alpha = ipzr / p_gp;
+ /* Gp = Mat * p */
+ mat_vec(pr, p, Gp);
+ /* ptGp = p'Gp */
+ CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
+ /* alpha = z'r / p'G p */
+ alpha = ztr / ptGp;
  neg_alpha = -alpha;
-
+ /* b += alpha * p */
  CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
+ /* r -= alpha * G p */
  CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);

- /* for some reason nrm2 is VERY slow */
- /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
- CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
- nrm_r = SQRTF(nrm_r);
- if (nrm_r < tol) {
- i++;
- break;
- }
- ipzr_old = ipzr;
- apply_pre_conditioner(cublas_handle, M, z, r, n);
- CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+ #if VERBOSITY > 3
+ scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
+ cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
+ #endif

- beta = ipzr / ipzr_old;
+ if (cg_gpu_norm(cublas_handle, r, n) < tol) {
+ return i + 1;
+ }
+ /* z = M r */
+ cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
+ ztr_prev = ztr;
+ /* ztr = z'r */
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
+ beta = ztr / ztr_prev;
+ /* p = beta * p, where beta = ztr / ztr_prev */
  CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
+ /* p = z + beta * p */
  CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
  }
- #if EXTRA_VERBOSE > 0
- scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
- #endif
  return i;
  }

- scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
- ScsLinSysWork *p, scs_float *b, const scs_float *s,
- scs_int iter) {
- scs_int cg_its;
- SCS(timer) linsys_timer;
- scs_float *bg = p->bg;
+ /* solves Mx = b, for x but stores result in b */
+ /* s contains warm-start (if available) */
+ /*
+ * [x] = [ R_x + P   A' ]^{-1} [rx]
+ * [y]   [ A       -R_y ]      [ry]
+ *
+ * becomes:
+ *
+ * x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
+ * y = R_y^{-1} (Ax - ry)
+ *
+ */
+ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
+ scs_float tol) {
+ scs_int cg_its, max_iters;
  scs_float neg_onef = -1.0;
+
+ /* these are on GPU */
+ scs_float *bg = p->bg;
+ scs_float *tmp_m = p->tmp_m;
  ScsGpuMatrix *Ag = p->Ag;
- scs_float cg_tol =
- SCS(norm)(b, Ag->n) *
- (iter < 0 ? CG_BEST_TOL
- : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
- SCS(tic)(&linsys_timer);
- /* all on GPU */
- cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
- SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
- /* solves (I+A'A)x = b, s warm start, solution stored in b */
- cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
- CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
- SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
- cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);

- if (iter >= 0) {
- p->tot_cg_its += cg_its;
+ if (CG_NORM(b, p->n + p->m) <= 1e-12) {
+ memset(b, 0, (p->n + p->m) * sizeof(scs_float));
+ return 0;
+ }
+
+ if (tol <= 0.) {
+ scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
+ "INDIRECT flag.\n",
+ tol);
  }

- p->total_solve_time += SCS(tocq)(&linsys_timer);
- #if EXTRAVERBOSE > 0
- scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
+ /* bg = b = [rx; ry] */
+ cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
+ cudaMemcpyHostToDevice);
+ /* tmp = ry */
+ cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
+ cudaMemcpyDeviceToDevice);
+ /* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
+
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
+ /* bg[:n] = rx + A' R ry */
+ SCS(accum_by_atrans_gpu)
+ (Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
+
+ /* set max_iters to 10 * n (though in theory n is enough for any tol) */
+ max_iters = 10 * Ag->n;
+
+ /* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
+ * in bg */
+ cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
+
+ /* bg[n:] = -ry */
+ CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* x */
+
+ /* b[n:] = Ax - ry */
+ #if GPU_TRANSPOSE_MAT > 0
+ SCS(accum_by_atrans_gpu)
+ (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
+ #else
+ SCS(accum_by_a_gpu)
+ (Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
+ &p->buffer);
+ #endif
+
+ /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
+
+ /* copy bg = [x; y] back to b */
+ cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
+ cudaMemcpyDeviceToHost);
+ p->tot_cg_its += cg_its;
+ #if VERBOSITY > 1
+ scs_printf("tol %.3e\n", tol);
+ scs_printf("cg_its %i\n", (int)cg_its);
  #endif
  return 0;
  }
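The comment added above SCS(solve_lin_sys) quotes the reduced formulas without the intermediate step; for reference, a short derivation of the same identity in LaTeX, assuming R_x and R_y are diagonal with positive entries:

\begin{aligned}
\begin{bmatrix} R_x + P & A^\top \\ A & -R_y \end{bmatrix}
\begin{bmatrix} x \\ y \end{bmatrix}
&=
\begin{bmatrix} r_x \\ r_y \end{bmatrix} \\
A x - R_y y = r_y \;&\Longrightarrow\; y = R_y^{-1}\,(A x - r_y), \\
(R_x + P)\,x + A^\top y = r_x \;&\Longrightarrow\;
\bigl(R_x + P + A^\top R_y^{-1} A\bigr)\,x = r_x + A^\top R_y^{-1} r_y.
\end{aligned}

The reduced matrix is symmetric positive definite whenever R_x, R_y have positive entries and P is positive semidefinite, which is what allows the preconditioned CG loop in pcg to be applied; it is the same operator that mat_vec applies and whose diagonal set_preconditioner inverts.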