scs 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +11 -6
  4. data/lib/scs/ffi.rb +30 -13
  5. data/lib/scs/solver.rb +32 -9
  6. data/lib/scs/version.rb +1 -1
  7. data/vendor/scs/CITATION.cff +39 -0
  8. data/vendor/scs/CMakeLists.txt +7 -8
  9. data/vendor/scs/Makefile +24 -15
  10. data/vendor/scs/README.md +5 -263
  11. data/vendor/scs/include/aa.h +67 -23
  12. data/vendor/scs/include/cones.h +17 -17
  13. data/vendor/scs/include/glbopts.h +98 -32
  14. data/vendor/scs/include/linalg.h +2 -4
  15. data/vendor/scs/include/linsys.h +58 -44
  16. data/vendor/scs/include/normalize.h +3 -3
  17. data/vendor/scs/include/rw.h +8 -2
  18. data/vendor/scs/include/scs.h +293 -133
  19. data/vendor/scs/include/util.h +3 -15
  20. data/vendor/scs/linsys/cpu/direct/private.c +220 -224
  21. data/vendor/scs/linsys/cpu/direct/private.h +13 -7
  22. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  23. data/vendor/scs/linsys/cpu/indirect/private.c +177 -110
  24. data/vendor/scs/linsys/cpu/indirect/private.h +8 -4
  25. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  26. data/vendor/scs/linsys/csparse.c +87 -0
  27. data/vendor/scs/linsys/csparse.h +34 -0
  28. data/vendor/scs/linsys/csparse.o +0 -0
  29. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +1 -1
  30. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  31. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  32. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  33. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  34. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  35. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  36. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  37. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  38. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  39. data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
  40. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  41. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  42. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  43. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  44. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  45. data/vendor/scs/linsys/external/qdldl/changes +2 -0
  46. data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
  47. data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
  48. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  49. data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
  50. data/vendor/scs/linsys/gpu/gpu.c +31 -33
  51. data/vendor/scs/linsys/gpu/gpu.h +48 -31
  52. data/vendor/scs/linsys/gpu/indirect/private.c +338 -232
  53. data/vendor/scs/linsys/gpu/indirect/private.h +23 -14
  54. data/vendor/scs/linsys/scs_matrix.c +498 -0
  55. data/vendor/scs/linsys/scs_matrix.h +70 -0
  56. data/vendor/scs/linsys/scs_matrix.o +0 -0
  57. data/vendor/scs/scs.mk +13 -9
  58. data/vendor/scs/src/aa.c +384 -109
  59. data/vendor/scs/src/aa.o +0 -0
  60. data/vendor/scs/src/cones.c +440 -353
  61. data/vendor/scs/src/cones.o +0 -0
  62. data/vendor/scs/src/ctrlc.c +15 -5
  63. data/vendor/scs/src/ctrlc.o +0 -0
  64. data/vendor/scs/src/linalg.c +84 -28
  65. data/vendor/scs/src/linalg.o +0 -0
  66. data/vendor/scs/src/normalize.c +22 -64
  67. data/vendor/scs/src/normalize.o +0 -0
  68. data/vendor/scs/src/rw.c +160 -21
  69. data/vendor/scs/src/rw.o +0 -0
  70. data/vendor/scs/src/scs.c +767 -563
  71. data/vendor/scs/src/scs.o +0 -0
  72. data/vendor/scs/src/scs_indir.o +0 -0
  73. data/vendor/scs/src/scs_version.c +9 -3
  74. data/vendor/scs/src/scs_version.o +0 -0
  75. data/vendor/scs/src/util.c +37 -106
  76. data/vendor/scs/src/util.o +0 -0
  77. data/vendor/scs/test/minunit.h +17 -8
  78. data/vendor/scs/test/problem_utils.h +176 -14
  79. data/vendor/scs/test/problems/degenerate.h +130 -0
  80. data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
  81. data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
  82. data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
  83. data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
  84. data/vendor/scs/test/problems/random_prob +0 -0
  85. data/vendor/scs/test/problems/random_prob.h +45 -0
  86. data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
  87. data/vendor/scs/test/problems/small_lp.h +13 -14
  88. data/vendor/scs/test/problems/test_fails.h +43 -0
  89. data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
  90. data/vendor/scs/test/random_socp_prob.c +54 -53
  91. data/vendor/scs/test/rng.h +109 -0
  92. data/vendor/scs/test/run_from_file.c +19 -10
  93. data/vendor/scs/test/run_tests.c +27 -3
  94. metadata +20 -8
  95. data/vendor/scs/linsys/amatrix.c +0 -305
  96. data/vendor/scs/linsys/amatrix.h +0 -36
  97. data/vendor/scs/linsys/amatrix.o +0 -0
  98. data/vendor/scs/test/data/small_random_socp +0 -0
  99. data/vendor/scs/test/problems/small_random_socp.h +0 -33
  100. data/vendor/scs/test/run_tests +0 -2
@@ -1,71 +1,89 @@
1
1
  #include "private.h"
2
+ #include "linsys.h"
2
3
 
3
- #define CG_BEST_TOL 1e-9
4
- #define CG_MIN_TOL 1e-1
5
-
6
- /* do not use within pcg, reuses memory */
7
- void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
8
- const scs_float *x, scs_float *y) {
9
- scs_float *v_m = p->tmp_m;
10
- scs_float *v_n = p->r;
11
- cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
12
- cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
13
-
14
- cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
15
- cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
16
- SCS(_accum_by_atrans_gpu)(
17
- p->Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
18
- &p->buffer_size, &p->buffer
19
- );
20
-
21
- cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
22
- }
23
-
24
- /* do not use within pcg, reuses memory */
25
- void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
26
- scs_float *y) {
27
- scs_float *v_m = p->tmp_m;
28
- scs_float *v_n = p->r;
29
- cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
30
- cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
4
+ /* norm to use when deciding convergence */
5
+ /* should be consistent with CG_NORM in glbopts.h */
6
+ #define USE_L2_NORM (0)
31
7
 
32
- cusparseDnVecSetValues(p->dn_vec_m, (void *) v_m);
33
- cusparseDnVecSetValues(p->dn_vec_n, (void *) v_n);
34
- #if GPU_TRANSPOSE_MAT > 0
35
- SCS(_accum_by_atrans_gpu)(
36
- p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
37
- &p->buffer_size, &p->buffer
38
- );
8
+ static scs_float cg_gpu_norm(cublasHandle_t cublas_handle, scs_float *r,
9
+ scs_int n) {
10
+ #if USE_L2_NORM > 0
11
+ scs_float nrm;
12
+ CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm);
39
13
  #else
40
- SCS(_accum_by_a_gpu)(
41
- p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
42
- &p->buffer_size, &p->buffer
43
- );
14
+ scs_int idx;
15
+ scs_float nrm;
16
+ CUBLASI(amax)(cublas_handle, n, r, 1, &idx);
17
+ /* NOTE: we take idx -1 here since the routine above returns Fortran idxs */
18
+ cudaMemcpy(&nrm, &(r[idx - 1]), sizeof(scs_float), cudaMemcpyDeviceToHost);
19
+ nrm = ABS(nrm);
44
20
  #endif
45
-
46
- cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
21
+ return nrm;
47
22
  }
48
23
 
49
- char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
50
- char *str = (char *)scs_malloc(sizeof(char) * 128);
51
- sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
52
- (long)A->p[A->n], stgs->cg_rate);
53
- return str;
24
+ const char *SCS(get_lin_sys_method)() {
25
+ return "sparse-indirect GPU";
54
26
  }
55
27
 
28
+ /*
56
29
  char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
57
30
  char *str = (char *)scs_malloc(sizeof(char) * 128);
58
- sprintf(str,
59
- "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
60
- (scs_float)p->tot_cg_its / (info->iter + 1),
61
- p->total_solve_time / (info->iter + 1) / 1e3);
31
+ sprintf(str, "lin-sys: avg cg its: %2.2f\n",
32
+ (scs_float)p->tot_cg_its / (info->iter + 1));
62
33
  p->tot_cg_its = 0;
63
- p->total_solve_time = 0;
64
34
  return str;
65
35
  }
36
+ */
37
+
38
+ /* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
39
+ static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
40
+ scs_int i, k;
41
+ const ScsMatrix *A = p->A;
42
+ const ScsMatrix *P = p->P;
43
+ scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
44
+
45
+ #if VERBOSITY > 0
46
+ scs_printf("getting pre-conditioner\n");
47
+ #endif
48
+
49
+ for (i = 0; i < A->n; ++i) { /* cols */
50
+ M[i] = p->rho_x;
51
+ /* diag(A' R_y^{-1} A) */
52
+ for (k = A->p[i]; k < A->p[i + 1]; ++k) {
53
+ /* A->i[k] is row of entry k with value A->x[k] */
54
+ M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
55
+ }
56
+ if (P) {
57
+ for (k = P->p[i]; k < P->p[i + 1]; k++) {
58
+ /* diagonal element only */
59
+ if (P->i[k] == i) { /* row == col */
60
+ M[i] += P->x[k];
61
+ break;
62
+ }
63
+ }
64
+ }
65
+ M[i] = 1. / M[i];
66
+ }
67
+ cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
68
+ scs_free(M);
69
+ #if VERBOSITY > 0
70
+ scs_printf("finished getting pre-conditioner\n");
71
+ #endif
72
+ }
73
+
74
+ /* recompute R_y^{-1} on host and GPU, then rebuild the preconditioner */
75
+ void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
76
+ scs_int i;
77
+ for (i = 0; i < p->m; ++i)
78
+ p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
79
+ cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
80
+ cudaMemcpyHostToDevice);
81
+ set_preconditioner(p, rho_y_vec);
82
+ }
66
83
 
67
84
  void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
68
85
  if (p) {
86
+ scs_free(p->inv_rho_y_vec);
69
87
  cudaFree(p->p);
70
88
  cudaFree(p->r);
71
89
  cudaFree(p->Gp);
@@ -73,6 +91,11 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
73
91
  cudaFree(p->tmp_m);
74
92
  cudaFree(p->z);
75
93
  cudaFree(p->M);
94
+ cudaFree(p->inv_rho_y_vec_gpu);
95
+ if (p->Pg) {
96
+ SCS(free_gpu_matrix)(p->Pg);
97
+ scs_free(p->Pg);
98
+ }
76
99
  if (p->Ag) {
77
100
  SCS(free_gpu_matrix)(p->Ag);
78
101
  scs_free(p->Ag);
@@ -86,6 +109,7 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
86
109
  }
87
110
  cusparseDestroyDnVec(p->dn_vec_m);
88
111
  cusparseDestroyDnVec(p->dn_vec_n);
112
+ cusparseDestroyDnVec(p->dn_vec_n_p);
89
113
  cusparseDestroy(p->cusparse_handle);
90
114
  cublasDestroy(p->cublas_handle);
91
115
  /* Don't reset because it interferes with other GPU programs. */
@@ -94,80 +118,105 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
94
118
  }
95
119
  }
96
120
 
97
- /*y = (RHO_X * I + A'A)x */
98
- static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
99
- ScsLinSysWork *p, const scs_float *x, scs_float *y) {
121
+ /* z = M * z elementwise in place, assumes M, z on GPU */
122
+ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
123
+ scs_float *z, scs_int n) {
124
+ CUBLAS(tbmv)
125
+ (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
126
+ 0, M, 1, z, 1);
127
+ }
128
+
129
+ /* y = (rho_x * I + P + A' R_y^{-1} A) x */
130
+ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
100
131
  /* x and y MUST already be loaded to GPU */
101
- scs_float *tmp_m = p->tmp_m; /* temp memory */
102
- cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
132
+ scs_float *z = p->tmp_m; /* temp memory */
133
+ cudaMemset(y, 0, p->n * sizeof(scs_float));
134
+ cudaMemset(z, 0, p->m * sizeof(scs_float));
135
+
136
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
137
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
138
+ cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
139
+
140
+ /* y = rho_x * x */
141
+ CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
142
+
143
+ if (p->Pg) {
144
+ /* y = rho_x * x + Px */
145
+ SCS(accum_by_p_gpu)
146
+ (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
147
+ &p->buffer);
148
+ }
103
149
 
104
- cusparseDnVecSetValues(p->dn_vec_m, (void *) tmp_m);
105
- cusparseDnVecSetValues(p->dn_vec_n, (void *) x);
150
+ /* z = Ax */
106
151
  #if GPU_TRANSPOSE_MAT > 0
107
- SCS(_accum_by_atrans_gpu)(
108
- p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
109
- &p->buffer_size, &p->buffer
110
- );
152
+ SCS(accum_by_atrans_gpu)
153
+ (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
154
+ &p->buffer);
111
155
  #else
112
- SCS(_accum_by_a_gpu)(
113
- A, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
114
- &p->buffer_size, &p->buffer
115
- );
156
+ SCS(accum_by_a_gpu)
157
+ (p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
158
+ &p->buffer);
116
159
  #endif
160
+ /* z = R_y^{-1} A x */
161
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
117
162
 
118
- cudaMemset(y, 0, A->n * sizeof(scs_float));
119
-
120
- cusparseDnVecSetValues(p->dn_vec_m, (void *) tmp_m);
121
- cusparseDnVecSetValues(p->dn_vec_n, (void *) y);
122
- SCS(_accum_by_atrans_gpu)(
123
- A, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
124
- &p->buffer_size, &p->buffer
125
- );
126
-
127
- CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
163
+ /* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
164
+ SCS(accum_by_atrans_gpu)
165
+ (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
166
+ &p->buffer);
128
167
  }
129
168
 
130
- /* M = inv ( diag ( RHO_X * I + A'A ) ) */
131
- static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
132
- ScsLinSysWork *p) {
133
- scs_int i;
134
- scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
135
-
136
- #if EXTRA_VERBOSE > 0
137
- scs_printf("getting pre-conditioner\n");
138
- #endif
139
-
140
- for (i = 0; i < A->n; ++i) {
141
- M[i] = 1 / (stgs->rho_x +
142
- SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
143
- /* M[i] = 1; */
169
+ /* P comes in upper triangular, expand to full
170
+ * First compute triplet version of full matrix, then compress to csc
171
+ * */
172
+ static csc *fill_p_matrix(const ScsMatrix *P) {
173
+ scs_int i, j, k, kk;
174
+ scs_int Pnzmax = 2 * P->p[P->n]; /* upper bound */
175
+ csc *P_tmp = SCS(cs_spalloc)(P->n, P->n, Pnzmax, 1, 1);
176
+ csc *P_full;
177
+ kk = 0;
178
+ for (j = 0; j < P->n; j++) { /* cols */
179
+ for (k = P->p[j]; k < P->p[j + 1]; k++) {
180
+ i = P->i[k]; /* row */
181
+ if (i > j) { /* only upper triangular needed */
182
+ break;
183
+ }
184
+ P_tmp->i[kk] = i;
185
+ P_tmp->p[kk] = j;
186
+ P_tmp->x[kk] = P->x[k];
187
+ kk++;
188
+ if (i == j) { /* diagonal */
189
+ continue;
190
+ }
191
+ P_tmp->i[kk] = j;
192
+ P_tmp->p[kk] = i;
193
+ P_tmp->x[kk] = P->x[k];
194
+ kk++;
195
+ }
144
196
  }
145
- cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
146
- scs_free(M);
147
-
148
- #if EXTRA_VERBOSE > 0
149
- scs_printf("finished getting pre-conditioner\n");
150
- #endif
197
+ P_tmp->nz = kk; /* set number of nonzeros */
198
+ P_full = SCS(cs_compress)(P_tmp, SCS_NULL);
199
+ SCS(cs_spfree)(P_tmp);
200
+ return P_full;
151
201
  }
152
202
 
153
- ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
154
- const ScsSettings *stgs) {
203
+ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
204
+ scs_float *rho_y_vec, scs_float rho_x) {
155
205
  cudaError_t err;
206
+ scs_int i;
207
+ csc *P_full;
156
208
  ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
157
- ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
158
-
159
- /* Used for initializing dense vectors */
160
- scs_float *tmp_null_n = SCS_NULL;
161
- scs_float *tmp_null_m = SCS_NULL;
209
+ ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
210
+ ScsGpuMatrix *Pg = SCS_NULL;
162
211
 
163
212
  #if GPU_TRANSPOSE_MAT > 0
164
213
  size_t new_buffer_size = 0;
165
214
  #endif
166
215
 
216
+ p->rho_x = rho_x;
167
217
  p->cublas_handle = 0;
168
218
  p->cusparse_handle = 0;
169
219
 
170
- p->total_solve_time = 0;
171
220
  p->tot_cg_its = 0;
172
221
 
173
222
  p->buffer_size = 0;
@@ -181,13 +230,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
181
230
 
182
231
  Ag->n = A->n;
183
232
  Ag->m = A->m;
184
- Ag->Annz = A->p[A->n];
233
+ Ag->nnz = A->p[A->n];
185
234
  Ag->descr = 0;
186
- /* Matrix description */
187
-
188
- p->Ag = Ag;
189
- p->Agt = SCS_NULL;
190
-
191
235
  cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
192
236
  cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
193
237
  cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
@@ -196,10 +240,10 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
196
240
  cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
197
241
  cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
198
242
  cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
199
- cudaMalloc((void **)&p->tmp_m,
200
- A->m * sizeof(scs_float)); /* intermediate result */
243
+ cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
201
244
  cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
202
245
  cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
246
+ cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
203
247
 
204
248
  cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
205
249
  cudaMemcpyHostToDevice);
@@ -207,25 +251,59 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
207
251
  cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
208
252
  cudaMemcpyHostToDevice);
209
253
 
210
- cusparseCreateCsr
211
- (&Ag->descr, Ag->n, Ag->m, Ag->Annz, Ag->p, Ag->i, Ag->x,
212
- SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
213
- CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
254
+ p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
255
+ for (i = 0; i < A->m; ++i)
256
+ p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
257
+ cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
258
+ cudaMemcpyHostToDevice);
214
259
 
215
- cudaMalloc((void **)&tmp_null_n, A->n * sizeof(scs_float));
216
- cudaMalloc((void **)&tmp_null_m, A->m * sizeof(scs_float));
217
- cusparseCreateDnVec(&p->dn_vec_n, Ag->n, tmp_null_n, SCS_CUDA_FLOAT);
218
- cusparseCreateDnVec(&p->dn_vec_m, Ag->m, tmp_null_m, SCS_CUDA_FLOAT);
219
- cudaFree(tmp_null_n);
220
- cudaFree(tmp_null_m);
260
+ cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
261
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
262
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
263
+
264
+ if (P) {
265
+ Pg = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
266
+ P_full = fill_p_matrix(P);
267
+ Pg->n = P_full->n;
268
+ Pg->m = P_full->m;
269
+ Pg->nnz = P_full->p[P_full->n];
270
+ Pg->descr = 0;
271
+ cudaMalloc((void **)&Pg->i, (P_full->p[P_full->n]) * sizeof(scs_int));
272
+ cudaMalloc((void **)&Pg->p, (P_full->n + 1) * sizeof(scs_int));
273
+ cudaMalloc((void **)&Pg->x, (P_full->p[P_full->n]) * sizeof(scs_float));
274
+
275
+ cudaMemcpy(Pg->i, P_full->i, (P_full->p[P_full->n]) * sizeof(scs_int),
276
+ cudaMemcpyHostToDevice);
277
+ cudaMemcpy(Pg->p, P_full->p, (P_full->n + 1) * sizeof(scs_int),
278
+ cudaMemcpyHostToDevice);
279
+ cudaMemcpy(Pg->x, P_full->x, (P_full->p[P_full->n]) * sizeof(scs_float),
280
+ cudaMemcpyHostToDevice);
281
+
282
+ cusparseCreateCsr(&Pg->descr, Pg->n, Pg->m, Pg->nnz, Pg->p, Pg->i, Pg->x,
283
+ SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
284
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
285
+
286
+ SCS(cs_spfree)(P_full);
287
+ } else {
288
+ Pg = SCS_NULL;
289
+ }
221
290
 
222
- get_preconditioner(A, stgs, p);
291
+ p->Ag = Ag;
292
+ p->Pg = Pg;
293
+ p->Agt = SCS_NULL;
294
+
295
+ /* we initialize with tmp_m but always overwrite it so it doesn't matter */
296
+ cusparseCreateDnVec(&p->dn_vec_n, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
297
+ cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
298
+ cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
299
+
300
+ set_preconditioner(p, rho_y_vec);
223
301
 
224
302
  #if GPU_TRANSPOSE_MAT > 0
225
303
  p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
226
304
  p->Agt->n = A->m;
227
305
  p->Agt->m = A->n;
228
- p->Agt->Annz = A->p[A->n];
306
+ p->Agt->nnz = A->p[A->n];
229
307
  p->Agt->descr = 0;
230
308
  /* Matrix description */
231
309
 
@@ -234,13 +312,10 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
234
312
  cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
235
313
  /* transpose Ag into Agt for faster multiplies */
236
314
  /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
237
- cusparseCsr2cscEx2_bufferSize
238
- (p->cusparse_handle, A->n, A->m, A->p[A->n],
239
- Ag->x, Ag->p, Ag->i,
240
- p->Agt->x, p->Agt->p, p->Agt->i,
241
- SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
242
- CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
243
- &new_buffer_size);
315
+ cusparseCsr2cscEx2_bufferSize(
316
+ p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i,
317
+ p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
318
+ CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG, &new_buffer_size);
244
319
 
245
320
  if (new_buffer_size > p->buffer_size) {
246
321
  if (p->buffer != SCS_NULL) {
@@ -250,24 +325,20 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
250
325
  p->buffer_size = new_buffer_size;
251
326
  }
252
327
 
253
- cusparseCsr2cscEx2
254
- (p->cusparse_handle, A->n, A->m, A->p[A->n],
255
- Ag->x, Ag->p, Ag->i,
256
- p->Agt->x, p->Agt->p, p->Agt->i,
257
- SCS_CUDA_FLOAT, CUSPARSE_ACTION_NUMERIC,
258
- CUSPARSE_INDEX_BASE_ZERO, SCS_CSR2CSC_ALG,
259
- p->buffer);
260
-
261
- cusparseCreateCsr
262
- (&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->Annz,
263
- p->Agt->p, p->Agt->i, p->Agt->x,
264
- SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
265
- CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
328
+ cusparseCsr2cscEx2(p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p,
329
+ Ag->i, p->Agt->x, p->Agt->p, p->Agt->i, SCS_CUDA_FLOAT,
330
+ CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
331
+ SCS_CSR2CSC_ALG, p->buffer);
332
+
333
+ cusparseCreateCsr(&p->Agt->descr, p->Agt->n, p->Agt->m, p->Agt->nnz,
334
+ p->Agt->p, p->Agt->i, p->Agt->x, SCS_CUSPARSE_INDEX,
335
+ SCS_CUSPARSE_INDEX, CUSPARSE_INDEX_BASE_ZERO,
336
+ SCS_CUDA_FLOAT);
266
337
  #endif
267
338
 
268
339
  err = cudaGetLastError();
269
340
  if (err != cudaSuccess) {
270
- printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
341
+ printf("%s:%d:%s\nERROR_CUDA (*): %s\n", __FILE__, __LINE__, __func__,
271
342
  cudaGetErrorString(err));
272
343
  SCS(free_lin_sys_work)(p);
273
344
  return SCS_NULL;
@@ -275,138 +346,173 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
275
346
  return p;
276
347
  }
277
348
 
278
- static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
279
- scs_float *z, scs_float *r, scs_int n) {
280
- cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
281
- CUBLAS(tbmv)
282
- (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
283
- 0, M, 1, z, 1);
284
- }
285
-
286
- /* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
287
- static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
288
- ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
349
+ /* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
350
+ * b */
351
+ /* on GPU */
352
+ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
289
353
  scs_int max_its, scs_float tol) {
290
- scs_int i, n = A->n;
291
- scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
354
+ scs_int i, n = pr->n;
355
+ scs_float ztr, ztr_prev, alpha, ptGp, beta, neg_alpha;
292
356
  scs_float onef = 1.0, neg_onef = -1.0;
293
357
  scs_float *p = pr->p; /* cg direction */
294
358
  scs_float *Gp = pr->Gp; /* updated CG direction */
295
359
  scs_float *r = pr->r; /* cg residual */
296
360
  scs_float *z = pr->z; /* preconditioned */
297
- scs_float *M = pr->M; /* preconditioner */
298
361
  cublasHandle_t cublas_handle = pr->cublas_handle;
299
362
 
300
- if (s == SCS_NULL) {
363
+ if (!s) {
364
+ /* take s = 0 */
365
+ /* r = b */
301
366
  cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
367
+ /* b = 0 */
302
368
  cudaMemset(bg, 0, n * sizeof(scs_float));
303
369
  } else {
304
370
  /* p contains bg temporarily */
305
371
  cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
306
- /* bg contains s */
372
+ /* bg = s */
307
373
  cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
308
- mat_vec(A, stgs, pr, bg, r);
374
+ /* r = Mat * s */
375
+ mat_vec(pr, bg, r);
376
+ /* r = Mat * s - b */
309
377
  CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
378
+ /* r = b - Mat * s */
310
379
  CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
311
380
  }
312
381
 
313
- /* for some reason nrm2 is VERY slow */
314
- /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
315
- CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
316
- nrm_r = SQRTF(nrm_r);
317
382
  /* check to see if we need to run CG at all */
318
- if (nrm_r < MIN(tol, 1e-18)) {
383
+ if (cg_gpu_norm(cublas_handle, r, n) < tol) {
319
384
  return 0;
320
385
  }
321
386
 
322
- apply_pre_conditioner(cublas_handle, M, z, r, n);
323
- CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
324
- /* put z in p, replacing temp mem */
387
+ /* z = M r */
388
+ cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
389
+ scale_by_diag(cublas_handle, pr->M, z, n);
390
+ /* ztr = z'r */
391
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
392
+ /* p = z */
325
393
  cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
326
394
 
327
395
  for (i = 0; i < max_its; ++i) {
328
- mat_vec(A, stgs, pr, p, Gp);
329
-
330
- CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
331
-
332
- alpha = ipzr / p_gp;
396
+ /* Gp = Mat * p */
397
+ mat_vec(pr, p, Gp);
398
+ /* ptGp = p'Gp */
399
+ CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &ptGp);
400
+ /* alpha = z'r / p'G p */
401
+ alpha = ztr / ptGp;
333
402
  neg_alpha = -alpha;
334
-
403
+ /* b += alpha * p */
335
404
  CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
405
+ /* r -= alpha * G p */
336
406
  CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
337
407
 
338
- /* for some reason nrm2 is VERY slow */
339
- /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
340
- CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
341
- nrm_r = SQRTF(nrm_r);
342
- if (nrm_r < tol) {
343
- i++;
344
- break;
345
- }
346
- ipzr_old = ipzr;
347
- apply_pre_conditioner(cublas_handle, M, z, r, n);
348
- CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
408
+ #if VERBOSITY > 3
409
+ scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol,
410
+ cg_gpu_norm(cublas_handle, r, n), (long)i + 1);
411
+ #endif
349
412
 
350
- beta = ipzr / ipzr_old;
413
+ if (cg_gpu_norm(cublas_handle, r, n) < tol) {
414
+ return i + 1;
415
+ }
416
+ /* z = M r */
417
+ cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
418
+ scale_by_diag(cublas_handle, pr->M, z, n);
419
+ ztr_prev = ztr;
420
+ /* ztr = z'r */
421
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
422
+ beta = ztr / ztr_prev;
423
+ /* p = beta * p, where beta = ztr / ztr_prev */
351
424
  CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
425
+ /* p = z + beta * p */
352
426
  CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
353
427
  }
354
- #if EXTRA_VERBOSE > 0
355
- scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
356
- #endif
357
428
  return i;
358
429
  }
359
430
 
360
- scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
361
- ScsLinSysWork *p, scs_float *b, const scs_float *s,
362
- scs_int iter) {
363
- scs_int cg_its;
364
- SCS(timer) linsys_timer;
365
- scs_float *bg = p->bg;
431
+ /* solves Mx = b, for x but stores result in b */
432
+ /* s contains warm-start (if available) */
433
+ /*
434
+ * [x] = [ rho_x I + P    A'   ]^{-1} [rx]
435
+ * [y]   [     A         -R_y  ]      [ry]
436
+ *
437
+ * R_y = diag(rho_y_vec)
438
+ *
439
+ * becomes:
440
+ *
441
+ * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
442
+ * y = R_y^{-1} (Ax - ry)
443
+ *
444
+ */
445
+ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
446
+ scs_float tol) {
447
+ scs_int cg_its, max_iters;
366
448
  scs_float neg_onef = -1.0;
449
+
450
+ /* these are on GPU */
451
+ scs_float *bg = p->bg;
452
+ scs_float *tmp_m = p->tmp_m;
367
453
  ScsGpuMatrix *Ag = p->Ag;
368
- scs_float cg_tol =
369
- SCS(norm)(b, Ag->n) *
370
- (iter < 0 ? CG_BEST_TOL
371
- : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
372
- SCS(tic)(&linsys_timer);
373
- /* all on GPU */
374
- cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
375
-
376
- cusparseDnVecSetValues(p->dn_vec_m, (void *) &(bg[Ag->n]));
377
- cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
378
- SCS(_accum_by_atrans_gpu)(
379
- Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
380
- &p->buffer_size, &p->buffer
381
- );
382
-
383
- /* solves (I+A'A)x = b, s warm start, solution stored in b */
384
- cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
454
+ ScsGpuMatrix *Pg = p->Pg;
455
+
456
+ if (CG_NORM(b, p->n + p->m) <= 1e-12) {
457
+ memset(b, 0, (p->n + p->m) * sizeof(scs_float));
458
+ return 0;
459
+ }
460
+
461
+ if (tol <= 0.) {
462
+ scs_printf("Warning: tol = %4f <= 0, likely compiled without setting "
463
+ "INDIRECT flag.\n",
464
+ tol);
465
+ }
466
+
467
+ /* bg = b = [rx; ry] */
468
+ cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float),
469
+ cudaMemcpyHostToDevice);
470
+ /* tmp = ry */
471
+ cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
472
+ cudaMemcpyDeviceToDevice);
473
+ /* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
474
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
475
+
476
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
477
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
478
+ /* bg[:n] = rx + A' R ry */
479
+ SCS(accum_by_atrans_gpu)
480
+ (Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
481
+ &p->buffer);
482
+
483
+ /* set max_iters to 10 * n (though in theory n is enough for any tol) */
484
+ max_iters = 10 * Ag->n;
485
+
486
+ /* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
487
+ * in bg */
488
+ cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
489
+
490
+ /* bg[n:] = -ry */
385
491
  CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
492
+ cusparseDnVecSetValues(p->dn_vec_m, (void *)&(bg[Ag->n])); /* -ry */
493
+ cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* x */
386
494
 
387
- cusparseDnVecSetValues(p->dn_vec_m, (void *) &(bg[Ag->n]));
388
- cusparseDnVecSetValues(p->dn_vec_n, (void *) bg);
495
+ /* b[n:] = Ax - ry */
389
496
  #if GPU_TRANSPOSE_MAT > 0
390
- SCS(_accum_by_atrans_gpu)(
391
- p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
392
- &p->buffer_size, &p->buffer
393
- );
497
+ SCS(accum_by_atrans_gpu)
498
+ (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
499
+ &p->buffer);
394
500
  #else
395
- SCS(_accum_by_a_gpu)(
396
- Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
397
- &p->buffer_size, &p->buffer
398
- );
501
+ SCS(accum_by_a_gpu)
502
+ (Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
503
+ &p->buffer);
399
504
  #endif
400
505
 
401
- cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
402
-
403
- if (iter >= 0) {
404
- p->tot_cg_its += cg_its;
405
- }
506
+ /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
507
+ scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
406
508
 
407
- p->total_solve_time += SCS(tocq)(&linsys_timer);
408
- #if EXTRAVERBOSE > 0
409
- scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
509
+ /* copy bg = [x; y] back to b */
510
+ cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
511
+ cudaMemcpyDeviceToHost);
512
+ p->tot_cg_its += cg_its;
513
+ #if VERBOSITY > 1
514
+ scs_printf("tol %.3e\n", tol);
515
+ scs_printf("cg_its %i\n", (int)cg_its);
410
516
  #endif
411
517
  return 0;
412
518
  }