scs 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +8 -8
  4. data/lib/scs/ffi.rb +1 -7
  5. data/lib/scs/version.rb +1 -1
  6. data/vendor/scs/CITATION.cff +1 -1
  7. data/vendor/scs/CMakeLists.txt +55 -7
  8. data/vendor/scs/Makefile +9 -9
  9. data/vendor/scs/README.md +2 -1
  10. data/vendor/scs/include/aa.h +1 -1
  11. data/vendor/scs/include/cones.h +14 -11
  12. data/vendor/scs/include/glbopts.h +26 -64
  13. data/vendor/scs/include/linalg.h +2 -1
  14. data/vendor/scs/include/linsys.h +13 -13
  15. data/vendor/scs/include/normalize.h +6 -5
  16. data/vendor/scs/include/scs.h +43 -87
  17. data/vendor/scs/include/scs_types.h +34 -0
  18. data/vendor/scs/include/scs_work.h +83 -0
  19. data/vendor/scs/linsys/cpu/direct/private.c +86 -73
  20. data/vendor/scs/linsys/cpu/direct/private.h +2 -2
  21. data/vendor/scs/linsys/cpu/indirect/private.c +42 -33
  22. data/vendor/scs/linsys/cpu/indirect/private.h +1 -2
  23. data/vendor/scs/linsys/csparse.c +3 -3
  24. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +6 -6
  25. data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +6 -1
  26. data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
  27. data/vendor/scs/linsys/gpu/gpu.h +8 -11
  28. data/vendor/scs/linsys/gpu/indirect/private.c +72 -49
  29. data/vendor/scs/linsys/gpu/indirect/private.h +14 -13
  30. data/vendor/scs/linsys/scs_matrix.c +26 -46
  31. data/vendor/scs/linsys/scs_matrix.h +4 -4
  32. data/vendor/scs/scs.mk +1 -1
  33. data/vendor/scs/src/aa.c +13 -4
  34. data/vendor/scs/src/cones.c +143 -92
  35. data/vendor/scs/src/linalg.c +25 -0
  36. data/vendor/scs/src/normalize.c +26 -26
  37. data/vendor/scs/src/rw.c +48 -12
  38. data/vendor/scs/src/scs.c +104 -110
  39. data/vendor/scs/src/scs_version.c +8 -6
  40. data/vendor/scs/src/util.c +1 -1
  41. data/vendor/scs/test/minunit.h +6 -1
  42. data/vendor/scs/test/problem_utils.h +28 -35
  43. data/vendor/scs/test/problems/degenerate.h +1 -1
  44. data/vendor/scs/test/problems/hs21_tiny_qp.h +1 -1
  45. data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +1 -1
  46. data/vendor/scs/test/problems/infeasible_tiny_qp.h +1 -1
  47. data/vendor/scs/test/problems/qafiro_tiny_qp.h +3 -3
  48. data/vendor/scs/test/problems/random_prob.h +1 -1
  49. data/vendor/scs/test/problems/rob_gauss_cov_est.h +1 -1
  50. data/vendor/scs/test/problems/small_lp.h +3 -1
  51. data/vendor/scs/test/problems/small_qp.h +352 -0
  52. data/vendor/scs/test/problems/{test_fails.h → test_validation.h} +3 -3
  53. data/vendor/scs/test/problems/unbounded_tiny_qp.h +1 -1
  54. data/vendor/scs/test/random_socp_prob.c +1 -1
  55. data/vendor/scs/test/run_from_file.c +1 -1
  56. data/vendor/scs/test/run_tests.c +23 -14
  57. metadata +8 -5
@@ -17,7 +17,8 @@ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
17
17
  }
18
18
  */
19
19
 
20
- /* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
20
+ /* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
21
+ /* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
21
22
  static void set_preconditioner(ScsLinSysWork *p) {
22
23
  scs_int i, k;
23
24
  scs_float *M = p->M;
@@ -28,22 +29,26 @@ static void set_preconditioner(ScsLinSysWork *p) {
28
29
  scs_printf("getting pre-conditioner\n");
29
30
  #endif
30
31
 
32
+ /* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
31
33
  for (i = 0; i < A->n; ++i) { /* cols */
32
- M[i] = p->rho_x;
33
- /* diag(A' R_y^{-1} A) */
34
+ /* M_ii = (R_x)_i */
35
+ M[i] = p->diag_r[i];
36
+ /* M_ii += a_i' (R_y)^-1 a_i */
34
37
  for (k = A->p[i]; k < A->p[i + 1]; ++k) {
35
38
  /* A->i[k] is row of entry k with value A->x[k] */
36
- M[i] += A->x[k] * A->x[k] / p->rho_y_vec[A->i[k]];
39
+ M[i] += A->x[k] * A->x[k] / p->diag_r[A->n + A->i[k]];
37
40
  }
38
41
  if (P) {
39
42
  for (k = P->p[i]; k < P->p[i + 1]; k++) {
40
43
  /* diagonal element only */
41
44
  if (P->i[k] == i) { /* row == col */
45
+ /* M_ii += P_ii */
42
46
  M[i] += P->x[k];
43
47
  break;
44
48
  }
45
49
  }
46
50
  }
51
+ /* finally invert for pre-conditioner */
47
52
  M[i] = 1. / M[i];
48
53
  }
49
54
  #if VERBOSITY > 0
@@ -111,10 +116,18 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
111
116
  }
112
117
 
113
118
  /* vec -> R_y^{-1} vec */
114
- static void scale_by_diag_r(scs_float *vec, ScsLinSysWork *p) {
119
+ static void scale_by_r_y_inv(scs_float *vec, ScsLinSysWork *p) {
115
120
  scs_int i;
116
121
  for (i = 0; i < p->m; ++i) {
117
- vec[i] /= p->rho_y_vec[i];
122
+ vec[i] /= p->diag_r[p->n + i];
123
+ }
124
+ }
125
+
126
+ /* y += R_x * x */
127
+ static void accum_by_r_x(scs_float *y, const scs_float *x, ScsLinSysWork *p) {
128
+ scs_int i;
129
+ for (i = 0; i < p->n; ++i) {
130
+ y[i] += p->diag_r[i] * x[i];
118
131
  }
119
132
  }
120
133
 
@@ -123,7 +136,7 @@ static void accum_by_a(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
123
136
  SCS(accum_by_atrans)(p->At, x, y);
124
137
  }
125
138
 
126
- /* y = (rho_x * I + P + A' R_y^{-1} A) x */
139
+ /* y = (R_x + P + A' R_y^{-1} A) x */
127
140
  static void mat_vec(const ScsMatrix *A, const ScsMatrix *P, ScsLinSysWork *p,
128
141
  const scs_float *x, scs_float *y) {
129
142
  scs_float *z = p->tmp;
@@ -133,10 +146,10 @@ static void mat_vec(const ScsMatrix *A, const ScsMatrix *P, ScsLinSysWork *p,
133
146
  SCS(accum_by_p)(P, x, y); /* y = Px */
134
147
  }
135
148
  accum_by_a(p, x, z); /* z = Ax */
136
- scale_by_diag_r(z, p); /* z = R_y^{-1} A x */
149
+ scale_by_r_y_inv(z, p); /* z = R_y^{-1} A x */
137
150
  SCS(accum_by_atrans)(A, z, y); /* y += A'z, y = Px + A' R_y^{-1} Ax */
138
- /* y = rho_x * x + Px + A' R_y^{-1} A x */
139
- SCS(add_scaled_array)(y, x, A->n, p->rho_x);
151
+ /* y = R_x * x + Px + A' R_y^{-1} A * x */
152
+ accum_by_r_x(y, x, p);
140
153
  }
141
154
 
142
155
  static void apply_pre_conditioner(scs_float *z, scs_float *r, scs_int n,
@@ -149,36 +162,35 @@ static void apply_pre_conditioner(scs_float *z, scs_float *r, scs_int n,
149
162
  }
150
163
 
151
164
  /* diag_r is read on the fly; only the preconditioner M must be rebuilt */
152
- void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
153
- p->rho_y_vec = rho_y_vec; /* this isn't needed but do it to be safe */
165
+ void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
166
+ p->diag_r = diag_r; /* this isn't needed but do it to be safe */
154
167
  set_preconditioner(p);
155
168
  }
156
169
 
157
170
  ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
158
- scs_float *rho_y_vec, scs_float rho_x) {
171
+ const scs_float *diag_r) {
159
172
  ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
160
173
  p->A = A;
161
174
  p->P = P;
162
175
  p->m = A->m;
163
176
  p->n = A->n;
164
- p->rho_x = rho_x;
165
177
 
166
- p->p = (scs_float *)scs_malloc((A->n) * sizeof(scs_float));
167
- p->r = (scs_float *)scs_malloc((A->n) * sizeof(scs_float));
168
- p->Gp = (scs_float *)scs_malloc((A->n) * sizeof(scs_float));
169
- p->tmp = (scs_float *)scs_malloc((A->m) * sizeof(scs_float));
178
+ p->p = (scs_float *)scs_calloc((A->n), sizeof(scs_float));
179
+ p->r = (scs_float *)scs_calloc((A->n), sizeof(scs_float));
180
+ p->Gp = (scs_float *)scs_calloc((A->n), sizeof(scs_float));
181
+ p->tmp = (scs_float *)scs_calloc((A->m), sizeof(scs_float));
170
182
 
171
183
  /* memory for A transpose */
172
- p->At = (ScsMatrix *)scs_malloc(sizeof(ScsMatrix));
184
+ p->At = (ScsMatrix *)scs_calloc(1, sizeof(ScsMatrix));
173
185
  p->At->m = A->n;
174
186
  p->At->n = A->m;
175
- p->At->i = (scs_int *)scs_malloc((A->p[A->n]) * sizeof(scs_int));
176
- p->At->p = (scs_int *)scs_malloc((A->m + 1) * sizeof(scs_int));
177
- p->At->x = (scs_float *)scs_malloc((A->p[A->n]) * sizeof(scs_float));
187
+ p->At->i = (scs_int *)scs_calloc((A->p[A->n]), sizeof(scs_int));
188
+ p->At->p = (scs_int *)scs_calloc((A->m + 1), sizeof(scs_int));
189
+ p->At->x = (scs_float *)scs_calloc((A->p[A->n]), sizeof(scs_float));
178
190
  transpose(A, p);
179
191
 
180
192
  /* preconditioner memory */
181
- p->rho_y_vec = rho_y_vec;
193
+ p->diag_r = diag_r;
182
194
  p->z = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
183
195
  p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
184
196
  set_preconditioner(p);
@@ -192,8 +204,7 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
192
204
  return p;
193
205
  }
194
206
 
195
- /* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
196
- * b */
207
+ /* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution in b */
197
208
  static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *b,
198
209
  scs_int max_its, scs_float tol) {
199
210
  scs_int i, n = pr->n;
@@ -268,14 +279,12 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *b,
268
279
  /* solves Mx = b, for x but stores result in b */
269
280
  /* s contains warm-start (if available) */
270
281
  /*
271
- * [x] = [rho_x I + P A' ]^{-1} [rx]
272
- * [y] [ A -R_y ] [ry]
273
- *
274
- * R_y = diag(rho_y_vec)
282
+ * [x] = [R_x + P A' ]^{-1} [rx]
283
+ * [y] [ A -R_y ] [ry]
275
284
  *
276
285
  * becomes:
277
286
  *
278
- * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
287
+ * x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
279
288
  * y = R_y^{-1} (Ax - ry)
280
289
  *
281
290
  */
@@ -299,12 +308,12 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
299
308
  /* tmp = ry */
300
309
  memcpy(p->tmp, &(b[p->n]), p->m * sizeof(scs_float));
301
310
  /* tmp = R_y^{-1} * ry */
302
- scale_by_diag_r(p->tmp, p);
311
+ scale_by_r_y_inv(p->tmp, p);
303
312
  /* b[:n] = rx + A' R_y^{-1} ry */
304
313
  SCS(accum_by_atrans)(p->A, p->tmp, b);
305
314
  /* set max_iters to 10 * n (though in theory n is enough for any tol) */
306
315
  max_iters = 10 * p->n;
307
- /* solves (rho_x I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
316
+ /* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
308
317
  * b */
309
318
  cg_its = pcg(p, s, b, max_iters, tol); /* b[:n] = x */
310
319
 
@@ -313,7 +322,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
313
322
  /* b[n:] = Ax - ry */
314
323
  accum_by_a(p, b, &(b[p->n]));
315
324
  /* b[n:] = R_y^{-1} (Ax - ry) = y */
316
- scale_by_diag_r(&(b[p->n]), p);
325
+ scale_by_r_y_inv(&(b[p->n]), p);
317
326
  p->tot_cg_its += cg_its;
318
327
  #if VERBOSITY > 1
319
328
  scs_printf("tol %.3e\n", tol);
@@ -25,8 +25,7 @@ struct SCS_LIN_SYS_WORK {
25
25
  scs_float *M;
26
26
  /* reporting */
27
27
  scs_int tot_cg_its;
28
- scs_float *rho_y_vec;
29
- scs_float rho_x;
28
+ const scs_float *diag_r;
30
29
  };
31
30
 
32
31
  #ifdef __cplusplus
@@ -12,9 +12,9 @@ csc *SCS(cs_spalloc)(scs_int m, scs_int n, scs_int nzmax, scs_int values,
12
12
  A->n = n;
13
13
  A->nzmax = nzmax = MAX(nzmax, 1);
14
14
  A->nz = triplet ? 0 : -1; /* allocate triplet or comp.col */
15
- A->p = (scs_int *)scs_malloc((triplet ? nzmax : n + 1) * sizeof(scs_int));
16
- A->i = (scs_int *)scs_malloc(nzmax * sizeof(scs_int));
17
- A->x = values ? (scs_float *)scs_malloc(nzmax * sizeof(scs_float)) : SCS_NULL;
15
+ A->p = (scs_int *)scs_calloc((triplet ? nzmax : n + 1), sizeof(scs_int));
16
+ A->i = (scs_int *)scs_calloc(nzmax, sizeof(scs_int));
17
+ A->x = values ? (scs_float *)scs_calloc(nzmax, sizeof(scs_float)) : SCS_NULL;
18
18
  return (!A->p || !A->i || (values && !A->x)) ? SCS(cs_spfree)(A) : A;
19
19
  }
20
20
 
@@ -18,7 +18,6 @@
18
18
 
19
19
  #ifdef MATLAB_MEX_FILE
20
20
  #include "mex.h"
21
- #include "scs_matrix.h"
22
21
  #endif
23
22
 
24
23
  #ifndef NULL
@@ -51,7 +50,7 @@
51
50
 
52
51
  struct SuiteSparse_config_struct SuiteSparse_config =
53
52
  {
54
- _scs_malloc, _scs_calloc, _scs_realloc, _scs_free, _scs_printf,
53
+ scs_malloc, scs_calloc, scs_realloc, scs_free, _scs_printf,
55
54
  SuiteSparse_hypot,
56
55
  SuiteSparse_divcomplex
57
56
 
@@ -73,12 +72,13 @@ struct SuiteSparse_config_struct SuiteSparse_config =
73
72
  SuiteSparse_start be called prior to calling any SuiteSparse function.
74
73
  */
75
74
 
75
+
76
76
  void SuiteSparse_start ( void )
77
77
  {
78
- SuiteSparse_config.malloc_func = _scs_malloc ;
79
- SuiteSparse_config.calloc_func = _scs_calloc ;
80
- SuiteSparse_config.realloc_func = _scs_realloc ;
81
- SuiteSparse_config.free_func = _scs_free ;
78
+ SuiteSparse_config.malloc_func = scs_malloc ;
79
+ SuiteSparse_config.calloc_func = scs_calloc ;
80
+ SuiteSparse_config.realloc_func = scs_realloc ;
81
+ SuiteSparse_config.free_func = scs_free ;
82
82
  SuiteSparse_config.printf_func = _scs_printf ;
83
83
  /* math functions */
84
84
  SuiteSparse_config.hypot_func = SuiteSparse_hypot ;
@@ -44,7 +44,7 @@ extern "C" {
44
44
 
45
45
  #include <limits.h>
46
46
  #include <stdlib.h>
47
- #include "scs.h"
47
+ #include "glbopts.h"
48
48
  #include "ctrlc.h"
49
49
 
50
50
  /* ========================================================================== */
@@ -71,6 +71,11 @@ extern "C" {
71
71
  #define SuiteSparse_long_id "%" SuiteSparse_long_idd
72
72
  #endif
73
73
 
74
+ #ifndef _scs_printf
75
+ #define _scs_printf scs_printf
76
+ #endif
77
+
78
+
74
79
  /* ========================================================================== */
75
80
  /* === SuiteSparse_config parameters and functions ========================== */
76
81
  /* ========================================================================== */
@@ -89,8 +89,8 @@ GLOBAL Int AMD_order
89
89
  }
90
90
 
91
91
  /* allocate two size-n integer workspaces */
92
- Len = SuiteSparse_malloc (n, sizeof (Int)) ;
93
- Pinv = SuiteSparse_malloc (n, sizeof (Int)) ;
92
+ Len = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
93
+ Pinv = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
94
94
  mem += n ;
95
95
  mem += n ;
96
96
  if (!Len || !Pinv)
@@ -106,8 +106,8 @@ GLOBAL Int AMD_order
106
106
  {
107
107
  /* sort the input matrix and remove duplicate entries */
108
108
  AMD_DEBUG1 (("Matrix is jumbled\n")) ;
109
- Rp = SuiteSparse_malloc (n+1, sizeof (Int)) ;
110
- Ri = SuiteSparse_malloc (nz, sizeof (Int)) ;
109
+ Rp = (Int *)SuiteSparse_malloc (n+1, sizeof (Int)) ;
110
+ Ri = (Int *)SuiteSparse_malloc (nz, sizeof (Int)) ;
111
111
  mem += (n+1) ;
112
112
  mem += MAX (nz,1) ;
113
113
  if (!Rp || !Ri)
@@ -160,7 +160,7 @@ GLOBAL Int AMD_order
160
160
  ok = ok && (slen < Int_MAX) ; /* S[i] for Int i must be OK */
161
161
  if (ok)
162
162
  {
163
- S = SuiteSparse_malloc (slen, sizeof (Int)) ;
163
+ S = (Int *)SuiteSparse_malloc (slen, sizeof (Int)) ;
164
164
  }
165
165
  AMD_DEBUG1 (("slen %g\n", (scs_float) slen)) ;
166
166
  if (!S)
@@ -1,12 +1,17 @@
1
- #ifndef SCSGPU_H_GUARD
2
- #define SCSGPU_H_GUARD
1
+ #ifndef SCS_GPU_H_GUARD
2
+ #define SCS_GPU_H_GUARD
3
3
 
4
4
  #ifdef __cplusplus
5
5
  extern "C" {
6
6
  #endif
7
7
 
8
- #include <cublas_v2.h>
8
+ /* TODO: Do we need this?
9
+
9
10
  #include <cuda.h>
11
+
12
+ */
13
+
14
+ #include <cublas_v2.h>
10
15
  #include <cuda_runtime_api.h>
11
16
  #include <cusparse.h>
12
17
 
@@ -31,11 +36,9 @@ extern "C" {
31
36
  #ifndef SFLOAT
32
37
  #define CUBLAS(x) cublasD##x
33
38
  #define CUBLASI(x) cublasId##x
34
- #define CUSPARSE(x) cusparseD##x
35
39
  #else
36
40
  #define CUBLAS(x) cublasS##x
37
41
  #define CUBLASI(x) cublasIs##x
38
- #define CUSPARSE(x) cusparseS##x
39
42
  #endif
40
43
  #define CUSPARSE_GEN(x) cusparse##x
41
44
  #else
@@ -46,9 +49,6 @@ extern "C" {
46
49
  #define CUBLASI(x) \
47
50
  CUDA_CHECK_ERR; \
48
51
  cublasId##x
49
- #define CUSPARSE(x) \
50
- CUDA_CHECK_ERR; \
51
- cusparseD##x
52
52
  #else
53
53
  #define CUBLAS(x) \
54
54
  CUDA_CHECK_ERR; \
@@ -56,9 +56,6 @@ extern "C" {
56
56
  #define CUBLASI(x) \
57
57
  CUDA_CHECK_ERR; \
58
58
  cublasIs##x
59
- #define CUSPARSE(x) \
60
- CUDA_CHECK_ERR; \
61
- cusparseS##x
62
59
  #endif
63
60
  #define CUSPARSE_GEN(x) \
64
61
  CUDA_CHECK_ERR; \
@@ -35,63 +35,77 @@ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
35
35
  }
36
36
  */
37
37
 
38
- /* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
39
- static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
38
+ /* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
39
+ /* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
40
+ static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
40
41
  scs_int i, k;
41
42
  const ScsMatrix *A = p->A;
42
43
  const ScsMatrix *P = p->P;
43
- scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
44
+ scs_float *M = p->M;
44
45
 
45
46
  #if VERBOSITY > 0
46
47
  scs_printf("getting pre-conditioner\n");
47
48
  #endif
48
49
 
50
+ /* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
49
51
  for (i = 0; i < A->n; ++i) { /* cols */
50
- M[i] = p->rho_x;
51
- /* diag(A' R_y^{-1} A) */
52
+ /* M_ii = (R_x)_i */
53
+ M[i] = diag_r[i];
54
+ /* M_ii += a_i' (R_y)^-1 a_i */
52
55
  for (k = A->p[i]; k < A->p[i + 1]; ++k) {
53
56
  /* A->i[k] is row of entry k with value A->x[k] */
54
- M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
57
+ M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
55
58
  }
56
59
  if (P) {
57
60
  for (k = P->p[i]; k < P->p[i + 1]; k++) {
58
61
  /* diagonal element only */
59
62
  if (P->i[k] == i) { /* row == col */
63
+ /* M_ii += P_ii */
60
64
  M[i] += P->x[k];
61
65
  break;
62
66
  }
63
67
  }
64
68
  }
69
+ /* finally invert for pre-conditioner */
65
70
  M[i] = 1. / M[i];
66
71
  }
67
- cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
68
- scs_free(M);
72
+ cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
69
73
  #if VERBOSITY > 0
70
74
  scs_printf("finished getting pre-conditioner\n");
71
75
  #endif
72
76
  }
73
77
 
74
78
  /* refresh the GPU copies of R_x and 1/R_y, then rebuild the preconditioner */
75
- void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
79
+ void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
76
80
  scs_int i;
81
+
82
+ /* R_x to gpu */
83
+ cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
84
+ cudaMemcpyHostToDevice);
85
+
86
+ /* 1/R_y to gpu */
77
87
  for (i = 0; i < p->m; ++i)
78
- p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
79
- cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
88
+ p->inv_r_y[i] = 1. / diag_r[p->n + i];
89
+ cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
80
90
  cudaMemcpyHostToDevice);
81
- set_preconditioner(p, rho_y_vec);
91
+
92
+ /* set preconditioner M on gpu */
93
+ set_preconditioner(p, diag_r);
82
94
  }
83
95
 
84
96
  void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
85
97
  if (p) {
86
- scs_free(p->inv_rho_y_vec);
98
+ scs_free(p->M);
99
+ scs_free(p->inv_r_y);
87
100
  cudaFree(p->p);
88
101
  cudaFree(p->r);
89
102
  cudaFree(p->Gp);
90
103
  cudaFree(p->bg);
91
104
  cudaFree(p->tmp_m);
92
105
  cudaFree(p->z);
93
- cudaFree(p->M);
94
- cudaFree(p->inv_rho_y_vec_gpu);
106
+ cudaFree(p->M_gpu);
107
+ cudaFree(p->r_x_gpu);
108
+ cudaFree(p->inv_r_y_gpu);
95
109
  if (p->Pg) {
96
110
  SCS(free_gpu_matrix)(p->Pg);
97
111
  scs_free(p->Pg);
@@ -126,22 +140,23 @@ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
126
140
  0, M, 1, z, 1);
127
141
  }
128
142
 
129
- /* y = (rho_x * I + P + A' R_y^{-1} A) x */
143
+ /* y = (R_x + P + A' R_y^{-1} A) x */
130
144
  static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
131
145
  /* x and y MUST already be loaded to GPU */
132
146
  scs_float *z = p->tmp_m; /* temp memory */
133
- cudaMemset(y, 0, p->n * sizeof(scs_float));
134
147
  cudaMemset(z, 0, p->m * sizeof(scs_float));
135
148
 
136
149
  cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
137
150
  cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
138
151
  cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
139
152
 
140
- /* y = rho_x * x */
141
- CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
153
+ /* y = x */
154
+ cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyHostToDevice);
155
+ /* y = R_x * x */
156
+ scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
142
157
 
143
158
  if (p->Pg) {
144
- /* y = rho_x * x + Px */
159
+ /* y = R_x * x + P x */
145
160
  SCS(accum_by_p_gpu)
146
161
  (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
147
162
  &p->buffer);
@@ -158,9 +173,9 @@ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
158
173
  &p->buffer);
159
174
  #endif
160
175
  /* z = R_y^{-1} A x */
161
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
176
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);
162
177
 
163
- /* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
178
+ /* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
164
179
  SCS(accum_by_atrans_gpu)
165
180
  (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
166
181
  &p->buffer);
@@ -201,19 +216,35 @@ static csc *fill_p_matrix(const ScsMatrix *P) {
201
216
  }
202
217
 
203
218
  ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
204
- scs_float *rho_y_vec, scs_float rho_x) {
219
+ const scs_float *diag_r) {
205
220
  cudaError_t err;
206
- scs_int i;
207
221
  csc *P_full;
208
- ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
209
- ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
222
+ ScsLinSysWork *p = SCS_NULL;
223
+ ScsGpuMatrix *Ag = SCS_NULL;
210
224
  ScsGpuMatrix *Pg = SCS_NULL;
225
+ int device_count;
226
+
227
+ err = cudaGetDeviceCount(&device_count);
228
+ if (err > 0) {
229
+ scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
230
+ return SCS_NULL;
231
+ }
232
+
233
+ p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
234
+ Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
235
+
236
+ p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
237
+ p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
238
+
239
+ p->A = A;
240
+ p->P = P;
241
+ p->m = A->m;
242
+ p->n = A->n;
211
243
 
212
244
  #if GPU_TRANSPOSE_MAT > 0
213
245
  size_t new_buffer_size = 0;
214
246
  #endif
215
247
 
216
- p->rho_x = rho_x;
217
248
  p->cublas_handle = 0;
218
249
  p->cusparse_handle = 0;
219
250
 
@@ -242,8 +273,9 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
242
273
  cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
243
274
  cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
244
275
  cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
245
- cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
246
- cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
276
+ cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
277
+ cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
278
+ cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));
247
279
 
248
280
  cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
249
281
  cudaMemcpyHostToDevice);
@@ -251,12 +283,6 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
251
283
  cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
252
284
  cudaMemcpyHostToDevice);
253
285
 
254
- p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
255
- for (i = 0; i < A->m; ++i)
256
- p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
257
- cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
258
- cudaMemcpyHostToDevice);
259
-
260
286
  cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
261
287
  SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
262
288
  CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
@@ -297,7 +323,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
297
323
  cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
298
324
  cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
299
325
 
300
- set_preconditioner(p, rho_y_vec);
326
+ /* Form preconditioner and copy R_x, 1/R_y to gpu */
327
+ SCS(update_lin_sys_diag_r)(p, diag_r);
301
328
 
302
329
  #if GPU_TRANSPOSE_MAT > 0
303
330
  p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
@@ -346,9 +373,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
346
373
  return p;
347
374
  }
348
375
 
349
- /* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
350
- * b */
351
- /* on GPU */
376
+ /* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
377
+ * b, on GPU */
352
378
  static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
353
379
  scs_int max_its, scs_float tol) {
354
380
  scs_int i, n = pr->n;
@@ -386,7 +412,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
386
412
 
387
413
  /* z = M r */
388
414
  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
389
- scale_by_diag(cublas_handle, pr->M, z, n);
415
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
390
416
  /* ztr = z'r */
391
417
  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
392
418
  /* p = z */
@@ -415,7 +441,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
415
441
  }
416
442
  /* z = M r */
417
443
  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
418
- scale_by_diag(cublas_handle, pr->M, z, n);
444
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
419
445
  ztr_prev = ztr;
420
446
  /* ztr = z'r */
421
447
  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
@@ -431,14 +457,12 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
431
457
  /* solves Mx = b, for x but stores result in b */
432
458
  /* s contains warm-start (if available) */
433
459
  /*
434
- * [x] = [rho_x I + P A' ]^{-1} [rx]
460
+ * [x] = [R_x + P A' ]^{-1} [rx]
435
461
  * [y] [ A -R_y ] [ry]
436
462
  *
437
- * R_y = diag(rho_y_vec)
438
- *
439
463
  * becomes:
440
464
  *
441
- * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
465
+ * x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
442
466
  * y = R_y^{-1} (Ax - ry)
443
467
  *
444
468
  */
@@ -451,7 +475,6 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
451
475
  scs_float *bg = p->bg;
452
476
  scs_float *tmp_m = p->tmp_m;
453
477
  ScsGpuMatrix *Ag = p->Ag;
454
- ScsGpuMatrix *Pg = p->Pg;
455
478
 
456
479
  if (CG_NORM(b, p->n + p->m) <= 1e-12) {
457
480
  memset(b, 0, (p->n + p->m) * sizeof(scs_float));
@@ -471,7 +494,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
471
494
  cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
472
495
  cudaMemcpyDeviceToDevice);
473
496
  /* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
474
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
497
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
475
498
 
476
499
  cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
477
500
  cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
@@ -483,7 +506,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
483
506
  /* set max_iters to 10 * n (though in theory n is enough for any tol) */
484
507
  max_iters = 10 * Ag->n;
485
508
 
486
- /* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
509
+ /* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
487
510
  * in bg */
488
511
  cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
489
512
 
@@ -504,7 +527,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
504
527
  #endif
505
528
 
506
529
  /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
507
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
530
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
508
531
 
509
532
  /* copy bg = [x; y] back to b */
510
533
  cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),