scs 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +8 -8
  4. data/lib/scs/ffi.rb +1 -7
  5. data/lib/scs/version.rb +1 -1
  6. data/vendor/scs/CITATION.cff +1 -1
  7. data/vendor/scs/CMakeLists.txt +55 -7
  8. data/vendor/scs/Makefile +9 -9
  9. data/vendor/scs/README.md +2 -1
  10. data/vendor/scs/include/aa.h +1 -1
  11. data/vendor/scs/include/cones.h +14 -11
  12. data/vendor/scs/include/glbopts.h +26 -64
  13. data/vendor/scs/include/linalg.h +2 -1
  14. data/vendor/scs/include/linsys.h +13 -13
  15. data/vendor/scs/include/normalize.h +6 -5
  16. data/vendor/scs/include/scs.h +43 -87
  17. data/vendor/scs/include/scs_types.h +34 -0
  18. data/vendor/scs/include/scs_work.h +83 -0
  19. data/vendor/scs/linsys/cpu/direct/private.c +86 -73
  20. data/vendor/scs/linsys/cpu/direct/private.h +2 -2
  21. data/vendor/scs/linsys/cpu/indirect/private.c +42 -33
  22. data/vendor/scs/linsys/cpu/indirect/private.h +1 -2
  23. data/vendor/scs/linsys/csparse.c +3 -3
  24. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +6 -6
  25. data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +6 -1
  26. data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
  27. data/vendor/scs/linsys/gpu/gpu.h +8 -11
  28. data/vendor/scs/linsys/gpu/indirect/private.c +72 -49
  29. data/vendor/scs/linsys/gpu/indirect/private.h +14 -13
  30. data/vendor/scs/linsys/scs_matrix.c +26 -46
  31. data/vendor/scs/linsys/scs_matrix.h +4 -4
  32. data/vendor/scs/scs.mk +1 -1
  33. data/vendor/scs/src/aa.c +13 -4
  34. data/vendor/scs/src/cones.c +143 -92
  35. data/vendor/scs/src/linalg.c +25 -0
  36. data/vendor/scs/src/normalize.c +26 -26
  37. data/vendor/scs/src/rw.c +48 -12
  38. data/vendor/scs/src/scs.c +104 -110
  39. data/vendor/scs/src/scs_version.c +8 -6
  40. data/vendor/scs/src/util.c +1 -1
  41. data/vendor/scs/test/minunit.h +6 -1
  42. data/vendor/scs/test/problem_utils.h +28 -35
  43. data/vendor/scs/test/problems/degenerate.h +1 -1
  44. data/vendor/scs/test/problems/hs21_tiny_qp.h +1 -1
  45. data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +1 -1
  46. data/vendor/scs/test/problems/infeasible_tiny_qp.h +1 -1
  47. data/vendor/scs/test/problems/qafiro_tiny_qp.h +3 -3
  48. data/vendor/scs/test/problems/random_prob.h +1 -1
  49. data/vendor/scs/test/problems/rob_gauss_cov_est.h +1 -1
  50. data/vendor/scs/test/problems/small_lp.h +3 -1
  51. data/vendor/scs/test/problems/small_qp.h +352 -0
  52. data/vendor/scs/test/problems/{test_fails.h → test_validation.h} +3 -3
  53. data/vendor/scs/test/problems/unbounded_tiny_qp.h +1 -1
  54. data/vendor/scs/test/random_socp_prob.c +1 -1
  55. data/vendor/scs/test/run_from_file.c +1 -1
  56. data/vendor/scs/test/run_tests.c +23 -14
  57. metadata +8 -5
@@ -17,7 +17,8 @@ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
17
17
  }
18
18
  */
19
19
 
20
- /* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
20
+ /* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
21
+ /* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
21
22
  static void set_preconditioner(ScsLinSysWork *p) {
22
23
  scs_int i, k;
23
24
  scs_float *M = p->M;
@@ -28,22 +29,26 @@ static void set_preconditioner(ScsLinSysWork *p) {
28
29
  scs_printf("getting pre-conditioner\n");
29
30
  #endif
30
31
 
32
+ /* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
31
33
  for (i = 0; i < A->n; ++i) { /* cols */
32
- M[i] = p->rho_x;
33
- /* diag(A' R_y^{-1} A) */
34
+ /* M_ii = (R_x)_i */
35
+ M[i] = p->diag_r[i];
36
+ /* M_ii += a_i' (R_y)^-1 a_i */
34
37
  for (k = A->p[i]; k < A->p[i + 1]; ++k) {
35
38
  /* A->i[k] is row of entry k with value A->x[k] */
36
- M[i] += A->x[k] * A->x[k] / p->rho_y_vec[A->i[k]];
39
+ M[i] += A->x[k] * A->x[k] / p->diag_r[A->n + A->i[k]];
37
40
  }
38
41
  if (P) {
39
42
  for (k = P->p[i]; k < P->p[i + 1]; k++) {
40
43
  /* diagonal element only */
41
44
  if (P->i[k] == i) { /* row == col */
45
+ /* M_ii += P_ii */
42
46
  M[i] += P->x[k];
43
47
  break;
44
48
  }
45
49
  }
46
50
  }
51
+ /* finally invert for pre-conditioner */
47
52
  M[i] = 1. / M[i];
48
53
  }
49
54
  #if VERBOSITY > 0
@@ -111,10 +116,18 @@ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
111
116
  }
112
117
 
113
118
  /* vec -> R_y^{-1} vec */
114
- static void scale_by_diag_r(scs_float *vec, ScsLinSysWork *p) {
119
+ static void scale_by_r_y_inv(scs_float *vec, ScsLinSysWork *p) {
115
120
  scs_int i;
116
121
  for (i = 0; i < p->m; ++i) {
117
- vec[i] /= p->rho_y_vec[i];
122
+ vec[i] /= p->diag_r[p->n + i];
123
+ }
124
+ }
125
+
126
+ /* y += R_x * x */
127
+ static void accum_by_r_x(scs_float *y, const scs_float *x, ScsLinSysWork *p) {
128
+ scs_int i;
129
+ for (i = 0; i < p->n; ++i) {
130
+ y[i] += p->diag_r[i] * x[i];
118
131
  }
119
132
  }
120
133
 
@@ -123,7 +136,7 @@ static void accum_by_a(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
123
136
  SCS(accum_by_atrans)(p->At, x, y);
124
137
  }
125
138
 
126
- /* y = (rho_x * I + P + A' R_y^{-1} A) x */
139
+ /* y = (R_x + P + A' R_y^{-1} A) x */
127
140
  static void mat_vec(const ScsMatrix *A, const ScsMatrix *P, ScsLinSysWork *p,
128
141
  const scs_float *x, scs_float *y) {
129
142
  scs_float *z = p->tmp;
@@ -133,10 +146,10 @@ static void mat_vec(const ScsMatrix *A, const ScsMatrix *P, ScsLinSysWork *p,
133
146
  SCS(accum_by_p)(P, x, y); /* y = Px */
134
147
  }
135
148
  accum_by_a(p, x, z); /* z = Ax */
136
- scale_by_diag_r(z, p); /* z = R_y^{-1} A x */
149
+ scale_by_r_y_inv(z, p); /* z = R_y^{-1} A x */
137
150
  SCS(accum_by_atrans)(A, z, y); /* y += A'z, y = Px + A' R_y^{-1} Ax */
138
- /* y = rho_x * x + Px + A' R_y^{-1} A x */
139
- SCS(add_scaled_array)(y, x, A->n, p->rho_x);
151
+ /* y = R_x * x + Px + A' R_y^{-1} A * x */
152
+ accum_by_r_x(y, x, p);
140
153
  }
141
154
 
142
155
  static void apply_pre_conditioner(scs_float *z, scs_float *r, scs_int n,
@@ -149,36 +162,35 @@ static void apply_pre_conditioner(scs_float *z, scs_float *r, scs_int n,
149
162
  }
150
163
 
151
164
  /* diag_r changed: store the new pointer and rebuild the preconditioner M */
152
- void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
153
- p->rho_y_vec = rho_y_vec; /* this isn't needed but do it to be safe */
165
+ void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
166
+ p->diag_r = diag_r; /* this isn't needed but do it to be safe */
154
167
  set_preconditioner(p);
155
168
  }
156
169
 
157
170
  ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
158
- scs_float *rho_y_vec, scs_float rho_x) {
171
+ const scs_float *diag_r) {
159
172
  ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
160
173
  p->A = A;
161
174
  p->P = P;
162
175
  p->m = A->m;
163
176
  p->n = A->n;
164
- p->rho_x = rho_x;
165
177
 
166
- p->p = (scs_float *)scs_malloc((A->n) * sizeof(scs_float));
167
- p->r = (scs_float *)scs_malloc((A->n) * sizeof(scs_float));
168
- p->Gp = (scs_float *)scs_malloc((A->n) * sizeof(scs_float));
169
- p->tmp = (scs_float *)scs_malloc((A->m) * sizeof(scs_float));
178
+ p->p = (scs_float *)scs_calloc((A->n), sizeof(scs_float));
179
+ p->r = (scs_float *)scs_calloc((A->n), sizeof(scs_float));
180
+ p->Gp = (scs_float *)scs_calloc((A->n), sizeof(scs_float));
181
+ p->tmp = (scs_float *)scs_calloc((A->m), sizeof(scs_float));
170
182
 
171
183
  /* memory for A transpose */
172
- p->At = (ScsMatrix *)scs_malloc(sizeof(ScsMatrix));
184
+ p->At = (ScsMatrix *)scs_calloc(1, sizeof(ScsMatrix));
173
185
  p->At->m = A->n;
174
186
  p->At->n = A->m;
175
- p->At->i = (scs_int *)scs_malloc((A->p[A->n]) * sizeof(scs_int));
176
- p->At->p = (scs_int *)scs_malloc((A->m + 1) * sizeof(scs_int));
177
- p->At->x = (scs_float *)scs_malloc((A->p[A->n]) * sizeof(scs_float));
187
+ p->At->i = (scs_int *)scs_calloc((A->p[A->n]), sizeof(scs_int));
188
+ p->At->p = (scs_int *)scs_calloc((A->m + 1), sizeof(scs_int));
189
+ p->At->x = (scs_float *)scs_calloc((A->p[A->n]), sizeof(scs_float));
178
190
  transpose(A, p);
179
191
 
180
192
  /* preconditioner memory */
181
- p->rho_y_vec = rho_y_vec;
193
+ p->diag_r = diag_r;
182
194
  p->z = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
183
195
  p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
184
196
  set_preconditioner(p);
@@ -192,8 +204,7 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
192
204
  return p;
193
205
  }
194
206
 
195
- /* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
196
- * b */
207
+ /* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution in b */
197
208
  static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *b,
198
209
  scs_int max_its, scs_float tol) {
199
210
  scs_int i, n = pr->n;
@@ -268,14 +279,12 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *b,
268
279
  /* solves Mx = b, for x but stores result in b */
269
280
  /* s contains warm-start (if available) */
270
281
  /*
271
- * [x] = [rho_x I + P A' ]^{-1} [rx]
272
- * [y] [ A -R_y ] [ry]
273
- *
274
- * R_y = diag(rho_y_vec)
282
+ * [x] = [R_x + P A' ]^{-1} [rx]
283
+ * [y] [ A -R_y ] [ry]
275
284
  *
276
285
  * becomes:
277
286
  *
278
- * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
287
+ * x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
279
288
  * y = R_y^{-1} (Ax - ry)
280
289
  *
281
290
  */
@@ -299,12 +308,12 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
299
308
  /* tmp = ry */
300
309
  memcpy(p->tmp, &(b[p->n]), p->m * sizeof(scs_float));
301
310
  /* tmp = R_y^{-1} * ry */
302
- scale_by_diag_r(p->tmp, p);
311
+ scale_by_r_y_inv(p->tmp, p);
303
312
  /* b[:n] = rx + A' R_y^{-1} ry */
304
313
  SCS(accum_by_atrans)(p->A, p->tmp, b);
305
314
  /* set max_iters to 10 * n (though in theory n is enough for any tol) */
306
315
  max_iters = 10 * p->n;
307
- /* solves (rho_x I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
316
+ /* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
308
317
  * b */
309
318
  cg_its = pcg(p, s, b, max_iters, tol); /* b[:n] = x */
310
319
 
@@ -313,7 +322,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
313
322
  /* b[n:] = Ax - ry */
314
323
  accum_by_a(p, b, &(b[p->n]));
315
324
  /* b[n:] = R_y^{-1} (Ax - ry) = y */
316
- scale_by_diag_r(&(b[p->n]), p);
325
+ scale_by_r_y_inv(&(b[p->n]), p);
317
326
  p->tot_cg_its += cg_its;
318
327
  #if VERBOSITY > 1
319
328
  scs_printf("tol %.3e\n", tol);
@@ -25,8 +25,7 @@ struct SCS_LIN_SYS_WORK {
25
25
  scs_float *M;
26
26
  /* reporting */
27
27
  scs_int tot_cg_its;
28
- scs_float *rho_y_vec;
29
- scs_float rho_x;
28
+ const scs_float *diag_r;
30
29
  };
31
30
 
32
31
  #ifdef __cplusplus
@@ -12,9 +12,9 @@ csc *SCS(cs_spalloc)(scs_int m, scs_int n, scs_int nzmax, scs_int values,
12
12
  A->n = n;
13
13
  A->nzmax = nzmax = MAX(nzmax, 1);
14
14
  A->nz = triplet ? 0 : -1; /* allocate triplet or comp.col */
15
- A->p = (scs_int *)scs_malloc((triplet ? nzmax : n + 1) * sizeof(scs_int));
16
- A->i = (scs_int *)scs_malloc(nzmax * sizeof(scs_int));
17
- A->x = values ? (scs_float *)scs_malloc(nzmax * sizeof(scs_float)) : SCS_NULL;
15
+ A->p = (scs_int *)scs_calloc((triplet ? nzmax : n + 1), sizeof(scs_int));
16
+ A->i = (scs_int *)scs_calloc(nzmax, sizeof(scs_int));
17
+ A->x = values ? (scs_float *)scs_calloc(nzmax, sizeof(scs_float)) : SCS_NULL;
18
18
  return (!A->p || !A->i || (values && !A->x)) ? SCS(cs_spfree)(A) : A;
19
19
  }
20
20
 
@@ -18,7 +18,6 @@
18
18
 
19
19
  #ifdef MATLAB_MEX_FILE
20
20
  #include "mex.h"
21
- #include "scs_matrix.h"
22
21
  #endif
23
22
 
24
23
  #ifndef NULL
@@ -51,7 +50,7 @@
51
50
 
52
51
  struct SuiteSparse_config_struct SuiteSparse_config =
53
52
  {
54
- _scs_malloc, _scs_calloc, _scs_realloc, _scs_free, _scs_printf,
53
+ scs_malloc, scs_calloc, scs_realloc, scs_free, _scs_printf,
55
54
  SuiteSparse_hypot,
56
55
  SuiteSparse_divcomplex
57
56
 
@@ -73,12 +72,13 @@ struct SuiteSparse_config_struct SuiteSparse_config =
73
72
  SuiteSparse_start be called prior to calling any SuiteSparse function.
74
73
  */
75
74
 
75
+
76
76
  void SuiteSparse_start ( void )
77
77
  {
78
- SuiteSparse_config.malloc_func = _scs_malloc ;
79
- SuiteSparse_config.calloc_func = _scs_calloc ;
80
- SuiteSparse_config.realloc_func = _scs_realloc ;
81
- SuiteSparse_config.free_func = _scs_free ;
78
+ SuiteSparse_config.malloc_func = scs_malloc ;
79
+ SuiteSparse_config.calloc_func = scs_calloc ;
80
+ SuiteSparse_config.realloc_func = scs_realloc ;
81
+ SuiteSparse_config.free_func = scs_free ;
82
82
  SuiteSparse_config.printf_func = _scs_printf ;
83
83
  /* math functions */
84
84
  SuiteSparse_config.hypot_func = SuiteSparse_hypot ;
@@ -44,7 +44,7 @@ extern "C" {
44
44
 
45
45
  #include <limits.h>
46
46
  #include <stdlib.h>
47
- #include "scs.h"
47
+ #include "glbopts.h"
48
48
  #include "ctrlc.h"
49
49
 
50
50
  /* ========================================================================== */
@@ -71,6 +71,11 @@ extern "C" {
71
71
  #define SuiteSparse_long_id "%" SuiteSparse_long_idd
72
72
  #endif
73
73
 
74
+ #ifndef _scs_printf
75
+ #define _scs_printf scs_printf
76
+ #endif
77
+
78
+
74
79
  /* ========================================================================== */
75
80
  /* === SuiteSparse_config parameters and functions ========================== */
76
81
  /* ========================================================================== */
@@ -89,8 +89,8 @@ GLOBAL Int AMD_order
89
89
  }
90
90
 
91
91
  /* allocate two size-n integer workspaces */
92
- Len = SuiteSparse_malloc (n, sizeof (Int)) ;
93
- Pinv = SuiteSparse_malloc (n, sizeof (Int)) ;
92
+ Len = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
93
+ Pinv = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
94
94
  mem += n ;
95
95
  mem += n ;
96
96
  if (!Len || !Pinv)
@@ -106,8 +106,8 @@ GLOBAL Int AMD_order
106
106
  {
107
107
  /* sort the input matrix and remove duplicate entries */
108
108
  AMD_DEBUG1 (("Matrix is jumbled\n")) ;
109
- Rp = SuiteSparse_malloc (n+1, sizeof (Int)) ;
110
- Ri = SuiteSparse_malloc (nz, sizeof (Int)) ;
109
+ Rp = (Int *)SuiteSparse_malloc (n+1, sizeof (Int)) ;
110
+ Ri = (Int *)SuiteSparse_malloc (nz, sizeof (Int)) ;
111
111
  mem += (n+1) ;
112
112
  mem += MAX (nz,1) ;
113
113
  if (!Rp || !Ri)
@@ -160,7 +160,7 @@ GLOBAL Int AMD_order
160
160
  ok = ok && (slen < Int_MAX) ; /* S[i] for Int i must be OK */
161
161
  if (ok)
162
162
  {
163
- S = SuiteSparse_malloc (slen, sizeof (Int)) ;
163
+ S = (Int *)SuiteSparse_malloc (slen, sizeof (Int)) ;
164
164
  }
165
165
  AMD_DEBUG1 (("slen %g\n", (scs_float) slen)) ;
166
166
  if (!S)
@@ -1,12 +1,17 @@
1
- #ifndef SCSGPU_H_GUARD
2
- #define SCSGPU_H_GUARD
1
+ #ifndef SCS_GPU_H_GUARD
2
+ #define SCS_GPU_H_GUARD
3
3
 
4
4
  #ifdef __cplusplus
5
5
  extern "C" {
6
6
  #endif
7
7
 
8
- #include <cublas_v2.h>
8
+ /* TODO: Do we need this?
9
+
9
10
  #include <cuda.h>
11
+
12
+ */
13
+
14
+ #include <cublas_v2.h>
10
15
  #include <cuda_runtime_api.h>
11
16
  #include <cusparse.h>
12
17
 
@@ -31,11 +36,9 @@ extern "C" {
31
36
  #ifndef SFLOAT
32
37
  #define CUBLAS(x) cublasD##x
33
38
  #define CUBLASI(x) cublasId##x
34
- #define CUSPARSE(x) cusparseD##x
35
39
  #else
36
40
  #define CUBLAS(x) cublasS##x
37
41
  #define CUBLASI(x) cublasIs##x
38
- #define CUSPARSE(x) cusparseS##x
39
42
  #endif
40
43
  #define CUSPARSE_GEN(x) cusparse##x
41
44
  #else
@@ -46,9 +49,6 @@ extern "C" {
46
49
  #define CUBLASI(x) \
47
50
  CUDA_CHECK_ERR; \
48
51
  cublasId##x
49
- #define CUSPARSE(x) \
50
- CUDA_CHECK_ERR; \
51
- cusparseD##x
52
52
  #else
53
53
  #define CUBLAS(x) \
54
54
  CUDA_CHECK_ERR; \
@@ -56,9 +56,6 @@ extern "C" {
56
56
  #define CUBLASI(x) \
57
57
  CUDA_CHECK_ERR; \
58
58
  cublasIs##x
59
- #define CUSPARSE(x) \
60
- CUDA_CHECK_ERR; \
61
- cusparseS##x
62
59
  #endif
63
60
  #define CUSPARSE_GEN(x) \
64
61
  CUDA_CHECK_ERR; \
@@ -35,63 +35,77 @@ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
35
35
  }
36
36
  */
37
37
 
38
- /* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
39
- static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
38
+ /* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
39
+ /* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
40
+ static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
40
41
  scs_int i, k;
41
42
  const ScsMatrix *A = p->A;
42
43
  const ScsMatrix *P = p->P;
43
- scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
44
+ scs_float *M = p->M;
44
45
 
45
46
  #if VERBOSITY > 0
46
47
  scs_printf("getting pre-conditioner\n");
47
48
  #endif
48
49
 
50
+ /* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
49
51
  for (i = 0; i < A->n; ++i) { /* cols */
50
- M[i] = p->rho_x;
51
- /* diag(A' R_y^{-1} A) */
52
+ /* M_ii = (R_x)_i */
53
+ M[i] = diag_r[i];
54
+ /* M_ii += a_i' (R_y)^-1 a_i */
52
55
  for (k = A->p[i]; k < A->p[i + 1]; ++k) {
53
56
  /* A->i[k] is row of entry k with value A->x[k] */
54
- M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
57
+ M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
55
58
  }
56
59
  if (P) {
57
60
  for (k = P->p[i]; k < P->p[i + 1]; k++) {
58
61
  /* diagonal element only */
59
62
  if (P->i[k] == i) { /* row == col */
63
+ /* M_ii += P_ii */
60
64
  M[i] += P->x[k];
61
65
  break;
62
66
  }
63
67
  }
64
68
  }
69
+ /* finally invert for pre-conditioner */
65
70
  M[i] = 1. / M[i];
66
71
  }
67
- cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
68
- scs_free(M);
72
+ cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
69
73
  #if VERBOSITY > 0
70
74
  scs_printf("finished getting pre-conditioner\n");
71
75
  #endif
72
76
  }
73
77
 
74
78
  /* diag_r changed: copy R_x and 1/R_y to the GPU, then rebuild M */
75
- void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
79
+ void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
76
80
  scs_int i;
81
+
82
+ /* R_x to gpu */
83
+ cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
84
+ cudaMemcpyHostToDevice);
85
+
86
+ /* 1/R_y to gpu */
77
87
  for (i = 0; i < p->m; ++i)
78
- p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
79
- cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
88
+ p->inv_r_y[i] = 1. / diag_r[p->n + i];
89
+ cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
80
90
  cudaMemcpyHostToDevice);
81
- set_preconditioner(p, rho_y_vec);
91
+
92
+ /* set preconditioner M on gpu */
93
+ set_preconditioner(p, diag_r);
82
94
  }
83
95
 
84
96
  void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
85
97
  if (p) {
86
- scs_free(p->inv_rho_y_vec);
98
+ scs_free(p->M);
99
+ scs_free(p->inv_r_y);
87
100
  cudaFree(p->p);
88
101
  cudaFree(p->r);
89
102
  cudaFree(p->Gp);
90
103
  cudaFree(p->bg);
91
104
  cudaFree(p->tmp_m);
92
105
  cudaFree(p->z);
93
- cudaFree(p->M);
94
- cudaFree(p->inv_rho_y_vec_gpu);
106
+ cudaFree(p->M_gpu);
107
+ cudaFree(p->r_x_gpu);
108
+ cudaFree(p->inv_r_y_gpu);
95
109
  if (p->Pg) {
96
110
  SCS(free_gpu_matrix)(p->Pg);
97
111
  scs_free(p->Pg);
@@ -126,22 +140,23 @@ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
126
140
  0, M, 1, z, 1);
127
141
  }
128
142
 
129
- /* y = (rho_x * I + P + A' R_y^{-1} A) x */
143
+ /* y = (R_x + P + A' R_y^{-1} A) x */
130
144
  static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
131
145
  /* x and y MUST already be loaded to GPU */
132
146
  scs_float *z = p->tmp_m; /* temp memory */
133
- cudaMemset(y, 0, p->n * sizeof(scs_float));
134
147
  cudaMemset(z, 0, p->m * sizeof(scs_float));
135
148
 
136
149
  cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
137
150
  cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
138
151
  cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
139
152
 
140
- /* y = rho_x * x */
141
- CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
153
+ /* y = x */
154
+ cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyHostToDevice);
155
+ /* y = R_x * x */
156
+ scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
142
157
 
143
158
  if (p->Pg) {
144
- /* y = rho_x * x + Px */
159
+ /* y = R_x * x + P x */
145
160
  SCS(accum_by_p_gpu)
146
161
  (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
147
162
  &p->buffer);
@@ -158,9 +173,9 @@ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
158
173
  &p->buffer);
159
174
  #endif
160
175
  /* z = R_y^{-1} A x */
161
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
176
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);
162
177
 
163
- /* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
178
+ /* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
164
179
  SCS(accum_by_atrans_gpu)
165
180
  (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
166
181
  &p->buffer);
@@ -201,19 +216,35 @@ static csc *fill_p_matrix(const ScsMatrix *P) {
201
216
  }
202
217
 
203
218
  ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
204
- scs_float *rho_y_vec, scs_float rho_x) {
219
+ const scs_float *diag_r) {
205
220
  cudaError_t err;
206
- scs_int i;
207
221
  csc *P_full;
208
- ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
209
- ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
222
+ ScsLinSysWork *p = SCS_NULL;
223
+ ScsGpuMatrix *Ag = SCS_NULL;
210
224
  ScsGpuMatrix *Pg = SCS_NULL;
225
+ int device_count;
226
+
227
+ err = cudaGetDeviceCount(&device_count);
228
+ if (err > 0) {
229
+ scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
230
+ return SCS_NULL;
231
+ }
232
+
233
+ p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
234
+ Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
235
+
236
+ p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
237
+ p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
238
+
239
+ p->A = A;
240
+ p->P = P;
241
+ p->m = A->m;
242
+ p->n = A->n;
211
243
 
212
244
  #if GPU_TRANSPOSE_MAT > 0
213
245
  size_t new_buffer_size = 0;
214
246
  #endif
215
247
 
216
- p->rho_x = rho_x;
217
248
  p->cublas_handle = 0;
218
249
  p->cusparse_handle = 0;
219
250
 
@@ -242,8 +273,9 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
242
273
  cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
243
274
  cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
244
275
  cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
245
- cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
246
- cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
276
+ cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
277
+ cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
278
+ cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));
247
279
 
248
280
  cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
249
281
  cudaMemcpyHostToDevice);
@@ -251,12 +283,6 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
251
283
  cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
252
284
  cudaMemcpyHostToDevice);
253
285
 
254
- p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
255
- for (i = 0; i < A->m; ++i)
256
- p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
257
- cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
258
- cudaMemcpyHostToDevice);
259
-
260
286
  cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
261
287
  SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
262
288
  CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
@@ -297,7 +323,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
297
323
  cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
298
324
  cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
299
325
 
300
- set_preconditioner(p, rho_y_vec);
326
+ /* Form preconditioner and copy R_x, 1/R_y to gpu */
327
+ SCS(update_lin_sys_diag_r)(p, diag_r);
301
328
 
302
329
  #if GPU_TRANSPOSE_MAT > 0
303
330
  p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
@@ -346,9 +373,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
346
373
  return p;
347
374
  }
348
375
 
349
- /* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
350
- * b */
351
- /* on GPU */
376
+ /* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
377
+ * b, on GPU */
352
378
  static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
353
379
  scs_int max_its, scs_float tol) {
354
380
  scs_int i, n = pr->n;
@@ -386,7 +412,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
386
412
 
387
413
  /* z = M r */
388
414
  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
389
- scale_by_diag(cublas_handle, pr->M, z, n);
415
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
390
416
  /* ztr = z'r */
391
417
  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
392
418
  /* p = z */
@@ -415,7 +441,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
415
441
  }
416
442
  /* z = M r */
417
443
  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
418
- scale_by_diag(cublas_handle, pr->M, z, n);
444
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
419
445
  ztr_prev = ztr;
420
446
  /* ztr = z'r */
421
447
  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
@@ -431,14 +457,12 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
431
457
  /* solves Mx = b, for x but stores result in b */
432
458
  /* s contains warm-start (if available) */
433
459
  /*
434
- * [x] = [rho_x I + P A' ]^{-1} [rx]
460
+ * [x] = [R_x + P A' ]^{-1} [rx]
435
461
  * [y] [ A -R_y ] [ry]
436
462
  *
437
- * R_y = diag(rho_y_vec)
438
- *
439
463
  * becomes:
440
464
  *
441
- * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
465
+ * x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
442
466
  * y = R_y^{-1} (Ax - ry)
443
467
  *
444
468
  */
@@ -451,7 +475,6 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
451
475
  scs_float *bg = p->bg;
452
476
  scs_float *tmp_m = p->tmp_m;
453
477
  ScsGpuMatrix *Ag = p->Ag;
454
- ScsGpuMatrix *Pg = p->Pg;
455
478
 
456
479
  if (CG_NORM(b, p->n + p->m) <= 1e-12) {
457
480
  memset(b, 0, (p->n + p->m) * sizeof(scs_float));
@@ -471,7 +494,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
471
494
  cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
472
495
  cudaMemcpyDeviceToDevice);
473
496
  /* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
474
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
497
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
475
498
 
476
499
  cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R_y^{-1} * ry */
477
500
  cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
@@ -483,7 +506,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
483
506
  /* set max_iters to 10 * n (though in theory n is enough for any tol) */
484
507
  max_iters = 10 * Ag->n;
485
508
 
486
- /* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
509
+ /* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
487
510
  * in bg */
488
511
  cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
489
512
 
@@ -504,7 +527,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
504
527
  #endif
505
528
 
506
529
  /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
507
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
530
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
508
531
 
509
532
  /* copy bg = [x; y] back to b */
510
533
  cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),