scs 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (92)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +14 -0
  3. data/README.md +42 -13
  4. data/lib/scs/ffi.rb +1 -7
  5. data/lib/scs/matrix.rb +72 -0
  6. data/lib/scs/solver.rb +19 -26
  7. data/lib/scs/version.rb +1 -1
  8. data/lib/scs.rb +1 -0
  9. data/vendor/scs/CITATION.cff +1 -1
  10. data/vendor/scs/CMakeLists.txt +55 -7
  11. data/vendor/scs/Makefile +9 -9
  12. data/vendor/scs/README.md +4 -1
  13. data/vendor/scs/include/aa.h +1 -1
  14. data/vendor/scs/include/cones.h +17 -12
  15. data/vendor/scs/include/glbopts.h +27 -66
  16. data/vendor/scs/include/linalg.h +2 -1
  17. data/vendor/scs/include/linsys.h +13 -13
  18. data/vendor/scs/include/normalize.h +7 -5
  19. data/vendor/scs/include/rw.h +3 -3
  20. data/vendor/scs/include/scs.h +85 -106
  21. data/vendor/scs/include/scs_types.h +34 -0
  22. data/vendor/scs/include/scs_work.h +80 -0
  23. data/vendor/scs/include/util.h +3 -1
  24. data/vendor/scs/linsys/cpu/direct/private.c +86 -73
  25. data/vendor/scs/linsys/cpu/direct/private.h +2 -2
  26. data/vendor/scs/linsys/cpu/indirect/private.c +42 -33
  27. data/vendor/scs/linsys/cpu/indirect/private.h +1 -2
  28. data/vendor/scs/linsys/csparse.c +3 -3
  29. data/vendor/scs/linsys/external/amd/LICENSE.txt +0 -897
  30. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +9 -7
  31. data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +1 -1
  32. data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
  33. data/vendor/scs/linsys/gpu/gpu.h +8 -11
  34. data/vendor/scs/linsys/gpu/indirect/private.c +72 -49
  35. data/vendor/scs/linsys/gpu/indirect/private.h +14 -13
  36. data/vendor/scs/linsys/scs_matrix.c +55 -104
  37. data/vendor/scs/linsys/scs_matrix.h +5 -4
  38. data/vendor/scs/scs.mk +1 -5
  39. data/vendor/scs/src/aa.c +13 -8
  40. data/vendor/scs/src/cones.c +197 -108
  41. data/vendor/scs/src/linalg.c +25 -0
  42. data/vendor/scs/src/normalize.c +75 -26
  43. data/vendor/scs/src/rw.c +74 -30
  44. data/vendor/scs/src/scs.c +300 -264
  45. data/vendor/scs/src/scs_version.c +8 -6
  46. data/vendor/scs/src/util.c +27 -13
  47. data/vendor/scs/test/minunit.h +6 -1
  48. data/vendor/scs/test/problem_utils.h +28 -35
  49. data/vendor/scs/test/problems/degenerate.h +2 -1
  50. data/vendor/scs/test/problems/hs21_tiny_qp.h +2 -1
  51. data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +6 -2
  52. data/vendor/scs/test/problems/infeasible_tiny_qp.h +2 -1
  53. data/vendor/scs/test/problems/qafiro_tiny_qp.h +5 -4
  54. data/vendor/scs/test/problems/random_prob.h +6 -2
  55. data/vendor/scs/test/problems/rob_gauss_cov_est.h +9 -2
  56. data/vendor/scs/test/problems/small_lp.h +7 -2
  57. data/vendor/scs/test/problems/small_qp.h +387 -0
  58. data/vendor/scs/test/problems/{test_fails.h → test_validation.h} +7 -4
  59. data/vendor/scs/test/problems/unbounded_tiny_qp.h +4 -4
  60. data/vendor/scs/test/random_socp_prob.c +4 -2
  61. data/vendor/scs/test/run_from_file.c +16 -4
  62. data/vendor/scs/test/run_tests.c +23 -14
  63. metadata +10 -35
  64. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  65. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  66. data/vendor/scs/linsys/csparse.o +0 -0
  67. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  68. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  69. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  70. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  71. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  72. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  73. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  74. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  75. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  76. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  77. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  78. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  79. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  80. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  81. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  82. data/vendor/scs/linsys/scs_matrix.o +0 -0
  83. data/vendor/scs/src/aa.o +0 -0
  84. data/vendor/scs/src/cones.o +0 -0
  85. data/vendor/scs/src/ctrlc.o +0 -0
  86. data/vendor/scs/src/linalg.o +0 -0
  87. data/vendor/scs/src/normalize.o +0 -0
  88. data/vendor/scs/src/rw.o +0 -0
  89. data/vendor/scs/src/scs.o +0 -0
  90. data/vendor/scs/src/scs_indir.o +0 -0
  91. data/vendor/scs/src/scs_version.o +0 -0
  92. data/vendor/scs/src/util.o +0 -0
@@ -18,7 +18,6 @@
18
18
 
19
19
  #ifdef MATLAB_MEX_FILE
20
20
  #include "mex.h"
21
- #include "scs_matrix.h"
22
21
  #endif
23
22
 
24
23
  #ifndef NULL
@@ -51,7 +50,9 @@
51
50
 
52
51
  struct SuiteSparse_config_struct SuiteSparse_config =
53
52
  {
54
- _scs_malloc, _scs_calloc, _scs_realloc, _scs_free, _scs_printf,
53
+ scs_malloc, scs_calloc, scs_realloc, scs_free,
54
+ /* Disable printing */
55
+ SCS_NULL,
55
56
  SuiteSparse_hypot,
56
57
  SuiteSparse_divcomplex
57
58
 
@@ -73,13 +74,14 @@ struct SuiteSparse_config_struct SuiteSparse_config =
73
74
  SuiteSparse_start be called prior to calling any SuiteSparse function.
74
75
  */
75
76
 
77
+
76
78
  void SuiteSparse_start ( void )
77
79
  {
78
- SuiteSparse_config.malloc_func = _scs_malloc ;
79
- SuiteSparse_config.calloc_func = _scs_calloc ;
80
- SuiteSparse_config.realloc_func = _scs_realloc ;
81
- SuiteSparse_config.free_func = _scs_free ;
82
- SuiteSparse_config.printf_func = _scs_printf ;
80
+ SuiteSparse_config.malloc_func = scs_malloc ;
81
+ SuiteSparse_config.calloc_func = scs_calloc ;
82
+ SuiteSparse_config.realloc_func = scs_realloc ;
83
+ SuiteSparse_config.free_func = scs_free ;
84
+ SuiteSparse_config.printf_func = SCS_NULL;
83
85
  /* math functions */
84
86
  SuiteSparse_config.hypot_func = SuiteSparse_hypot ;
85
87
  SuiteSparse_config.divcomplex_func = SuiteSparse_divcomplex ;
@@ -44,7 +44,7 @@ extern "C" {
44
44
 
45
45
  #include <limits.h>
46
46
  #include <stdlib.h>
47
- #include "scs.h"
47
+ #include "glbopts.h"
48
48
  #include "ctrlc.h"
49
49
 
50
50
  /* ========================================================================== */
@@ -89,8 +89,8 @@ GLOBAL Int AMD_order
89
89
  }
90
90
 
91
91
  /* allocate two size-n integer workspaces */
92
- Len = SuiteSparse_malloc (n, sizeof (Int)) ;
93
- Pinv = SuiteSparse_malloc (n, sizeof (Int)) ;
92
+ Len = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
93
+ Pinv = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
94
94
  mem += n ;
95
95
  mem += n ;
96
96
  if (!Len || !Pinv)
@@ -106,8 +106,8 @@ GLOBAL Int AMD_order
106
106
  {
107
107
  /* sort the input matrix and remove duplicate entries */
108
108
  AMD_DEBUG1 (("Matrix is jumbled\n")) ;
109
- Rp = SuiteSparse_malloc (n+1, sizeof (Int)) ;
110
- Ri = SuiteSparse_malloc (nz, sizeof (Int)) ;
109
+ Rp = (Int *)SuiteSparse_malloc (n+1, sizeof (Int)) ;
110
+ Ri = (Int *)SuiteSparse_malloc (nz, sizeof (Int)) ;
111
111
  mem += (n+1) ;
112
112
  mem += MAX (nz,1) ;
113
113
  if (!Rp || !Ri)
@@ -160,7 +160,7 @@ GLOBAL Int AMD_order
160
160
  ok = ok && (slen < Int_MAX) ; /* S[i] for Int i must be OK */
161
161
  if (ok)
162
162
  {
163
- S = SuiteSparse_malloc (slen, sizeof (Int)) ;
163
+ S = (Int *)SuiteSparse_malloc (slen, sizeof (Int)) ;
164
164
  }
165
165
  AMD_DEBUG1 (("slen %g\n", (scs_float) slen)) ;
166
166
  if (!S)
@@ -1,12 +1,17 @@
1
- #ifndef SCSGPU_H_GUARD
2
- #define SCSGPU_H_GUARD
1
+ #ifndef SCS_GPU_H_GUARD
2
+ #define SCS_GPU_H_GUARD
3
3
 
4
4
  #ifdef __cplusplus
5
5
  extern "C" {
6
6
  #endif
7
7
 
8
- #include <cublas_v2.h>
8
+ /* TODO: Do we need this?
9
+
9
10
  #include <cuda.h>
11
+
12
+ */
13
+
14
+ #include <cublas_v2.h>
10
15
  #include <cuda_runtime_api.h>
11
16
  #include <cusparse.h>
12
17
 
@@ -31,11 +36,9 @@ extern "C" {
31
36
  #ifndef SFLOAT
32
37
  #define CUBLAS(x) cublasD##x
33
38
  #define CUBLASI(x) cublasId##x
34
- #define CUSPARSE(x) cusparseD##x
35
39
  #else
36
40
  #define CUBLAS(x) cublasS##x
37
41
  #define CUBLASI(x) cublasIs##x
38
- #define CUSPARSE(x) cusparseS##x
39
42
  #endif
40
43
  #define CUSPARSE_GEN(x) cusparse##x
41
44
  #else
@@ -46,9 +49,6 @@ extern "C" {
46
49
  #define CUBLASI(x) \
47
50
  CUDA_CHECK_ERR; \
48
51
  cublasId##x
49
- #define CUSPARSE(x) \
50
- CUDA_CHECK_ERR; \
51
- cusparseD##x
52
52
  #else
53
53
  #define CUBLAS(x) \
54
54
  CUDA_CHECK_ERR; \
@@ -56,9 +56,6 @@ extern "C" {
56
56
  #define CUBLASI(x) \
57
57
  CUDA_CHECK_ERR; \
58
58
  cublasIs##x
59
- #define CUSPARSE(x) \
60
- CUDA_CHECK_ERR; \
61
- cusparseS##x
62
59
  #endif
63
60
  #define CUSPARSE_GEN(x) \
64
61
  CUDA_CHECK_ERR; \
@@ -35,63 +35,77 @@ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
35
35
  }
36
36
  */
37
37
 
38
- /* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
39
- static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
38
+ /* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
39
+ /* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
40
+ static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
40
41
  scs_int i, k;
41
42
  const ScsMatrix *A = p->A;
42
43
  const ScsMatrix *P = p->P;
43
- scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
44
+ scs_float *M = p->M;
44
45
 
45
46
  #if VERBOSITY > 0
46
47
  scs_printf("getting pre-conditioner\n");
47
48
  #endif
48
49
 
50
+ /* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
49
51
  for (i = 0; i < A->n; ++i) { /* cols */
50
- M[i] = p->rho_x;
51
- /* diag(A' R_y^{-1} A) */
52
+ /* M_ii = (R_x)_i */
53
+ M[i] = diag_r[i];
54
+ /* M_ii += a_i' (R_y)^-1 a_i */
52
55
  for (k = A->p[i]; k < A->p[i + 1]; ++k) {
53
56
  /* A->i[k] is row of entry k with value A->x[k] */
54
- M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
57
+ M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
55
58
  }
56
59
  if (P) {
57
60
  for (k = P->p[i]; k < P->p[i + 1]; k++) {
58
61
  /* diagonal element only */
59
62
  if (P->i[k] == i) { /* row == col */
63
+ /* M_ii += P_ii */
60
64
  M[i] += P->x[k];
61
65
  break;
62
66
  }
63
67
  }
64
68
  }
69
+ /* finally invert for pre-conditioner */
65
70
  M[i] = 1. / M[i];
66
71
  }
67
- cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
68
- scs_free(M);
72
+ cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
69
73
  #if VERBOSITY > 0
70
74
  scs_printf("finished getting pre-conditioner\n");
71
75
  #endif
72
76
  }
73
77
 
74
78
  /* no need to update anything in this case */
75
- void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
79
+ void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
76
80
  scs_int i;
81
+
82
+ /* R_x to gpu */
83
+ cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
84
+ cudaMemcpyHostToDevice);
85
+
86
+ /* 1/R_y to gpu */
77
87
  for (i = 0; i < p->m; ++i)
78
- p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
79
- cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
88
+ p->inv_r_y[i] = 1. / diag_r[p->n + i];
89
+ cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
80
90
  cudaMemcpyHostToDevice);
81
- set_preconditioner(p, rho_y_vec);
91
+
92
+ /* set preconditioner M on gpu */
93
+ set_preconditioner(p, diag_r);
82
94
  }
83
95
 
84
96
  void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
85
97
  if (p) {
86
- scs_free(p->inv_rho_y_vec);
98
+ scs_free(p->M);
99
+ scs_free(p->inv_r_y);
87
100
  cudaFree(p->p);
88
101
  cudaFree(p->r);
89
102
  cudaFree(p->Gp);
90
103
  cudaFree(p->bg);
91
104
  cudaFree(p->tmp_m);
92
105
  cudaFree(p->z);
93
- cudaFree(p->M);
94
- cudaFree(p->inv_rho_y_vec_gpu);
106
+ cudaFree(p->M_gpu);
107
+ cudaFree(p->r_x_gpu);
108
+ cudaFree(p->inv_r_y_gpu);
95
109
  if (p->Pg) {
96
110
  SCS(free_gpu_matrix)(p->Pg);
97
111
  scs_free(p->Pg);
@@ -126,22 +140,23 @@ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
126
140
  0, M, 1, z, 1);
127
141
  }
128
142
 
129
- /* y = (rho_x * I + P + A' R_y^{-1} A) x */
143
+ /* y = (R_x + P + A' R_y^{-1} A) x */
130
144
  static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
131
145
  /* x and y MUST already be loaded to GPU */
132
146
  scs_float *z = p->tmp_m; /* temp memory */
133
- cudaMemset(y, 0, p->n * sizeof(scs_float));
134
147
  cudaMemset(z, 0, p->m * sizeof(scs_float));
135
148
 
136
149
  cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
137
150
  cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
138
151
  cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
139
152
 
140
- /* y = rho_x * x */
141
- CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
153
+ /* y = x */
154
+ cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyHostToDevice);
155
+ /* y = R_x * x */
156
+ scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
142
157
 
143
158
  if (p->Pg) {
144
- /* y = rho_x * x + Px */
159
+ /* y = R_x * x + P x */
145
160
  SCS(accum_by_p_gpu)
146
161
  (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
147
162
  &p->buffer);
@@ -158,9 +173,9 @@ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
158
173
  &p->buffer);
159
174
  #endif
160
175
  /* z = R_y^{-1} A x */
161
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
176
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);
162
177
 
163
- /* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
178
+ /* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
164
179
  SCS(accum_by_atrans_gpu)
165
180
  (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
166
181
  &p->buffer);
@@ -201,19 +216,35 @@ static csc *fill_p_matrix(const ScsMatrix *P) {
201
216
  }
202
217
 
203
218
  ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
204
- scs_float *rho_y_vec, scs_float rho_x) {
219
+ const scs_float *diag_r) {
205
220
  cudaError_t err;
206
- scs_int i;
207
221
  csc *P_full;
208
- ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
209
- ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
222
+ ScsLinSysWork *p = SCS_NULL;
223
+ ScsGpuMatrix *Ag = SCS_NULL;
210
224
  ScsGpuMatrix *Pg = SCS_NULL;
225
+ int device_count;
226
+
227
+ err = cudaGetDeviceCount(&device_count);
228
+ if (err > 0) {
229
+ scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
230
+ return SCS_NULL;
231
+ }
232
+
233
+ p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
234
+ Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
235
+
236
+ p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
237
+ p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
238
+
239
+ p->A = A;
240
+ p->P = P;
241
+ p->m = A->m;
242
+ p->n = A->n;
211
243
 
212
244
  #if GPU_TRANSPOSE_MAT > 0
213
245
  size_t new_buffer_size = 0;
214
246
  #endif
215
247
 
216
- p->rho_x = rho_x;
217
248
  p->cublas_handle = 0;
218
249
  p->cusparse_handle = 0;
219
250
 
@@ -242,8 +273,9 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
242
273
  cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
243
274
  cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
244
275
  cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
245
- cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
246
- cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
276
+ cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
277
+ cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
278
+ cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));
247
279
 
248
280
  cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
249
281
  cudaMemcpyHostToDevice);
@@ -251,12 +283,6 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
251
283
  cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
252
284
  cudaMemcpyHostToDevice);
253
285
 
254
- p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
255
- for (i = 0; i < A->m; ++i)
256
- p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
257
- cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
258
- cudaMemcpyHostToDevice);
259
-
260
286
  cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
261
287
  SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
262
288
  CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
@@ -297,7 +323,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
297
323
  cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
298
324
  cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
299
325
 
300
- set_preconditioner(p, rho_y_vec);
326
+ /* Form preconditioner and copy R_x, 1/R_y to gpu */
327
+ SCS(update_lin_sys_diag_r)(p, diag_r);
301
328
 
302
329
  #if GPU_TRANSPOSE_MAT > 0
303
330
  p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
@@ -346,9 +373,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
346
373
  return p;
347
374
  }
348
375
 
349
- /* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
350
- * b */
351
- /* on GPU */
376
+ /* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
377
+ * b, on GPU */
352
378
  static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
353
379
  scs_int max_its, scs_float tol) {
354
380
  scs_int i, n = pr->n;
@@ -386,7 +412,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
386
412
 
387
413
  /* z = M r */
388
414
  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
389
- scale_by_diag(cublas_handle, pr->M, z, n);
415
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
390
416
  /* ztr = z'r */
391
417
  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
392
418
  /* p = z */
@@ -415,7 +441,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
415
441
  }
416
442
  /* z = M r */
417
443
  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
418
- scale_by_diag(cublas_handle, pr->M, z, n);
444
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
419
445
  ztr_prev = ztr;
420
446
  /* ztr = z'r */
421
447
  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
@@ -431,14 +457,12 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
431
457
  /* solves Mx = b, for x but stores result in b */
432
458
  /* s contains warm-start (if available) */
433
459
  /*
434
- * [x] = [rho_x I + P A' ]^{-1} [rx]
460
+ * [x] = [R_x + P A' ]^{-1} [rx]
435
461
  * [y] [ A -R_y ] [ry]
436
462
  *
437
- * R_y = diag(rho_y_vec)
438
- *
439
463
  * becomes:
440
464
  *
441
- * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
465
+ * x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
442
466
  * y = R_y^{-1} (Ax - ry)
443
467
  *
444
468
  */
@@ -451,7 +475,6 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
451
475
  scs_float *bg = p->bg;
452
476
  scs_float *tmp_m = p->tmp_m;
453
477
  ScsGpuMatrix *Ag = p->Ag;
454
- ScsGpuMatrix *Pg = p->Pg;
455
478
 
456
479
  if (CG_NORM(b, p->n + p->m) <= 1e-12) {
457
480
  memset(b, 0, (p->n + p->m) * sizeof(scs_float));
@@ -471,7 +494,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
471
494
  cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
472
495
  cudaMemcpyDeviceToDevice);
473
496
  /* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
474
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
497
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
475
498
 
476
499
  cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
477
500
  cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
@@ -483,7 +506,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
483
506
  /* set max_iters to 10 * n (though in theory n is enough for any tol) */
484
507
  max_iters = 10 * Ag->n;
485
508
 
486
- /* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
509
+ /* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
487
510
  * in bg */
488
511
  cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
489
512
 
@@ -504,7 +527,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
504
527
  #endif
505
528
 
506
529
  /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
507
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
530
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
508
531
 
509
532
  /* copy bg = [x; y] back to b */
510
533
  cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
@@ -15,19 +15,20 @@ struct SCS_LIN_SYS_WORK {
15
15
  scs_int n, m; /* linear system dimensions */
16
16
  /* reporting */
17
17
  scs_int tot_cg_its;
18
+ scs_float *M; /* preconditioner on cpu */
18
19
  /* ALL BELOW HOSTED ON THE GPU */
19
- scs_float *p; /* cg iterate, n */
20
- scs_float *r; /* cg residual, n */
21
- scs_float *Gp; /* G * p, n */
22
- scs_float *bg; /* b, n */
23
- scs_float *tmp_m; /* m, used in mat_vec */
24
- scs_float *z; /* preconditioned */
25
- scs_float *M; /* preconditioner */
20
+ scs_float *p; /* cg iterate, n */
21
+ scs_float *r; /* cg residual, n */
22
+ scs_float *Gp; /* G * p, n */
23
+ scs_float *bg; /* b, n */
24
+ scs_float *tmp_m; /* m, used in mat_vec */
25
+ scs_float *z; /* preconditioned */
26
+ scs_float *M_gpu; /* preconditioner */
26
27
  const ScsMatrix *A; /* does *not* own this memory */
27
28
  const ScsMatrix *P; /* does *not* own this memory */
28
- ScsGpuMatrix *Ag; /* A matrix on GPU */
29
- ScsGpuMatrix *Agt; /* A trans matrix on GPU */
30
- ScsGpuMatrix *Pg; /* P matrix on GPU */
29
+ ScsGpuMatrix *Ag; /* A matrix on GPU */
30
+ ScsGpuMatrix *Agt; /* A trans matrix on GPU */
31
+ ScsGpuMatrix *Pg; /* P matrix on GPU */
31
32
  /* CUDA */
32
33
  cublasHandle_t cublas_handle;
33
34
  cusparseHandle_t cusparse_handle;
@@ -39,9 +40,9 @@ struct SCS_LIN_SYS_WORK {
39
40
  cusparseDnVecDescr_t dn_vec_n_p; /* Dense vector of length n */
40
41
 
41
42
  /* rho terms */
42
- scs_float rho_x;
43
- scs_float *inv_rho_y_vec; /* inverse rho_y_vec */
44
- scs_float *inv_rho_y_vec_gpu; /* inverse rho_y_vec on GPU */
43
+ scs_float *r_x_gpu;
44
+ scs_float *inv_r_y; /* inverse R_y */
45
+ scs_float *inv_r_y_gpu; /* inverse R_y on GPU */
45
46
  };
46
47
 
47
48
  #ifdef __cplusplus