scs 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +14 -0
  3. data/README.md +42 -13
  4. data/lib/scs/ffi.rb +1 -7
  5. data/lib/scs/matrix.rb +72 -0
  6. data/lib/scs/solver.rb +19 -26
  7. data/lib/scs/version.rb +1 -1
  8. data/lib/scs.rb +1 -0
  9. data/vendor/scs/CITATION.cff +1 -1
  10. data/vendor/scs/CMakeLists.txt +55 -7
  11. data/vendor/scs/Makefile +9 -9
  12. data/vendor/scs/README.md +4 -1
  13. data/vendor/scs/include/aa.h +1 -1
  14. data/vendor/scs/include/cones.h +17 -12
  15. data/vendor/scs/include/glbopts.h +27 -66
  16. data/vendor/scs/include/linalg.h +2 -1
  17. data/vendor/scs/include/linsys.h +13 -13
  18. data/vendor/scs/include/normalize.h +7 -5
  19. data/vendor/scs/include/rw.h +3 -3
  20. data/vendor/scs/include/scs.h +85 -106
  21. data/vendor/scs/include/scs_types.h +34 -0
  22. data/vendor/scs/include/scs_work.h +80 -0
  23. data/vendor/scs/include/util.h +3 -1
  24. data/vendor/scs/linsys/cpu/direct/private.c +86 -73
  25. data/vendor/scs/linsys/cpu/direct/private.h +2 -2
  26. data/vendor/scs/linsys/cpu/indirect/private.c +42 -33
  27. data/vendor/scs/linsys/cpu/indirect/private.h +1 -2
  28. data/vendor/scs/linsys/csparse.c +3 -3
  29. data/vendor/scs/linsys/external/amd/LICENSE.txt +0 -897
  30. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +9 -7
  31. data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +1 -1
  32. data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
  33. data/vendor/scs/linsys/gpu/gpu.h +8 -11
  34. data/vendor/scs/linsys/gpu/indirect/private.c +72 -49
  35. data/vendor/scs/linsys/gpu/indirect/private.h +14 -13
  36. data/vendor/scs/linsys/scs_matrix.c +55 -104
  37. data/vendor/scs/linsys/scs_matrix.h +5 -4
  38. data/vendor/scs/scs.mk +1 -5
  39. data/vendor/scs/src/aa.c +13 -8
  40. data/vendor/scs/src/cones.c +197 -108
  41. data/vendor/scs/src/linalg.c +25 -0
  42. data/vendor/scs/src/normalize.c +75 -26
  43. data/vendor/scs/src/rw.c +74 -30
  44. data/vendor/scs/src/scs.c +300 -264
  45. data/vendor/scs/src/scs_version.c +8 -6
  46. data/vendor/scs/src/util.c +27 -13
  47. data/vendor/scs/test/minunit.h +6 -1
  48. data/vendor/scs/test/problem_utils.h +28 -35
  49. data/vendor/scs/test/problems/degenerate.h +2 -1
  50. data/vendor/scs/test/problems/hs21_tiny_qp.h +2 -1
  51. data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +6 -2
  52. data/vendor/scs/test/problems/infeasible_tiny_qp.h +2 -1
  53. data/vendor/scs/test/problems/qafiro_tiny_qp.h +5 -4
  54. data/vendor/scs/test/problems/random_prob.h +6 -2
  55. data/vendor/scs/test/problems/rob_gauss_cov_est.h +9 -2
  56. data/vendor/scs/test/problems/small_lp.h +7 -2
  57. data/vendor/scs/test/problems/small_qp.h +387 -0
  58. data/vendor/scs/test/problems/{test_fails.h → test_validation.h} +7 -4
  59. data/vendor/scs/test/problems/unbounded_tiny_qp.h +4 -4
  60. data/vendor/scs/test/random_socp_prob.c +4 -2
  61. data/vendor/scs/test/run_from_file.c +16 -4
  62. data/vendor/scs/test/run_tests.c +23 -14
  63. metadata +10 -35
  64. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  65. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  66. data/vendor/scs/linsys/csparse.o +0 -0
  67. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  68. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  69. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  70. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  71. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  72. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  73. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  74. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  75. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  76. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  77. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  78. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  79. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  80. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  81. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  82. data/vendor/scs/linsys/scs_matrix.o +0 -0
  83. data/vendor/scs/src/aa.o +0 -0
  84. data/vendor/scs/src/cones.o +0 -0
  85. data/vendor/scs/src/ctrlc.o +0 -0
  86. data/vendor/scs/src/linalg.o +0 -0
  87. data/vendor/scs/src/normalize.o +0 -0
  88. data/vendor/scs/src/rw.o +0 -0
  89. data/vendor/scs/src/scs.o +0 -0
  90. data/vendor/scs/src/scs_indir.o +0 -0
  91. data/vendor/scs/src/scs_version.o +0 -0
  92. data/vendor/scs/src/util.o +0 -0
@@ -18,7 +18,6 @@
18
18
 
19
19
  #ifdef MATLAB_MEX_FILE
20
20
  #include "mex.h"
21
- #include "scs_matrix.h"
22
21
  #endif
23
22
 
24
23
  #ifndef NULL
@@ -51,7 +50,9 @@
51
50
 
52
51
  struct SuiteSparse_config_struct SuiteSparse_config =
53
52
  {
54
- _scs_malloc, _scs_calloc, _scs_realloc, _scs_free, _scs_printf,
53
+ scs_malloc, scs_calloc, scs_realloc, scs_free,
54
+ /* Disable printing */
55
+ SCS_NULL,
55
56
  SuiteSparse_hypot,
56
57
  SuiteSparse_divcomplex
57
58
 
@@ -73,13 +74,14 @@ struct SuiteSparse_config_struct SuiteSparse_config =
73
74
  SuiteSparse_start be called prior to calling any SuiteSparse function.
74
75
  */
75
76
 
77
+
76
78
  void SuiteSparse_start ( void )
77
79
  {
78
- SuiteSparse_config.malloc_func = _scs_malloc ;
79
- SuiteSparse_config.calloc_func = _scs_calloc ;
80
- SuiteSparse_config.realloc_func = _scs_realloc ;
81
- SuiteSparse_config.free_func = _scs_free ;
82
- SuiteSparse_config.printf_func = _scs_printf ;
80
+ SuiteSparse_config.malloc_func = scs_malloc ;
81
+ SuiteSparse_config.calloc_func = scs_calloc ;
82
+ SuiteSparse_config.realloc_func = scs_realloc ;
83
+ SuiteSparse_config.free_func = scs_free ;
84
+ SuiteSparse_config.printf_func = SCS_NULL;
83
85
  /* math functions */
84
86
  SuiteSparse_config.hypot_func = SuiteSparse_hypot ;
85
87
  SuiteSparse_config.divcomplex_func = SuiteSparse_divcomplex ;
@@ -44,7 +44,7 @@ extern "C" {
44
44
 
45
45
  #include <limits.h>
46
46
  #include <stdlib.h>
47
- #include "scs.h"
47
+ #include "glbopts.h"
48
48
  #include "ctrlc.h"
49
49
 
50
50
  /* ========================================================================== */
@@ -89,8 +89,8 @@ GLOBAL Int AMD_order
89
89
  }
90
90
 
91
91
  /* allocate two size-n integer workspaces */
92
- Len = SuiteSparse_malloc (n, sizeof (Int)) ;
93
- Pinv = SuiteSparse_malloc (n, sizeof (Int)) ;
92
+ Len = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
93
+ Pinv = (Int *)SuiteSparse_malloc (n, sizeof (Int)) ;
94
94
  mem += n ;
95
95
  mem += n ;
96
96
  if (!Len || !Pinv)
@@ -106,8 +106,8 @@ GLOBAL Int AMD_order
106
106
  {
107
107
  /* sort the input matrix and remove duplicate entries */
108
108
  AMD_DEBUG1 (("Matrix is jumbled\n")) ;
109
- Rp = SuiteSparse_malloc (n+1, sizeof (Int)) ;
110
- Ri = SuiteSparse_malloc (nz, sizeof (Int)) ;
109
+ Rp = (Int *)SuiteSparse_malloc (n+1, sizeof (Int)) ;
110
+ Ri = (Int *)SuiteSparse_malloc (nz, sizeof (Int)) ;
111
111
  mem += (n+1) ;
112
112
  mem += MAX (nz,1) ;
113
113
  if (!Rp || !Ri)
@@ -160,7 +160,7 @@ GLOBAL Int AMD_order
160
160
  ok = ok && (slen < Int_MAX) ; /* S[i] for Int i must be OK */
161
161
  if (ok)
162
162
  {
163
- S = SuiteSparse_malloc (slen, sizeof (Int)) ;
163
+ S = (Int *)SuiteSparse_malloc (slen, sizeof (Int)) ;
164
164
  }
165
165
  AMD_DEBUG1 (("slen %g\n", (scs_float) slen)) ;
166
166
  if (!S)
@@ -1,12 +1,17 @@
1
- #ifndef SCSGPU_H_GUARD
2
- #define SCSGPU_H_GUARD
1
+ #ifndef SCS_GPU_H_GUARD
2
+ #define SCS_GPU_H_GUARD
3
3
 
4
4
  #ifdef __cplusplus
5
5
  extern "C" {
6
6
  #endif
7
7
 
8
- #include <cublas_v2.h>
8
+ /* TODO: Do we need this?
9
+
9
10
  #include <cuda.h>
11
+
12
+ */
13
+
14
+ #include <cublas_v2.h>
10
15
  #include <cuda_runtime_api.h>
11
16
  #include <cusparse.h>
12
17
 
@@ -31,11 +36,9 @@ extern "C" {
31
36
  #ifndef SFLOAT
32
37
  #define CUBLAS(x) cublasD##x
33
38
  #define CUBLASI(x) cublasId##x
34
- #define CUSPARSE(x) cusparseD##x
35
39
  #else
36
40
  #define CUBLAS(x) cublasS##x
37
41
  #define CUBLASI(x) cublasIs##x
38
- #define CUSPARSE(x) cusparseS##x
39
42
  #endif
40
43
  #define CUSPARSE_GEN(x) cusparse##x
41
44
  #else
@@ -46,9 +49,6 @@ extern "C" {
46
49
  #define CUBLASI(x) \
47
50
  CUDA_CHECK_ERR; \
48
51
  cublasId##x
49
- #define CUSPARSE(x) \
50
- CUDA_CHECK_ERR; \
51
- cusparseD##x
52
52
  #else
53
53
  #define CUBLAS(x) \
54
54
  CUDA_CHECK_ERR; \
@@ -56,9 +56,6 @@ extern "C" {
56
56
  #define CUBLASI(x) \
57
57
  CUDA_CHECK_ERR; \
58
58
  cublasIs##x
59
- #define CUSPARSE(x) \
60
- CUDA_CHECK_ERR; \
61
- cusparseS##x
62
59
  #endif
63
60
  #define CUSPARSE_GEN(x) \
64
61
  CUDA_CHECK_ERR; \
@@ -35,63 +35,77 @@ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
35
35
  }
36
36
  */
37
37
 
38
- /* set M = inv ( diag ( rho_x * I + P + A' R_y^{-1} A ) ) */
39
- static void set_preconditioner(ScsLinSysWork *p, scs_float *rho_y_vec) {
38
+ /* Not possible to do this on the fly due to M_ii += a_i' (R_y)^-1 a_i */
39
+ /* set M = inv ( diag ( R_x + P + A' R_y^{-1} A ) ) */
40
+ static void set_preconditioner(ScsLinSysWork *p, const scs_float *diag_r) {
40
41
  scs_int i, k;
41
42
  const ScsMatrix *A = p->A;
42
43
  const ScsMatrix *P = p->P;
43
- scs_float *M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
44
+ scs_float *M = p->M;
44
45
 
45
46
  #if VERBOSITY > 0
46
47
  scs_printf("getting pre-conditioner\n");
47
48
  #endif
48
49
 
50
+ /* M_ii = (R_x)_i + P_ii + a_i' (R_y)^-1 a_i */
49
51
  for (i = 0; i < A->n; ++i) { /* cols */
50
- M[i] = p->rho_x;
51
- /* diag(A' R_y^{-1} A) */
52
+ /* M_ii = (R_x)_i */
53
+ M[i] = diag_r[i];
54
+ /* M_ii += a_i' (R_y)^-1 a_i */
52
55
  for (k = A->p[i]; k < A->p[i + 1]; ++k) {
53
56
  /* A->i[k] is row of entry k with value A->x[k] */
54
- M[i] += A->x[k] * A->x[k] / rho_y_vec[A->i[k]];
57
+ M[i] += A->x[k] * A->x[k] / diag_r[A->n + A->i[k]];
55
58
  }
56
59
  if (P) {
57
60
  for (k = P->p[i]; k < P->p[i + 1]; k++) {
58
61
  /* diagonal element only */
59
62
  if (P->i[k] == i) { /* row == col */
63
+ /* M_ii += P_ii */
60
64
  M[i] += P->x[k];
61
65
  break;
62
66
  }
63
67
  }
64
68
  }
69
+ /* finally invert for pre-conditioner */
65
70
  M[i] = 1. / M[i];
66
71
  }
67
- cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
68
- scs_free(M);
72
+ cudaMemcpy(p->M_gpu, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
69
73
  #if VERBOSITY > 0
70
74
  scs_printf("finished getting pre-conditioner\n");
71
75
  #endif
72
76
  }
73
77
 
74
78
  /* no need to update anything in this case */
75
- void SCS(update_lin_sys_rho_y_vec)(ScsLinSysWork *p, scs_float *rho_y_vec) {
79
+ void SCS(update_lin_sys_diag_r)(ScsLinSysWork *p, const scs_float *diag_r) {
76
80
  scs_int i;
81
+
82
+ /* R_x to gpu */
83
+ cudaMemcpy(p->r_x_gpu, diag_r, p->n * sizeof(scs_float),
84
+ cudaMemcpyHostToDevice);
85
+
86
+ /* 1/R_y to gpu */
77
87
  for (i = 0; i < p->m; ++i)
78
- p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
79
- cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, p->m * sizeof(scs_float),
88
+ p->inv_r_y[i] = 1. / diag_r[p->n + i];
89
+ cudaMemcpy(p->inv_r_y_gpu, p->inv_r_y, p->m * sizeof(scs_float),
80
90
  cudaMemcpyHostToDevice);
81
- set_preconditioner(p, rho_y_vec);
91
+
92
+ /* set preconditioner M on gpu */
93
+ set_preconditioner(p, diag_r);
82
94
  }
83
95
 
84
96
  void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
85
97
  if (p) {
86
- scs_free(p->inv_rho_y_vec);
98
+ scs_free(p->M);
99
+ scs_free(p->inv_r_y);
87
100
  cudaFree(p->p);
88
101
  cudaFree(p->r);
89
102
  cudaFree(p->Gp);
90
103
  cudaFree(p->bg);
91
104
  cudaFree(p->tmp_m);
92
105
  cudaFree(p->z);
93
- cudaFree(p->M);
94
- cudaFree(p->inv_rho_y_vec_gpu);
106
+ cudaFree(p->M_gpu);
107
+ cudaFree(p->r_x_gpu);
108
+ cudaFree(p->inv_r_y_gpu);
95
109
  if (p->Pg) {
96
110
  SCS(free_gpu_matrix)(p->Pg);
97
111
  scs_free(p->Pg);
@@ -126,22 +140,23 @@ static void scale_by_diag(cublasHandle_t cublas_handle, scs_float *M,
126
140
  0, M, 1, z, 1);
127
141
  }
128
142
 
129
- /* y = (rho_x * I + P + A' R_y^{-1} A) x */
143
+ /* y = (R_x + P + A' R_y^{-1} A) x */
130
144
  static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
131
145
  /* x and y MUST already be loaded to GPU */
132
146
  scs_float *z = p->tmp_m; /* temp memory */
133
- cudaMemset(y, 0, p->n * sizeof(scs_float));
134
147
  cudaMemset(z, 0, p->m * sizeof(scs_float));
135
148
 
136
149
  cusparseDnVecSetValues(p->dn_vec_m, (void *)z);
137
150
  cusparseDnVecSetValues(p->dn_vec_n, (void *)x);
138
151
  cusparseDnVecSetValues(p->dn_vec_n_p, (void *)y);
139
152
 
140
- /* y = rho_x * x */
141
- CUBLAS(axpy)(p->cublas_handle, p->n, &(p->rho_x), x, 1, y, 1);
153
+ /* y = x */
154
+ cudaMemcpy(y, x, p->n * sizeof(scs_float), cudaMemcpyHostToDevice);
155
+ /* y = R_x * x */
156
+ scale_by_diag(p->cublas_handle, p->r_x_gpu, y, p->n);
142
157
 
143
158
  if (p->Pg) {
144
- /* y = rho_x * x + Px */
159
+ /* y = R_x * x + P x */
145
160
  SCS(accum_by_p_gpu)
146
161
  (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
147
162
  &p->buffer);
@@ -158,9 +173,9 @@ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {
158
173
  &p->buffer);
159
174
  #endif
160
175
  /* z = R_y^{-1} A x */
161
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, z, p->m);
176
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);
162
177
 
163
- /* y += A'z => y = rho_x * x + Px + A' R_y^{-1} Ax */
178
+ /* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
164
179
  SCS(accum_by_atrans_gpu)
165
180
  (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
166
181
  &p->buffer);
@@ -201,19 +216,35 @@ static csc *fill_p_matrix(const ScsMatrix *P) {
201
216
  }
202
217
 
203
218
  ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
204
- scs_float *rho_y_vec, scs_float rho_x) {
219
+ const scs_float *diag_r) {
205
220
  cudaError_t err;
206
- scs_int i;
207
221
  csc *P_full;
208
- ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
209
- ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
222
+ ScsLinSysWork *p = SCS_NULL;
223
+ ScsGpuMatrix *Ag = SCS_NULL;
210
224
  ScsGpuMatrix *Pg = SCS_NULL;
225
+ int device_count;
226
+
227
+ err = cudaGetDeviceCount(&device_count);
228
+ if (err > 0) {
229
+ scs_printf("cudaError: %i (100 indicates no device)\n", (int)err);
230
+ return SCS_NULL;
231
+ }
232
+
233
+ p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
234
+ Ag = (ScsGpuMatrix *)scs_calloc(1, sizeof(ScsGpuMatrix));
235
+
236
+ p->inv_r_y = (scs_float *)scs_calloc(A->m, sizeof(scs_float));
237
+ p->M = (scs_float *)scs_calloc(A->n, sizeof(scs_float));
238
+
239
+ p->A = A;
240
+ p->P = P;
241
+ p->m = A->m;
242
+ p->n = A->n;
211
243
 
212
244
  #if GPU_TRANSPOSE_MAT > 0
213
245
  size_t new_buffer_size = 0;
214
246
  #endif
215
247
 
216
- p->rho_x = rho_x;
217
248
  p->cublas_handle = 0;
218
249
  p->cusparse_handle = 0;
219
250
 
@@ -242,8 +273,9 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
242
273
  cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
243
274
  cudaMalloc((void **)&p->tmp_m, A->m * sizeof(scs_float));
244
275
  cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
245
- cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
246
- cudaMalloc((void **)&p->inv_rho_y_vec_gpu, A->m * sizeof(scs_float));
276
+ cudaMalloc((void **)&p->M_gpu, A->n * sizeof(scs_float));
277
+ cudaMalloc((void **)&p->r_x_gpu, A->n * sizeof(scs_float));
278
+ cudaMalloc((void **)&p->inv_r_y_gpu, A->m * sizeof(scs_float));
247
279
 
248
280
  cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
249
281
  cudaMemcpyHostToDevice);
@@ -251,12 +283,6 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
251
283
  cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
252
284
  cudaMemcpyHostToDevice);
253
285
 
254
- p->inv_rho_y_vec = (scs_float *)scs_malloc(A->m * sizeof(scs_float));
255
- for (i = 0; i < A->m; ++i)
256
- p->inv_rho_y_vec[i] = 1. / rho_y_vec[i];
257
- cudaMemcpy(p->inv_rho_y_vec_gpu, p->inv_rho_y_vec, A->m * sizeof(scs_float),
258
- cudaMemcpyHostToDevice);
259
-
260
286
  cusparseCreateCsr(&Ag->descr, Ag->n, Ag->m, Ag->nnz, Ag->p, Ag->i, Ag->x,
261
287
  SCS_CUSPARSE_INDEX, SCS_CUSPARSE_INDEX,
262
288
  CUSPARSE_INDEX_BASE_ZERO, SCS_CUDA_FLOAT);
@@ -297,7 +323,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
297
323
  cusparseCreateDnVec(&p->dn_vec_n_p, Ag->n, p->tmp_m, SCS_CUDA_FLOAT);
298
324
  cusparseCreateDnVec(&p->dn_vec_m, Ag->m, p->tmp_m, SCS_CUDA_FLOAT);
299
325
 
300
- set_preconditioner(p, rho_y_vec);
326
+ /* Form preconditioner and copy R_x, 1/R_y to gpu */
327
+ SCS(update_lin_sys_diag_r)(p, diag_r);
301
328
 
302
329
  #if GPU_TRANSPOSE_MAT > 0
303
330
  p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
@@ -346,9 +373,8 @@ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A, const ScsMatrix *P,
346
373
  return p;
347
374
  }
348
375
 
349
- /* solves (rho_x * I + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
350
- * b */
351
- /* on GPU */
376
+ /* solves (R_x + P + A' R_y^{-1} A)x = b, s warm start, solution stored in
377
+ * b, on GPU */
352
378
  static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
353
379
  scs_int max_its, scs_float tol) {
354
380
  scs_int i, n = pr->n;
@@ -386,7 +412,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
386
412
 
387
413
  /* z = M r */
388
414
  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
389
- scale_by_diag(cublas_handle, pr->M, z, n);
415
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
390
416
  /* ztr = z'r */
391
417
  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
392
418
  /* p = z */
@@ -415,7 +441,7 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
415
441
  }
416
442
  /* z = M r */
417
443
  cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
418
- scale_by_diag(cublas_handle, pr->M, z, n);
444
+ scale_by_diag(cublas_handle, pr->M_gpu, z, n);
419
445
  ztr_prev = ztr;
420
446
  /* ztr = z'r */
421
447
  CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ztr);
@@ -431,14 +457,12 @@ static scs_int pcg(ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
431
457
  /* solves Mx = b, for x but stores result in b */
432
458
  /* s contains warm-start (if available) */
433
459
  /*
434
- * [x] = [rho_x I + P A' ]^{-1} [rx]
460
+ * [x] = [R_x + P A' ]^{-1} [rx]
435
461
  * [y] [ A -R_y ] [ry]
436
462
  *
437
- * R_y = diag(rho_y_vec)
438
- *
439
463
  * becomes:
440
464
  *
441
- * x = (rho_x I + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
465
+ * x = (R_x + P + A' R_y^{-1} A)^{-1} (rx + A' R_y^{-1} ry)
442
466
  * y = R_y^{-1} (Ax - ry)
443
467
  *
444
468
  */
@@ -451,7 +475,6 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
451
475
  scs_float *bg = p->bg;
452
476
  scs_float *tmp_m = p->tmp_m;
453
477
  ScsGpuMatrix *Ag = p->Ag;
454
- ScsGpuMatrix *Pg = p->Pg;
455
478
 
456
479
  if (CG_NORM(b, p->n + p->m) <= 1e-12) {
457
480
  memset(b, 0, (p->n + p->m) * sizeof(scs_float));
@@ -471,7 +494,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
471
494
  cudaMemcpy(tmp_m, &(bg[Ag->n]), Ag->m * sizeof(scs_float),
472
495
  cudaMemcpyDeviceToDevice);
473
496
  /* tmp = R_y^{-1} * tmp = R_y^{-1} * ry */
474
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, tmp_m, p->Ag->m);
497
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, tmp_m, p->Ag->m);
475
498
 
476
499
  cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
477
500
  cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
@@ -483,7 +506,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
483
506
  /* set max_iters to 10 * n (though in theory n is enough for any tol) */
484
507
  max_iters = 10 * Ag->n;
485
508
 
486
- /* solves (rho_x I + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
509
+ /* solves (R_x + P + A' R_y^{-1} A)x = bg, s warm start, solution stored
487
510
  * in bg */
488
511
  cg_its = pcg(p, s, bg, max_iters, tol); /* bg[:n] = x */
489
512
 
@@ -504,7 +527,7 @@ scs_int SCS(solve_lin_sys)(ScsLinSysWork *p, scs_float *b, const scs_float *s,
504
527
  #endif
505
528
 
506
529
  /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
507
- scale_by_diag(p->cublas_handle, p->inv_rho_y_vec_gpu, &(bg[p->n]), p->Ag->m);
530
+ scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, &(bg[p->n]), p->Ag->m);
508
531
 
509
532
  /* copy bg = [x; y] back to b */
510
533
  cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float),
@@ -15,19 +15,20 @@ struct SCS_LIN_SYS_WORK {
15
15
  scs_int n, m; /* linear system dimensions */
16
16
  /* reporting */
17
17
  scs_int tot_cg_its;
18
+ scs_float *M; /* preconditioner on cpu */
18
19
  /* ALL BELOW HOSTED ON THE GPU */
19
- scs_float *p; /* cg iterate, n */
20
- scs_float *r; /* cg residual, n */
21
- scs_float *Gp; /* G * p, n */
22
- scs_float *bg; /* b, n */
23
- scs_float *tmp_m; /* m, used in mat_vec */
24
- scs_float *z; /* preconditioned */
25
- scs_float *M; /* preconditioner */
20
+ scs_float *p; /* cg iterate, n */
21
+ scs_float *r; /* cg residual, n */
22
+ scs_float *Gp; /* G * p, n */
23
+ scs_float *bg; /* b, n */
24
+ scs_float *tmp_m; /* m, used in mat_vec */
25
+ scs_float *z; /* preconditioned */
26
+ scs_float *M_gpu; /* preconditioner */
26
27
  const ScsMatrix *A; /* does *not* own this memory */
27
28
  const ScsMatrix *P; /* does *not* own this memory */
28
- ScsGpuMatrix *Ag; /* A matrix on GPU */
29
- ScsGpuMatrix *Agt; /* A trans matrix on GPU */
30
- ScsGpuMatrix *Pg; /* P matrix on GPU */
29
+ ScsGpuMatrix *Ag; /* A matrix on GPU */
30
+ ScsGpuMatrix *Agt; /* A trans matrix on GPU */
31
+ ScsGpuMatrix *Pg; /* P matrix on GPU */
31
32
  /* CUDA */
32
33
  cublasHandle_t cublas_handle;
33
34
  cusparseHandle_t cusparse_handle;
@@ -39,9 +40,9 @@ struct SCS_LIN_SYS_WORK {
39
40
  cusparseDnVecDescr_t dn_vec_n_p; /* Dense vector of length n */
40
41
 
41
42
  /* rho terms */
42
- scs_float rho_x;
43
- scs_float *inv_rho_y_vec; /* inverse rho_y_vec */
44
- scs_float *inv_rho_y_vec_gpu; /* inverse rho_y_vec on GPU */
43
+ scs_float *r_x_gpu;
44
+ scs_float *inv_r_y; /* inverse R_y */
45
+ scs_float *inv_r_y_gpu; /* inverse R_y on GPU */
45
46
  };
46
47
 
47
48
  #ifdef __cplusplus