scs 0.2.0

Files changed (106)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +84 -0
  5. data/ext/scs/Rakefile +11 -0
  6. data/lib/scs/ffi.rb +117 -0
  7. data/lib/scs/solver.rb +178 -0
  8. data/lib/scs/version.rb +3 -0
  9. data/lib/scs.rb +17 -0
  10. data/vendor/scs/LICENSE.txt +21 -0
  11. data/vendor/scs/Makefile +164 -0
  12. data/vendor/scs/README.md +220 -0
  13. data/vendor/scs/include/aa.h +56 -0
  14. data/vendor/scs/include/cones.h +46 -0
  15. data/vendor/scs/include/ctrlc.h +33 -0
  16. data/vendor/scs/include/glbopts.h +177 -0
  17. data/vendor/scs/include/linalg.h +26 -0
  18. data/vendor/scs/include/linsys.h +64 -0
  19. data/vendor/scs/include/normalize.h +18 -0
  20. data/vendor/scs/include/rw.h +17 -0
  21. data/vendor/scs/include/scs.h +161 -0
  22. data/vendor/scs/include/scs_blas.h +51 -0
  23. data/vendor/scs/include/util.h +65 -0
  24. data/vendor/scs/linsys/amatrix.c +305 -0
  25. data/vendor/scs/linsys/amatrix.h +36 -0
  26. data/vendor/scs/linsys/amatrix.o +0 -0
  27. data/vendor/scs/linsys/cpu/direct/private.c +366 -0
  28. data/vendor/scs/linsys/cpu/direct/private.h +26 -0
  29. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  30. data/vendor/scs/linsys/cpu/indirect/private.c +256 -0
  31. data/vendor/scs/linsys/cpu/indirect/private.h +31 -0
  32. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  33. data/vendor/scs/linsys/external/amd/LICENSE.txt +934 -0
  34. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +469 -0
  35. data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +254 -0
  36. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  37. data/vendor/scs/linsys/external/amd/amd.h +400 -0
  38. data/vendor/scs/linsys/external/amd/amd_1.c +180 -0
  39. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  40. data/vendor/scs/linsys/external/amd/amd_2.c +1842 -0
  41. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  42. data/vendor/scs/linsys/external/amd/amd_aat.c +184 -0
  43. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  44. data/vendor/scs/linsys/external/amd/amd_control.c +64 -0
  45. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  46. data/vendor/scs/linsys/external/amd/amd_defaults.c +37 -0
  47. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  48. data/vendor/scs/linsys/external/amd/amd_dump.c +179 -0
  49. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  50. data/vendor/scs/linsys/external/amd/amd_global.c +16 -0
  51. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  52. data/vendor/scs/linsys/external/amd/amd_info.c +119 -0
  53. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  54. data/vendor/scs/linsys/external/amd/amd_internal.h +304 -0
  55. data/vendor/scs/linsys/external/amd/amd_order.c +199 -0
  56. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  57. data/vendor/scs/linsys/external/amd/amd_post_tree.c +120 -0
  58. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  59. data/vendor/scs/linsys/external/amd/amd_postorder.c +206 -0
  60. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  61. data/vendor/scs/linsys/external/amd/amd_preprocess.c +118 -0
  62. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  63. data/vendor/scs/linsys/external/amd/amd_valid.c +92 -0
  64. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  65. data/vendor/scs/linsys/external/amd/changes +11 -0
  66. data/vendor/scs/linsys/external/qdldl/LICENSE +201 -0
  67. data/vendor/scs/linsys/external/qdldl/README.md +120 -0
  68. data/vendor/scs/linsys/external/qdldl/changes +4 -0
  69. data/vendor/scs/linsys/external/qdldl/qdldl.c +298 -0
  70. data/vendor/scs/linsys/external/qdldl/qdldl.h +177 -0
  71. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  72. data/vendor/scs/linsys/external/qdldl/qdldl_types.h +21 -0
  73. data/vendor/scs/linsys/gpu/gpu.c +41 -0
  74. data/vendor/scs/linsys/gpu/gpu.h +85 -0
  75. data/vendor/scs/linsys/gpu/indirect/private.c +304 -0
  76. data/vendor/scs/linsys/gpu/indirect/private.h +36 -0
  77. data/vendor/scs/scs.mk +181 -0
  78. data/vendor/scs/src/aa.c +224 -0
  79. data/vendor/scs/src/aa.o +0 -0
  80. data/vendor/scs/src/cones.c +802 -0
  81. data/vendor/scs/src/cones.o +0 -0
  82. data/vendor/scs/src/ctrlc.c +77 -0
  83. data/vendor/scs/src/ctrlc.o +0 -0
  84. data/vendor/scs/src/linalg.c +84 -0
  85. data/vendor/scs/src/linalg.o +0 -0
  86. data/vendor/scs/src/normalize.c +93 -0
  87. data/vendor/scs/src/normalize.o +0 -0
  88. data/vendor/scs/src/rw.c +167 -0
  89. data/vendor/scs/src/rw.o +0 -0
  90. data/vendor/scs/src/scs.c +975 -0
  91. data/vendor/scs/src/scs.o +0 -0
  92. data/vendor/scs/src/scs_version.c +5 -0
  93. data/vendor/scs/src/scs_version.o +0 -0
  94. data/vendor/scs/src/util.c +196 -0
  95. data/vendor/scs/src/util.o +0 -0
  96. data/vendor/scs/test/data/small_random_socp +0 -0
  97. data/vendor/scs/test/minunit.h +13 -0
  98. data/vendor/scs/test/problem_utils.h +93 -0
  99. data/vendor/scs/test/problems/rob_gauss_cov_est.h +85 -0
  100. data/vendor/scs/test/problems/small_lp.h +50 -0
  101. data/vendor/scs/test/problems/small_random_socp.h +33 -0
  102. data/vendor/scs/test/random_socp_prob.c +171 -0
  103. data/vendor/scs/test/run_from_file.c +69 -0
  104. data/vendor/scs/test/run_tests +2 -0
  105. data/vendor/scs/test/run_tests.c +32 -0
  106. metadata +203 -0
data/vendor/scs/linsys/gpu/gpu.h ADDED
@@ -0,0 +1,85 @@
+ #ifndef SCSGPU_H_GUARD
+ #define SCSGPU_H_GUARD
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #include <cublas_v2.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include <cusparse.h>
+
+ #include "amatrix.h"
+ #include "glbopts.h"
+ #include "linalg.h"
+ #include "linsys.h"
+ #include "scs.h"
+ #include "util.h"
+
+ #define CUDA_CHECK_ERR                                                      \
+   do {                                                                      \
+     cudaError_t err = cudaGetLastError();                                   \
+     if (err != cudaSuccess) {                                               \
+       printf("%s:%d:%s\n ERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,   \
+              cudaGetErrorString(err));                                      \
+     }                                                                       \
+   } while (0)
+
+ #ifndef EXTRA_VERBOSE
+ #ifndef SFLOAT
+ #define CUBLAS(x) cublasD##x
+ #define CUSPARSE(x) cusparseD##x
+ #else
+ #define CUBLAS(x) cublasS##x
+ #define CUSPARSE(x) cusparseS##x
+ #endif
+ #else
+ #ifndef SFLOAT
+ #define CUBLAS(x)   \
+   CUDA_CHECK_ERR;   \
+   cublasD##x
+ #define CUSPARSE(x) \
+   CUDA_CHECK_ERR;   \
+   cusparseD##x
+ #else
+ #define CUBLAS(x)   \
+   CUDA_CHECK_ERR;   \
+   cublasS##x
+ #define CUSPARSE(x) \
+   CUDA_CHECK_ERR;   \
+   cusparseS##x
+ #endif
+ #endif
+
+ /*
+  CUDA matrix routines only for CSR, not CSC matrices:
+     CSC            CSR            GPU    Mult
+     A  (m x n)     A' (n x m)     Ag     accum_by_a_trans_gpu
+     A' (n x m)     A  (m x n)     Agt    accum_by_a_gpu
+ */
+
+ /* this struct defines the data matrix A on GPU */
+ typedef struct SCS_GPU_A_DATA_MATRIX {
+   /* A is supplied in column compressed format */
+   scs_float *x; /* A values, size: NNZ A */
+   scs_int *i;   /* A row index, size: NNZ A */
+   scs_int *p;   /* A column pointer, size: n+1 */
+   scs_int m, n; /* m rows, n cols */
+   scs_int Annz; /* num non-zeros in A matrix */
+   /* CUDA */
+   cusparseMatDescr_t descr;
+ } ScsGpuMatrix;
+
+ void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *A, const scs_float *x,
+                                scs_float *y, cusparseHandle_t cusparse_handle);
+
+ void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *A, const scs_float *x,
+                           scs_float *y, cusparseHandle_t cusparse_handle);
+
+ void SCS(free_gpu_matrix)(ScsGpuMatrix *A);
+
+ #ifdef __cplusplus
+ }
+ #endif
+ #endif
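
A note on the `CUBLAS(x)` / `CUSPARSE(x)` token-pasting macros above: with `SFLOAT` and `EXTRA_VERBOSE` left undefined (the defaults in `data/vendor/scs/scs.mk`), `CUBLAS(axpy)` expands to the double-precision entry point `cublasDaxpy`. The stand-alone sketch below is not part of the gem; it only shows the call those macros resolve to, using standard CUDA runtime and cuBLAS functions (link against `-lcudart -lcublas`).

```c
/* Illustration only: the expansion of CUBLAS(axpy)(...) when SFLOAT is
 * not defined, i.e. a plain double-precision cublasDaxpy call. */
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <cublas_v2.h>

int main(void) {
  const int n = 4;
  double hx[4] = {1, 2, 3, 4}, hy[4] = {10, 20, 30, 40}, alpha = 2.0;
  double *dx, *dy;
  cublasHandle_t handle;

  cudaMalloc((void **)&dx, n * sizeof(double));
  cudaMalloc((void **)&dy, n * sizeof(double));
  cudaMemcpy(dx, hx, n * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(dy, hy, n * sizeof(double), cudaMemcpyHostToDevice);
  cublasCreate(&handle);

  /* y <- alpha * x + y; this is what CUBLAS(axpy)(...) expands to */
  cublasDaxpy(handle, n, &alpha, dx, 1, dy, 1);

  cudaMemcpy(hy, dy, n * sizeof(double), cudaMemcpyDeviceToHost);
  printf("y = %g %g %g %g\n", hy[0], hy[1], hy[2], hy[3]);

  cublasDestroy(handle);
  cudaFree(dx);
  cudaFree(dy);
  return 0;
}
```

Building with `SFLOAT=1` would switch the same macro to `cublasSaxpy` operating on `float` data.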
data/vendor/scs/linsys/gpu/indirect/private.c ADDED
@@ -0,0 +1,304 @@
+ #include "private.h"
+
+ #define CG_BEST_TOL 1e-9
+ #define CG_MIN_TOL 1e-1
+
+ /* do not use within pcg, reuses memory */
+ void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
+                           const scs_float *x, scs_float *y) {
+   scs_float *v_m = p->tmp_m;
+   scs_float *v_n = p->r;
+   cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
+   cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
+   cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ }
+
+ /* do not use within pcg, reuses memory */
+ void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
+                      scs_float *y) {
+   scs_float *v_m = p->tmp_m;
+   scs_float *v_n = p->r;
+   cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
+ #if GPU_TRANSPOSE_MAT > 0
+   SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
+ #else
+   SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
+ #endif
+   cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ }
+
+ char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
+   char *str = (char *)scs_malloc(sizeof(char) * 128);
+   sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
+           (long)A->p[A->n], stgs->cg_rate);
+   return str;
+ }
+
+ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
+   char *str = (char *)scs_malloc(sizeof(char) * 128);
+   sprintf(str,
+           "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
+           (scs_float)p->tot_cg_its / (info->iter + 1),
+           p->total_solve_time / (info->iter + 1) / 1e3);
+   p->tot_cg_its = 0;
+   p->total_solve_time = 0;
+   return str;
+ }
+
+ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
+   if (p) {
+     cudaFree(p->p);
+     cudaFree(p->r);
+     cudaFree(p->Gp);
+     cudaFree(p->bg);
+     cudaFree(p->tmp_m);
+     cudaFree(p->z);
+     cudaFree(p->M);
+     if (p->Ag) {
+       SCS(free_gpu_matrix)(p->Ag);
+       scs_free(p->Ag);
+     }
+     if (p->Agt) {
+       SCS(free_gpu_matrix)(p->Agt);
+       scs_free(p->Agt);
+     }
+     cusparseDestroy(p->cusparse_handle);
+     cublasDestroy(p->cublas_handle);
+     /* Don't reset because it interferes with other GPU programs. */
+     /* cudaDeviceReset(); */
+     scs_free(p);
+   }
+ }
+
+ /* y = (RHO_X * I + A'A) x */
+ static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
+                     ScsLinSysWork *p, const scs_float *x, scs_float *y) {
+   /* x and y MUST already be loaded to GPU */
+   scs_float *tmp_m = p->tmp_m; /* temp memory */
+   cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
+   SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
+   cudaMemset(y, 0, A->n * sizeof(scs_float));
+   SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
+   CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
+ }
+
+ /* M = inv ( diag ( RHO_X * I + A'A ) ) */
+ static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
+                                ScsLinSysWork *p) {
+   scs_int i;
+   scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
+
+ #if EXTRA_VERBOSE > 0
+   scs_printf("getting pre-conditioner\n");
+ #endif
+
+   for (i = 0; i < A->n; ++i) {
+     M[i] = 1 / (stgs->rho_x +
+                 SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
+     /* M[i] = 1; */
+   }
+   cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   scs_free(M);
+
+ #if EXTRA_VERBOSE > 0
+   scs_printf("finished getting pre-conditioner\n");
+ #endif
+ }
+
+ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
+                                       const ScsSettings *stgs) {
+   cudaError_t err;
+   ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
+   ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+
+   p->cublas_handle = 0;
+   p->cusparse_handle = 0;
+
+   p->total_solve_time = 0;
+   p->tot_cg_its = 0;
+
+   /* Get handle to the CUBLAS context */
+   cublasCreate(&p->cublas_handle);
+
+   /* Get handle to the CUSPARSE context */
+   cusparseCreate(&p->cusparse_handle);
+
+   Ag->n = A->n;
+   Ag->m = A->m;
+   Ag->Annz = A->p[A->n];
+   Ag->descr = 0;
+   /* Matrix description */
+   cusparseCreateMatDescr(&Ag->descr);
+   cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+   cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
+   p->Ag = Ag;
+   p->Agt = SCS_NULL;
+
+   cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
+   cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
+   cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
+
+   cudaMalloc((void **)&p->p, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
+   cudaMalloc((void **)&p->tmp_m,
+              A->m * sizeof(scs_float)); /* intermediate result */
+   cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
+
+   cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
+              cudaMemcpyHostToDevice);
+   cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
+   cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
+              cudaMemcpyHostToDevice);
+
+   get_preconditioner(A, stgs, p);
+
+ #if GPU_TRANSPOSE_MAT > 0
+   p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+   p->Agt->n = A->m;
+   p->Agt->m = A->n;
+   p->Agt->Annz = A->p[A->n];
+   p->Agt->descr = 0;
+   /* Matrix description */
+   cusparseCreateMatDescr(&p->Agt->descr);
+   cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+   cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
+
+   cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
+   cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
+   cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
+   /* transpose Ag into Agt for faster multiplies */
+   /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
+   CUSPARSE(csr2csc)
+   (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
+    p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
+ #endif
+
+   err = cudaGetLastError();
+   if (err != cudaSuccess) {
+     printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
+            cudaGetErrorString(err));
+     SCS(free_lin_sys_work)(p);
+     return SCS_NULL;
+   }
+   return p;
+ }
+
+ static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
+                                   scs_float *z, scs_float *r, scs_int n) {
+   cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+   CUBLAS(tbmv)
+   (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
+    0, M, 1, z, 1);
+ }
+
+ /* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
+ static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
+                    ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
+                    scs_int max_its, scs_float tol) {
+   scs_int i, n = A->n;
+   scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
+   scs_float onef = 1.0, neg_onef = -1.0;
+   scs_float *p = pr->p;   /* cg direction */
+   scs_float *Gp = pr->Gp; /* updated CG direction */
+   scs_float *r = pr->r;   /* cg residual */
+   scs_float *z = pr->z;   /* preconditioned */
+   scs_float *M = pr->M;   /* preconditioner */
+   cublasHandle_t cublas_handle = pr->cublas_handle;
+
+   if (s == SCS_NULL) {
+     cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+     cudaMemset(bg, 0, n * sizeof(scs_float));
+   } else {
+     /* p contains bg temporarily */
+     cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+     /* bg contains s */
+     cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
+     mat_vec(A, stgs, pr, bg, r);
+     CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
+     CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
+   }
+
+   /* for some reason nrm2 is VERY slow */
+   /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
+   CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
+   nrm_r = SQRTF(nrm_r);
+   /* check to see if we need to run CG at all */
+   if (nrm_r < MIN(tol, 1e-18)) {
+     return 0;
+   }
+
+   apply_pre_conditioner(cublas_handle, M, z, r, n);
+   CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+   /* put z in p, replacing temp mem */
+   cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+
+   for (i = 0; i < max_its; ++i) {
+     mat_vec(A, stgs, pr, p, Gp);
+
+     CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
+
+     alpha = ipzr / p_gp;
+     neg_alpha = -alpha;
+
+     CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
+     CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
+
+     /* for some reason nrm2 is VERY slow */
+     /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
+     CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
+     nrm_r = SQRTF(nrm_r);
+     if (nrm_r < tol) {
+       i++;
+       break;
+     }
+     ipzr_old = ipzr;
+     apply_pre_conditioner(cublas_handle, M, z, r, n);
+     CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+
+     beta = ipzr / ipzr_old;
+     CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
+     CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
+   }
+ #if EXTRA_VERBOSE > 0
+   scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
+ #endif
+   return i;
+ }
+
+ scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
+                            ScsLinSysWork *p, scs_float *b, const scs_float *s,
+                            scs_int iter) {
+   scs_int cg_its;
+   SCS(timer) linsys_timer;
+   scs_float *bg = p->bg;
+   scs_float neg_onef = -1.0;
+   ScsGpuMatrix *Ag = p->Ag;
+   scs_float cg_tol =
+       SCS(norm)(b, Ag->n) *
+       (iter < 0 ? CG_BEST_TOL
+                 : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
+   SCS(tic)(&linsys_timer);
+   /* all on GPU */
+   cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
+   SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
+   /* solves (I+A'A)x = b, s warm start, solution stored in b */
+   cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
+   CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
+   SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
+   cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
+
+   if (iter >= 0) {
+     p->tot_cg_its += cg_its;
+   }
+
+   p->total_solve_time += SCS(tocq)(&linsys_timer);
+ #if EXTRA_VERBOSE > 0
+   scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
+ #endif
+   return 0;
+ }
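
For readers of `SCS(solve_lin_sys)` above, here is the block-elimination view the code implies but never states. Splitting the right-hand side as $b = (b_x, b_y)$ with $b_x \in \mathbf{R}^n$ and $b_y \in \mathbf{R}^m$, the GPU sequence (accumulate $A^\top b_y$ into $b_x$, run PCG with the `mat_vec` operator, negate the second block, accumulate $Ax$) corresponds to

$$
\begin{bmatrix} \rho_x I & A^\top \\ A & -I \end{bmatrix}
\begin{bmatrix} x \\ y \end{bmatrix}
=
\begin{bmatrix} b_x \\ b_y \end{bmatrix}
\;\Longrightarrow\;
(\rho_x I + A^\top A)\,x = b_x + A^\top b_y,
\qquad
y = A x - b_y .
$$

So PCG only ever touches the $n \times n$ operator $\rho_x I + A^\top A$, applied matrix-free in `mat_vec` with one multiply by $A$ and one by $A^\top$ per iteration, and the tolerance tightens over the outer iterations, roughly $\|b_x\| \cdot \texttt{CG\_MIN\_TOL} / (\mathrm{iter}+1)^{\texttt{cg\_rate}}$, matching the string returned by `SCS(get_lin_sys_method)`.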
data/vendor/scs/linsys/gpu/indirect/private.h ADDED
@@ -0,0 +1,36 @@
+ #ifndef PRIV_H_GUARD
+ #define PRIV_H_GUARD
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #include "gpu.h"
+ #include "glbopts.h"
+ #include "linalg.h"
+ #include "scs.h"
+
+
+ struct SCS_LIN_SYS_WORK {
+   /* reporting */
+   scs_int tot_cg_its;
+   scs_float total_solve_time;
+   /* ALL BELOW HOSTED ON THE GPU */
+   scs_float *p;      /* cg iterate, n */
+   scs_float *r;      /* cg residual, n */
+   scs_float *Gp;     /* G * p, n */
+   scs_float *bg;     /* b, n */
+   scs_float *tmp_m;  /* m, used in mat_vec */
+   scs_float *z;      /* preconditioned */
+   scs_float *M;      /* preconditioner */
+   ScsGpuMatrix *Ag;  /* A matrix on GPU */
+   ScsGpuMatrix *Agt; /* A trans matrix on GPU */
+   /* CUDA */
+   cublasHandle_t cublas_handle;
+   cusparseHandle_t cusparse_handle;
+ };
+
+ #ifdef __cplusplus
+ }
+ #endif
+ #endif
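
The `M` field above holds the diagonal (Jacobi) preconditioner built by `get_preconditioner()` in `private.c`, with `M[i] = 1 / (rho_x + ||A(:,i)||^2)`. Applying it in `apply_pre_conditioner()` via `CUBLAS(tbmv)` with zero bands appears to reduce to an elementwise product; a minimal CPU reference sketch (illustration only, not part of the gem):

```c
/* CPU reference for apply_pre_conditioner() in private.c above: with 0
 * sub-diagonals the banded triangular multiply is a diagonal scaling,
 * i.e. the Jacobi step z = diag(M) * r. */
#include <stddef.h>

static void apply_pre_conditioner_ref(const double *M, double *z,
                                      const double *r, size_t n) {
  size_t i;
  for (i = 0; i < n; ++i) {
    z[i] = M[i] * r[i]; /* M[i] = 1 / (rho_x + ||A(:,i)||^2) */
  }
}
```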
data/vendor/scs/scs.mk ADDED
@@ -0,0 +1,181 @@
+ ifeq ($(OS),Windows_NT)
+ UNAME = CYGWINorMINGWorMSYS
+ else
+ UNAME = $(shell uname -s)
+ endif
+
+ #CC = gcc
+ # For cross-compiling with mingw use these.
+ #CC = i686-w64-mingw32-gcc -m32
+ #CC = x86_64-w64-mingw32-gcc-4.8
+ CUCC = $(CC) # Don't need to use nvcc, since using cuda blas APIs
+
+ # For GPU must add cuda libs to path, e.g.
+ # export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH
+
+ ifneq (, $(findstring CYGWIN, $(UNAME)))
+ ISWINDOWS := 1
+ else
+ ifneq (, $(findstring MINGW, $(UNAME)))
+ ISWINDOWS := 1
+ else
+ ifneq (, $(findstring MSYS, $(UNAME)))
+ ISWINDOWS := 1
+ else
+ ifneq (, $(findstring mingw, $(CC)))
+ ISWINDOWS := 1
+ else
+ ISWINDOWS := 0
+ endif
+ endif
+ endif
+ endif
+
+ ifeq ($(UNAME), Darwin)
+ # we're on apple, no need to link rt library
+ LDFLAGS += -lm
+ SHARED = dylib
+ SONAME = -install_name
+ else
+ ifeq ($(ISWINDOWS), 1)
+ # we're on windows (cygwin or msys)
+ LDFLAGS += -lm
+ SHARED = dll
+ SONAME = -soname
+ else
+ # we're on a linux system, use accurate timer provided by clock_gettime()
+ LDFLAGS += -lm -lrt
+ SHARED = so
+ SONAME = -soname
+ endif
+ endif
+
+ #TODO: check if this works for all platforms:
+ ifeq ($(CUDA_PATH), )
+ CUDA_PATH=/usr/local/cuda
+ endif
+ CULDFLAGS = -L$(CUDA_PATH)/lib -L$(CUDA_PATH)/lib64 -lcudart -lcublas -lcusparse
+ CUDAFLAGS = $(CFLAGS) -I$(CUDA_PATH)/include -Ilinsys/gpu -Wno-c++11-long-long # turn off annoying long-long warnings in cuda header files
+
+ # Add on default CFLAGS
+ OPT = -O3
+ override CFLAGS += -g -Wall -Wwrite-strings -pedantic -funroll-loops -Wstrict-prototypes -I. -Iinclude -Ilinsys $(OPT)
+ ifneq ($(ISWINDOWS), 1)
+ override CFLAGS += -fPIC
+ endif
+
+ LINSYS = linsys
+ DIRSRC = $(LINSYS)/cpu/direct
+ INDIRSRC = $(LINSYS)/cpu/indirect
+ GPUDIR = $(LINSYS)/gpu/direct
+ GPUINDIR = $(LINSYS)/gpu/indirect
+
+ EXTSRC = $(LINSYS)/external
+
+ OUT = out
+ AR = ar
+ ARFLAGS = rv
+ ARCHIVE = $(AR) $(ARFLAGS)
+ RANLIB = ranlib
+ INSTALL = install
+
+ ifeq ($(PREFIX),)
+ PREFIX = /usr/local
+ endif
+
+ OPT_FLAGS =
+ ########### OPTIONAL FLAGS ##########
+ # these can all be overridden from the command line
+ # e.g. make DLONG=1 will override the setting below
+ DLONG = 0
+ ifneq ($(DLONG), 0)
+ OPT_FLAGS += -DDLONG=$(DLONG) # use longs rather than ints
+ endif
+ CTRLC = 1
+ ifneq ($(CTRLC), 0)
+ OPT_FLAGS += -DCTRLC=$(CTRLC) # graceful interrupts with ctrl-c
+ endif
+ SFLOAT = 0
+ ifneq ($(SFLOAT), 0)
+ OPT_FLAGS += -DSFLOAT=$(SFLOAT) # use floats rather than doubles
+ endif
+ NOVALIDATE = 0
+ ifneq ($(NOVALIDATE), 0)
+ OPT_FLAGS += -DNOVALIDATE=$(NOVALIDATE) # remove data validation step
+ endif
+ NOTIMER = 0
+ ifneq ($(NOTIMER), 0)
+ OPT_FLAGS += -DNOTIMER=$(NOTIMER) # no timing, times reported as nan
+ endif
+ COPYAMATRIX = 1
+ ifneq ($(COPYAMATRIX), 0)
+ OPT_FLAGS += -DCOPYAMATRIX=$(COPYAMATRIX) # if normalize, copy A
+ endif
+ GPU_TRANSPOSE_MAT = 1
+ ifneq ($(GPU_TRANSPOSE_MAT), 0)
+ OPT_FLAGS += -DGPU_TRANSPOSE_MAT=$(GPU_TRANSPOSE_MAT) # transpose A mat in GPU memory
+ endif
+
+ ### VERBOSITY LEVELS: 0,1,2
+ EXTRA_VERBOSE = 0
+ ifneq ($(EXTRA_VERBOSE), 0)
+ OPT_FLAGS += -DEXTRA_VERBOSE=$(EXTRA_VERBOSE) # extra verbosity level
+ endif
+
+ ############ OPENMP: ############
+ # set USE_OPENMP = 1 to allow openmp (multi-threaded matrix multiplies):
+ # set the number of threads to, for example, 4 by entering the command:
+ # export OMP_NUM_THREADS=4
+
+ USE_OPENMP = 0
+ ifneq ($(USE_OPENMP), 0)
+ override CFLAGS += -fopenmp
+ LDFLAGS += -lgomp
+ endif
+
+ ############ SDPS: BLAS + LAPACK ############
+ # set USE_LAPACK = 1 below to enable solving SDPs
+ # NB: point the libraries to the locations where
+ # you have blas and lapack installed
+
+ USE_LAPACK = 1
+ ifneq ($(USE_LAPACK), 0)
+ # edit these for your setup:
+ BLASLDFLAGS = -lblas -llapack #-lgfortran
+ LDFLAGS += $(BLASLDFLAGS)
+ OPT_FLAGS += -DUSE_LAPACK
+
+ BLAS64 = 0
+ ifneq ($(BLAS64), 0)
+ OPT_FLAGS += -DBLAS64=$(BLAS64) # if blas/lapack lib uses 64 bit ints
+ endif
+
+ NOBLASSUFFIX = 0
+ ifneq ($(NOBLASSUFFIX), 0)
+ OPT_FLAGS += -DNOBLASSUFFIX=$(NOBLASSUFFIX) # hack to strip blas suffix
+ endif
+
+ BLASSUFFIX = "_"
+ ifneq ($(BLASSUFFIX), "_")
+ OPT_FLAGS += -DBLASSUFFIX=$(BLASSUFFIX) # blas suffix (underscore usually)
+ endif
+ endif
+
+ MATLAB_MEX_FILE = 0
+ ifneq ($(MATLAB_MEX_FILE), 0)
+ OPT_FLAGS += -DMATLAB_MEX_FILE=$(MATLAB_MEX_FILE) # matlab mex
+ endif
+ PYTHON = 0
+ ifneq ($(PYTHON), 0)
+ OPT_FLAGS += -DPYTHON=$(PYTHON) # python extension
+ endif
+ USING_R = 0
+ ifneq ($(USING_R), 0)
+ OPT_FLAGS += -DUSING_R=$(USING_R) # R extension
+ endif
+
+ # debug to see var values, e.g. 'make print-OBJECTS' shows OBJECTS value
+ print-%: ; @echo $*=$($*)
+
+ override CFLAGS += $(OPT_FLAGS)
+ CUDAFLAGS += $(OPT_FLAGS)
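
As the comments in `scs.mk` note, every optional flag can be overridden at build time from the command line (for example `make DLONG=1 EXTRA_VERBOSE=1`), OpenMP threading is controlled with `export OMP_NUM_THREADS=4`, and the `print-%` rule lets you inspect any variable, e.g. `make print-CFLAGS`.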