scs 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +84 -0
  5. data/ext/scs/Rakefile +11 -0
  6. data/lib/scs/ffi.rb +117 -0
  7. data/lib/scs/solver.rb +178 -0
  8. data/lib/scs/version.rb +3 -0
  9. data/lib/scs.rb +17 -0
  10. data/vendor/scs/LICENSE.txt +21 -0
  11. data/vendor/scs/Makefile +164 -0
  12. data/vendor/scs/README.md +220 -0
  13. data/vendor/scs/include/aa.h +56 -0
  14. data/vendor/scs/include/cones.h +46 -0
  15. data/vendor/scs/include/ctrlc.h +33 -0
  16. data/vendor/scs/include/glbopts.h +177 -0
  17. data/vendor/scs/include/linalg.h +26 -0
  18. data/vendor/scs/include/linsys.h +64 -0
  19. data/vendor/scs/include/normalize.h +18 -0
  20. data/vendor/scs/include/rw.h +17 -0
  21. data/vendor/scs/include/scs.h +161 -0
  22. data/vendor/scs/include/scs_blas.h +51 -0
  23. data/vendor/scs/include/util.h +65 -0
  24. data/vendor/scs/linsys/amatrix.c +305 -0
  25. data/vendor/scs/linsys/amatrix.h +36 -0
  26. data/vendor/scs/linsys/amatrix.o +0 -0
  27. data/vendor/scs/linsys/cpu/direct/private.c +366 -0
  28. data/vendor/scs/linsys/cpu/direct/private.h +26 -0
  29. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  30. data/vendor/scs/linsys/cpu/indirect/private.c +256 -0
  31. data/vendor/scs/linsys/cpu/indirect/private.h +31 -0
  32. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  33. data/vendor/scs/linsys/external/amd/LICENSE.txt +934 -0
  34. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +469 -0
  35. data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +254 -0
  36. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  37. data/vendor/scs/linsys/external/amd/amd.h +400 -0
  38. data/vendor/scs/linsys/external/amd/amd_1.c +180 -0
  39. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  40. data/vendor/scs/linsys/external/amd/amd_2.c +1842 -0
  41. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  42. data/vendor/scs/linsys/external/amd/amd_aat.c +184 -0
  43. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  44. data/vendor/scs/linsys/external/amd/amd_control.c +64 -0
  45. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  46. data/vendor/scs/linsys/external/amd/amd_defaults.c +37 -0
  47. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  48. data/vendor/scs/linsys/external/amd/amd_dump.c +179 -0
  49. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  50. data/vendor/scs/linsys/external/amd/amd_global.c +16 -0
  51. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  52. data/vendor/scs/linsys/external/amd/amd_info.c +119 -0
  53. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  54. data/vendor/scs/linsys/external/amd/amd_internal.h +304 -0
  55. data/vendor/scs/linsys/external/amd/amd_order.c +199 -0
  56. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  57. data/vendor/scs/linsys/external/amd/amd_post_tree.c +120 -0
  58. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  59. data/vendor/scs/linsys/external/amd/amd_postorder.c +206 -0
  60. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  61. data/vendor/scs/linsys/external/amd/amd_preprocess.c +118 -0
  62. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  63. data/vendor/scs/linsys/external/amd/amd_valid.c +92 -0
  64. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  65. data/vendor/scs/linsys/external/amd/changes +11 -0
  66. data/vendor/scs/linsys/external/qdldl/LICENSE +201 -0
  67. data/vendor/scs/linsys/external/qdldl/README.md +120 -0
  68. data/vendor/scs/linsys/external/qdldl/changes +4 -0
  69. data/vendor/scs/linsys/external/qdldl/qdldl.c +298 -0
  70. data/vendor/scs/linsys/external/qdldl/qdldl.h +177 -0
  71. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  72. data/vendor/scs/linsys/external/qdldl/qdldl_types.h +21 -0
  73. data/vendor/scs/linsys/gpu/gpu.c +41 -0
  74. data/vendor/scs/linsys/gpu/gpu.h +85 -0
  75. data/vendor/scs/linsys/gpu/indirect/private.c +304 -0
  76. data/vendor/scs/linsys/gpu/indirect/private.h +36 -0
  77. data/vendor/scs/scs.mk +181 -0
  78. data/vendor/scs/src/aa.c +224 -0
  79. data/vendor/scs/src/aa.o +0 -0
  80. data/vendor/scs/src/cones.c +802 -0
  81. data/vendor/scs/src/cones.o +0 -0
  82. data/vendor/scs/src/ctrlc.c +77 -0
  83. data/vendor/scs/src/ctrlc.o +0 -0
  84. data/vendor/scs/src/linalg.c +84 -0
  85. data/vendor/scs/src/linalg.o +0 -0
  86. data/vendor/scs/src/normalize.c +93 -0
  87. data/vendor/scs/src/normalize.o +0 -0
  88. data/vendor/scs/src/rw.c +167 -0
  89. data/vendor/scs/src/rw.o +0 -0
  90. data/vendor/scs/src/scs.c +975 -0
  91. data/vendor/scs/src/scs.o +0 -0
  92. data/vendor/scs/src/scs_version.c +5 -0
  93. data/vendor/scs/src/scs_version.o +0 -0
  94. data/vendor/scs/src/util.c +196 -0
  95. data/vendor/scs/src/util.o +0 -0
  96. data/vendor/scs/test/data/small_random_socp +0 -0
  97. data/vendor/scs/test/minunit.h +13 -0
  98. data/vendor/scs/test/problem_utils.h +93 -0
  99. data/vendor/scs/test/problems/rob_gauss_cov_est.h +85 -0
  100. data/vendor/scs/test/problems/small_lp.h +50 -0
  101. data/vendor/scs/test/problems/small_random_socp.h +33 -0
  102. data/vendor/scs/test/random_socp_prob.c +171 -0
  103. data/vendor/scs/test/run_from_file.c +69 -0
  104. data/vendor/scs/test/run_tests +2 -0
  105. data/vendor/scs/test/run_tests.c +32 -0
  106. metadata +203 -0
@@ -0,0 +1,85 @@
1
+ #ifndef SCSGPU_H_GUARD
2
+ #define SCSGPU_H_GUARD
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ #include <cublas_v2.h>
9
+ #include <cuda.h>
10
+ #include <cuda_runtime_api.h>
11
+ #include <cusparse.h>
12
+
13
+ #include "amatrix.h"
14
+ #include "glbopts.h"
15
+ #include "linalg.h"
16
+ #include "linsys.h"
17
+ #include "scs.h"
18
+ #include "util.h"
19
+
20
+ #define CUDA_CHECK_ERR \
21
+ do { \
22
+ cudaError_t err = cudaGetLastError(); \
23
+ if (err != cudaSuccess) { \
24
+ printf("%s:%d:%s\n ERROR_CUDA: %s\n", __FILE__, __LINE__, __func__, \
25
+ cudaGetErrorString(err)); \
26
+ } \
27
+ } while (0)
28
+
29
+ #ifndef EXTRA_VERBOSE
30
+ #ifndef SFLOAT
31
+ #define CUBLAS(x) cublasD##x
32
+ #define CUSPARSE(x) cusparseD##x
33
+ #else
34
+ #define CUBLAS(x) cublasS##x
35
+ #define CUSPARSE(x) cusparseS##x
36
+ #endif
37
+ #else
38
+ #ifndef SFLOAT
39
+ #define CUBLAS(x) \
40
+ CUDA_CHECK_ERR; \
41
+ cublasD##x
42
+ #define CUSPARSE(x) \
43
+ CUDA_CHECK_ERR; \
44
+ cusparseD##x
45
+ #else
46
+ #define CUBLAS(x) \
47
+ CUDA_CHECK_ERR; \
48
+ cublasS##x
49
+ #define CUSPARSE(x) \
50
+ CUDA_CHECK_ERR; \
51
+ cusparseS##x
52
+ #endif
53
+ #endif
54
+
55
+ /*
56
+ CUDA matrix routines only for CSR, not CSC matrices:
57
+ CSC CSR GPU Mult
58
+ A (m x n) A' (n x m) Ag accum_by_a_trans_gpu
59
+ A'(n x m) A (m x n) Agt accum_by_a_gpu
60
+ */
61
+
62
+ /* this struct defines the data matrix A on GPU */
63
+ typedef struct SCS_GPU_A_DATA_MATRIX {
64
+ /* A is supplied in column compressed format */
65
+ scs_float *x; /* A values, size: NNZ A */
66
+ scs_int *i; /* A row index, size: NNZ A */
67
+ scs_int *p; /* A column pointer, size: n+1 */
68
+ scs_int m, n; /* m rows, n cols */
69
+ scs_int Annz; /* num non-zeros in A matrix */
70
+ /* CUDA */
71
+ cusparseMatDescr_t descr;
72
+ } ScsGpuMatrix;
73
+
74
+ void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *A, const scs_float *x,
75
+ scs_float *y, cusparseHandle_t cusparse_handle);
76
+
77
+ void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *A, const scs_float *x,
78
+ scs_float *y, cusparseHandle_t cusparse_handle);
79
+
80
+ void SCS(free_gpu_matrix)(ScsGpuMatrix *A);
81
+
82
+ #ifdef __cplusplus
83
+ }
84
+ #endif
85
+ #endif
@@ -0,0 +1,304 @@
1
+ #include "private.h"
2
+
3
+ #define CG_BEST_TOL 1e-9
4
+ #define CG_MIN_TOL 1e-1
5
+
6
+ /* do not use within pcg, reuses memory */
7
+ void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
8
+ const scs_float *x, scs_float *y) {
9
+ scs_float *v_m = p->tmp_m;
10
+ scs_float *v_n = p->r;
11
+ cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
12
+ cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
13
+ SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
14
+ cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
15
+ }
16
+
17
+ /* do not use within pcg, reuses memory */
18
+ void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
19
+ scs_float *y) {
20
+ scs_float *v_m = p->tmp_m;
21
+ scs_float *v_n = p->r;
22
+ cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
23
+ cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
24
+ #if GPU_TRANSPOSE_MAT > 0
25
+ SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
26
+ #else
27
+ SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
28
+ #endif
29
+ cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
30
+ }
31
+
32
+ char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
33
+ char *str = (char *)scs_malloc(sizeof(char) * 128);
34
+ sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
35
+ (long)A->p[A->n], stgs->cg_rate);
36
+ return str;
37
+ }
38
+
39
+ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
40
+ char *str = (char *)scs_malloc(sizeof(char) * 128);
41
+ sprintf(str,
42
+ "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
43
+ (scs_float)p->tot_cg_its / (info->iter + 1),
44
+ p->total_solve_time / (info->iter + 1) / 1e3);
45
+ p->tot_cg_its = 0;
46
+ p->total_solve_time = 0;
47
+ return str;
48
+ }
49
+
50
+ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
51
+ if (p) {
52
+ cudaFree(p->p);
53
+ cudaFree(p->r);
54
+ cudaFree(p->Gp);
55
+ cudaFree(p->bg);
56
+ cudaFree(p->tmp_m);
57
+ cudaFree(p->z);
58
+ cudaFree(p->M);
59
+ if (p->Ag) {
60
+ SCS(free_gpu_matrix)(p->Ag);
61
+ scs_free(p->Ag);
62
+ }
63
+ if (p->Agt) {
64
+ SCS(free_gpu_matrix)(p->Agt);
65
+ scs_free(p->Agt);
66
+ }
67
+ cusparseDestroy(p->cusparse_handle);
68
+ cublasDestroy(p->cublas_handle);
69
+ /* Don't reset because it interferes with other GPU programs. */
70
+ /* cudaDeviceReset(); */
71
+ scs_free(p);
72
+ }
73
+ }
74
+
75
+ /*y = (RHO_X * I + A'A)x */
76
+ static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
77
+ ScsLinSysWork *p, const scs_float *x, scs_float *y) {
78
+ /* x and y MUST already be loaded to GPU */
79
+ scs_float *tmp_m = p->tmp_m; /* temp memory */
80
+ cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
81
+ SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
82
+ cudaMemset(y, 0, A->n * sizeof(scs_float));
83
+ SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
84
+ CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
85
+ }
86
+
87
+ /* M = inv ( diag ( RHO_X * I + A'A ) ) */
88
+ static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
89
+ ScsLinSysWork *p) {
90
+ scs_int i;
91
+ scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
92
+
93
+ #if EXTRA_VERBOSE > 0
94
+ scs_printf("getting pre-conditioner\n");
95
+ #endif
96
+
97
+ for (i = 0; i < A->n; ++i) {
98
+ M[i] = 1 / (stgs->rho_x +
99
+ SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
100
+ /* M[i] = 1; */
101
+ }
102
+ cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
103
+ scs_free(M);
104
+
105
+ #if EXTRA_VERBOSE > 0
106
+ scs_printf("finished getting pre-conditioner\n");
107
+ #endif
108
+ }
109
+
110
+ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
111
+ const ScsSettings *stgs) {
112
+ cudaError_t err;
113
+ ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
114
+ ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
115
+
116
+ p->cublas_handle = 0;
117
+ p->cusparse_handle = 0;
118
+
119
+ p->total_solve_time = 0;
120
+ p->tot_cg_its = 0;
121
+
122
+ /* Get handle to the CUBLAS context */
123
+ cublasCreate(&p->cublas_handle);
124
+
125
+ /* Get handle to the CUSPARSE context */
126
+ cusparseCreate(&p->cusparse_handle);
127
+
128
+ Ag->n = A->n;
129
+ Ag->m = A->m;
130
+ Ag->Annz = A->p[A->n];
131
+ Ag->descr = 0;
132
+ /* Matrix description */
133
+ cusparseCreateMatDescr(&Ag->descr);
134
+ cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
135
+ cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
136
+ p->Ag = Ag;
137
+ p->Agt = SCS_NULL;
138
+
139
+ cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
140
+ cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
141
+ cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
142
+
143
+ cudaMalloc((void **)&p->p, A->n * sizeof(scs_float));
144
+ cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
145
+ cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
146
+ cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
147
+ cudaMalloc((void **)&p->tmp_m,
148
+ A->m * sizeof(scs_float)); /* intermediate result */
149
+ cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
150
+ cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
151
+
152
+ cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
153
+ cudaMemcpyHostToDevice);
154
+ cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
155
+ cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
156
+ cudaMemcpyHostToDevice);
157
+
158
+ get_preconditioner(A, stgs, p);
159
+
160
+ #if GPU_TRANSPOSE_MAT > 0
161
+ p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
162
+ p->Agt->n = A->m;
163
+ p->Agt->m = A->n;
164
+ p->Agt->Annz = A->p[A->n];
165
+ p->Agt->descr = 0;
166
+ /* Matrix description */
167
+ cusparseCreateMatDescr(&p->Agt->descr);
168
+ cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
169
+ cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
170
+
171
+ cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
172
+ cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
173
+ cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
174
+ /* transpose Ag into Agt for faster multiplies */
175
+ /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
176
+ CUSPARSE(csr2csc)
177
+ (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
178
+ p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
179
+ #endif
180
+
181
+ err = cudaGetLastError();
182
+ if (err != cudaSuccess) {
183
+ printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
184
+ cudaGetErrorString(err));
185
+ SCS(free_lin_sys_work)(p);
186
+ return SCS_NULL;
187
+ }
188
+ return p;
189
+ }
190
+
191
+ static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
192
+ scs_float *z, scs_float *r, scs_int n) {
193
+ cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
194
+ CUBLAS(tbmv)
195
+ (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
196
+ 0, M, 1, z, 1);
197
+ }
198
+
199
+ /* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
200
+ static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
201
+ ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
202
+ scs_int max_its, scs_float tol) {
203
+ scs_int i, n = A->n;
204
+ scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
205
+ scs_float onef = 1.0, neg_onef = -1.0;
206
+ scs_float *p = pr->p; /* cg direction */
207
+ scs_float *Gp = pr->Gp; /* updated CG direction */
208
+ scs_float *r = pr->r; /* cg residual */
209
+ scs_float *z = pr->z; /* preconditioned */
210
+ scs_float *M = pr->M; /* preconditioner */
211
+ cublasHandle_t cublas_handle = pr->cublas_handle;
212
+
213
+ if (s == SCS_NULL) {
214
+ cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
215
+ cudaMemset(bg, 0, n * sizeof(scs_float));
216
+ } else {
217
+ /* p contains bg temporarily */
218
+ cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
219
+ /* bg contains s */
220
+ cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
221
+ mat_vec(A, stgs, pr, bg, r);
222
+ CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
223
+ CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
224
+ }
225
+
226
+ /* for some reason nrm2 is VERY slow */
227
+ /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
228
+ CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
229
+ nrm_r = SQRTF(nrm_r);
230
+ /* check to see if we need to run CG at all */
231
+ if (nrm_r < MIN(tol, 1e-18)) {
232
+ return 0;
233
+ }
234
+
235
+ apply_pre_conditioner(cublas_handle, M, z, r, n);
236
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
237
+ /* put z in p, replacing temp mem */
238
+ cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
239
+
240
+ for (i = 0; i < max_its; ++i) {
241
+ mat_vec(A, stgs, pr, p, Gp);
242
+
243
+ CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
244
+
245
+ alpha = ipzr / p_gp;
246
+ neg_alpha = -alpha;
247
+
248
+ CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
249
+ CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
250
+
251
+ /* for some reason nrm2 is VERY slow */
252
+ /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
253
+ CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
254
+ nrm_r = SQRTF(nrm_r);
255
+ if (nrm_r < tol) {
256
+ i++;
257
+ break;
258
+ }
259
+ ipzr_old = ipzr;
260
+ apply_pre_conditioner(cublas_handle, M, z, r, n);
261
+ CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
262
+
263
+ beta = ipzr / ipzr_old;
264
+ CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
265
+ CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
266
+ }
267
+ #if EXTRA_VERBOSE > 0
268
+ scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
269
+ #endif
270
+ return i;
271
+ }
272
+
273
+ scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
274
+ ScsLinSysWork *p, scs_float *b, const scs_float *s,
275
+ scs_int iter) {
276
+ scs_int cg_its;
277
+ SCS(timer) linsys_timer;
278
+ scs_float *bg = p->bg;
279
+ scs_float neg_onef = -1.0;
280
+ ScsGpuMatrix *Ag = p->Ag;
281
+ scs_float cg_tol =
282
+ SCS(norm)(b, Ag->n) *
283
+ (iter < 0 ? CG_BEST_TOL
284
+ : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
285
+ SCS(tic)(&linsys_timer);
286
+ /* all on GPU */
287
+ cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
288
+ SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
289
+ /* solves (I+A'A)x = b, s warm start, solution stored in b */
290
+ cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
291
+ CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
292
+ SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
293
+ cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
294
+
295
+ if (iter >= 0) {
296
+ p->tot_cg_its += cg_its;
297
+ }
298
+
299
+ p->total_solve_time += SCS(tocq)(&linsys_timer);
300
+ #if EXTRAVERBOSE > 0
301
+ scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
302
+ #endif
303
+ return 0;
304
+ }
@@ -0,0 +1,36 @@
1
+ #ifndef PRIV_H_GUARD
2
+ #define PRIV_H_GUARD
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ #include "gpu.h"
9
+ #include "glbopts.h"
10
+ #include "linalg.h"
11
+ #include "scs.h"
12
+
13
+
14
+ struct SCS_LIN_SYS_WORK {
15
+ /* reporting */
16
+ scs_int tot_cg_its;
17
+ scs_float total_solve_time;
18
+ /* ALL BELOW HOSTED ON THE GPU */
19
+ scs_float *p; /* cg iterate, n */
20
+ scs_float *r; /* cg residual, n */
21
+ scs_float *Gp; /* G * p, n */
22
+ scs_float *bg; /* b, n */
23
+ scs_float *tmp_m; /* m, used in mat_vec */
24
+ scs_float *z; /* preconditioned */
25
+ scs_float *M; /* preconditioner */
26
+ ScsGpuMatrix *Ag; /* A matrix on GPU */
27
+ ScsGpuMatrix *Agt; /* A trans matrix on GPU */
28
+ /* CUDA */
29
+ cublasHandle_t cublas_handle;
30
+ cusparseHandle_t cusparse_handle;
31
+ };
32
+
33
+ #ifdef __cplusplus
34
+ }
35
+ #endif
36
+ #endif
data/vendor/scs/scs.mk ADDED
@@ -0,0 +1,181 @@
1
+ ifeq ($(OS),Windows_NT)
2
+ UNAME = CYGWINorMINGWorMSYS
3
+ else
4
+ UNAME = $(shell uname -s)
5
+ endif
6
+
7
+ #CC = gcc
8
+ # For cross-compiling with mingw use these.
9
+ #CC = i686-w64-mingw32-gcc -m32
10
+ #CC = x86_64-w64-mingw32-gcc-4.8
11
+ CUCC = $(CC) #Don't need to use nvcc, since using cuda blas APIs
12
+
13
+ # For GPU must add cuda libs to path, e.g.
14
+ # export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH
15
+
16
+ ifneq (, $(findstring CYGWIN, $(UNAME)))
17
+ ISWINDOWS := 1
18
+ else
19
+ ifneq (, $(findstring MINGW, $(UNAME)))
20
+ ISWINDOWS := 1
21
+ else
22
+ ifneq (, $(findstring MSYS, $(UNAME)))
23
+ ISWINDOWS := 1
24
+ else
25
+ ifneq (, $(findstring mingw, $(CC)))
26
+ ISWINDOWS := 1
27
+ else
28
+ ISWINDOWS := 0
29
+ endif
30
+ endif
31
+ endif
32
+ endif
33
+
34
+ ifeq ($(UNAME), Darwin)
35
+ # we're on apple, no need to link rt library
36
+ LDFLAGS += -lm
37
+ SHARED = dylib
38
+ SONAME = -install_name
39
+ else
40
+ ifeq ($(ISWINDOWS), 1)
41
+ # we're on windows (cygwin or msys)
42
+ LDFLAGS += -lm
43
+ SHARED = dll
44
+ SONAME = -soname
45
+ else
46
+ # we're on a linux system, use accurate timer provided by clock_gettime()
47
+ LDFLAGS += -lm -lrt
48
+ SHARED = so
49
+ SONAME = -soname
50
+ endif
51
+ endif
52
+
53
+ #TODO: check if this works for all platforms:
54
+ ifeq ($(CUDA_PATH), )
55
+ CUDA_PATH=/usr/local/cuda
56
+ endif
57
+ CULDFLAGS = -L$(CUDA_PATH)/lib -L$(CUDA_PATH)/lib64 -lcudart -lcublas -lcusparse
58
+ CUDAFLAGS = $(CFLAGS) -I$(CUDA_PATH)/include -Ilinsys/gpu -Wno-c++11-long-long # turn off annoying long-long warnings in cuda header files
59
+
60
+ # Add on default CFLAGS
61
+ OPT = -O3
62
+ override CFLAGS += -g -Wall -Wwrite-strings -pedantic -funroll-loops -Wstrict-prototypes -I. -Iinclude -Ilinsys $(OPT)
63
+ ifneq ($(ISWINDOWS), 1)
64
+ override CFLAGS += -fPIC
65
+ endif
66
+
67
+ LINSYS = linsys
68
+ DIRSRC = $(LINSYS)/cpu/direct
69
+ INDIRSRC = $(LINSYS)/cpu/indirect
70
+ GPUDIR = $(LINSYS)/gpu/direct
71
+ GPUINDIR = $(LINSYS)/gpu/indirect
72
+
73
+ EXTSRC = $(LINSYS)/external
74
+
75
+ OUT = out
76
+ AR = ar
77
+ ARFLAGS = rv
78
+ ARCHIVE = $(AR) $(ARFLAGS)
79
+ RANLIB = ranlib
80
+ INSTALL = install
81
+
82
+ ifeq ($(PREFIX),)
83
+ PREFIX = /usr/local
84
+ endif
85
+
86
+ OPT_FLAGS =
87
+ ########### OPTIONAL FLAGS ##########
88
+ # these can all be override from the command line
89
+ # e.g. make DLONG=1 will override the setting below
90
+ DLONG = 0
91
+ ifneq ($(DLONG), 0)
92
+ OPT_FLAGS += -DDLONG=$(DLONG) # use longs rather than ints
93
+ endif
94
+ CTRLC = 1
95
+ ifneq ($(CTRLC), 0)
96
+ OPT_FLAGS += -DCTRLC=$(CTRLC) # graceful interrupts with ctrl-c
97
+ endif
98
+ SFLOAT = 0
99
+ ifneq ($(SFLOAT), 0)
100
+ OPT_FLAGS += -DSFLOAT=$(SFLOAT) # use floats rather than doubles
101
+ endif
102
+ NOVALIDATE = 0
103
+ ifneq ($(NOVALIDATE), 0)
104
+ OPT_FLAGS += -DNOVALIDATE=$(NOVALIDATE)$ # remove data validation step
105
+ endif
106
+ NOTIMER = 0
107
+ ifneq ($(NOTIMER), 0)
108
+ OPT_FLAGS += -DNOTIMER=$(NOTIMER) # no timing, times reported as nan
109
+ endif
110
+ COPYAMATRIX = 1
111
+ ifneq ($(COPYAMATRIX), 0)
112
+ OPT_FLAGS += -DCOPYAMATRIX=$(COPYAMATRIX) # if normalize, copy A
113
+ endif
114
+ GPU_TRANSPOSE_MAT = 1
115
+ ifneq ($(GPU_TRANSPOSE_MAT), 0)
116
+ OPT_FLAGS += -DGPU_TRANSPOSE_MAT=$(GPU_TRANSPOSE_MAT) # tranpose A mat in GPU memory
117
+ endif
118
+
119
+ ### VERBOSITY LEVELS: 0,1,2
120
+ EXTRA_VERBOSE = 0
121
+ ifneq ($(EXTRA_VERBOSE), 0)
122
+ OPT_FLAGS += -DEXTRA_VERBOSE=$(EXTRA_VERBOSE) # extra verbosity level
123
+ endif
124
+
125
+ ############ OPENMP: ############
126
+ # set USE_OPENMP = 1 to allow openmp (multi-threaded matrix multiplies):
127
+ # set the number of threads to, for example, 4 by entering the command:
128
+ # export OMP_NUM_THREADS=4
129
+
130
+ USE_OPENMP = 0
131
+ ifneq ($(USE_OPENMP), 0)
132
+ override CFLAGS += -fopenmp
133
+ LDFLAGS += -lgomp
134
+ endif
135
+
136
+ ############ SDPS: BLAS + LAPACK ############
137
+ # set USE_LAPACK = 1 below to enable solving SDPs
138
+ # NB: point the libraries to the locations where
139
+ # you have blas and lapack installed
140
+
141
+ USE_LAPACK = 1
142
+ ifneq ($(USE_LAPACK), 0)
143
+ # edit these for your setup:
144
+ BLASLDFLAGS = -lblas -llapack #-lgfortran
145
+ LDFLAGS += $(BLASLDFLAGS)
146
+ OPT_FLAGS += -DUSE_LAPACK
147
+
148
+ BLAS64 = 0
149
+ ifneq ($(BLAS64), 0)
150
+ OPT_FLAGS += -DBLAS64=$(BLAS64) # if blas/lapack lib uses 64 bit ints
151
+ endif
152
+
153
+ NOBLASSUFFIX = 0
154
+ ifneq ($(NOBLASSUFFIX), 0)
155
+ OPT_FLAGS += -DNOBLASSUFFIX=$(NOBLASSUFFIX) # hack to strip blas suffix
156
+ endif
157
+
158
+ BLASSUFFIX = "_"
159
+ ifneq ($(BLASSUFFIX), "_")
160
+ OPT_FLAGS += -DBLASSUFFIX=$(BLASSUFFIX) # blas suffix (underscore usually)
161
+ endif
162
+ endif
163
+
164
+ MATLAB_MEX_FILE = 0
165
+ ifneq ($(MATLAB_MEX_FILE), 0)
166
+ OPT_FLAGS += -DMATLAB_MEX_FILE=$(MATLAB_MEX_FILE) # matlab mex
167
+ endif
168
+ PYTHON = 0
169
+ ifneq ($(PYTHON), 0)
170
+ OPT_FLAGS += -DPYTHON=$(PYTHON) # python extension
171
+ endif
172
+ USING_R = 0
173
+ ifneq ($(USING_R), 0)
174
+ OPT_FLAGS += -DUSING_R=$(USING_R) # R extension
175
+ endif
176
+
177
+ # debug to see var values, e.g. 'make print-OBJECTS' shows OBJECTS value
178
+ print-%: ; @echo $*=$($*)
179
+
180
+ override CFLAGS += $(OPT_FLAGS)
181
+ CUDAFLAGS += $(OPT_FLAGS)