scs 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/scs/ffi.rb +2 -0
- data/lib/scs/version.rb +1 -1
- data/vendor/scs/CITATION.cff +2 -2
- data/vendor/scs/CMakeLists.txt +136 -6
- data/vendor/scs/Makefile +53 -3
- data/vendor/scs/README.md +1 -1
- data/vendor/scs/include/cones.h +47 -2
- data/vendor/scs/include/glbopts.h +1 -1
- data/vendor/scs/include/scs.h +29 -0
- data/vendor/scs/include/scs_blas.h +4 -0
- data/vendor/scs/include/scs_types.h +3 -1
- data/vendor/scs/include/util_spectral_cones.h +45 -0
- data/vendor/scs/linsys/cpu/direct/private.c +3 -3
- data/vendor/scs/linsys/cpu/direct/private.h +2 -1
- data/vendor/scs/linsys/csparse.c +1 -1
- data/vendor/scs/linsys/cudss/direct/private.c +279 -0
- data/vendor/scs/linsys/cudss/direct/private.h +63 -0
- data/vendor/scs/linsys/external/qdldl/qdldl_types.h +1 -1
- data/vendor/scs/linsys/gpu/indirect/private.c +14 -21
- data/vendor/scs/scs.mk +17 -2
- data/vendor/scs/src/aa.c +8 -12
- data/vendor/scs/src/cones.c +783 -12
- data/vendor/scs/src/rw.c +15 -1
- data/vendor/scs/src/scs.c +4 -0
- data/vendor/scs/src/spectral_cones/logdeterminant/log_cone_IPM.c +660 -0
- data/vendor/scs/src/spectral_cones/logdeterminant/log_cone_Newton.c +279 -0
- data/vendor/scs/src/spectral_cones/logdeterminant/log_cone_wrapper.c +205 -0
- data/vendor/scs/src/spectral_cones/logdeterminant/logdet_cone.c +143 -0
- data/vendor/scs/src/spectral_cones/nuclear/ell1_cone.c +221 -0
- data/vendor/scs/src/spectral_cones/nuclear/nuclear_cone.c +99 -0
- data/vendor/scs/src/spectral_cones/sum-largest/sum_largest_cone.c +196 -0
- data/vendor/scs/src/spectral_cones/sum-largest/sum_largest_eval_cone.c +140 -0
- data/vendor/scs/src/spectral_cones/util_spectral_cones.c +52 -0
- data/vendor/scs/test/problems/complex_PSD.h +83 -0
- data/vendor/scs/test/rng.h +4 -4
- data/vendor/scs/test/run_tests.c +25 -0
- data/vendor/scs/test/spectral_cones_problems/exp_design.h +141 -0
- data/vendor/scs/test/spectral_cones_problems/graph_partitioning.h +275 -0
- data/vendor/scs/test/spectral_cones_problems/robust_pca.h +253 -0
- data/vendor/scs/test/spectral_cones_problems/several_logdet_cones.h +222 -0
- data/vendor/scs/test/spectral_cones_problems/several_nuc_cone.h +285 -0
- data/vendor/scs/test/spectral_cones_problems/several_sum_largest.h +420 -0
- metadata +21 -2
data/vendor/scs/linsys/cudss/direct/private.c
ADDED
@@ -0,0 +1,279 @@
+#include "private.h"
+#include "linsys.h"
+
+/* In case of error abort freeing p */
+#define CUDSS_CHECK_ABORT(call, p, fname)                                   \
+  do {                                                                      \
+    cudssStatus_t status = call;                                            \
+    if (status != CUDSS_STATUS_SUCCESS) {                                   \
+      scs_printf("CUDSS call " #fname " returned status = %d\n", status);   \
+      scs_free_lin_sys_work(p);                                             \
+      return SCS_NULL;                                                      \
+    }                                                                       \
+  } while (0);
+
+/* In case of error abort freeing p */
+#define CUDA_CHECK_ABORT(call, p, fname)                                    \
+  do {                                                                      \
+    cudaError_t status = call;                                              \
+    if (status != cudaSuccess) {                                            \
+      printf("CUDA call " #fname " returned status = %d\n", status);        \
+      scs_free_lin_sys_work(p);                                             \
+      return SCS_NULL;                                                      \
+    }                                                                       \
+  } while (0);
+
+/* Return the linear system method name */
+const char *scs_get_lin_sys_method() {
+  return "sparse-direct-cuDSS";
+}
+
+/* Free allocated resources for the linear system solver */
+void scs_free_lin_sys_work(ScsLinSysWork *p) {
+  if (p) {
+    /* Free GPU resources */
+    if (p->d_kkt_val)
+      cudaFree(p->d_kkt_val);
+    if (p->d_kkt_row_ptr)
+      cudaFree(p->d_kkt_row_ptr);
+    if (p->d_kkt_col_ind)
+      cudaFree(p->d_kkt_col_ind);
+    if (p->d_b)
+      cudaFree(p->d_b);
+    if (p->d_sol)
+      cudaFree(p->d_sol);
+
+    /* Free cuDSS resources */
+    if (p->d_kkt_mat)
+      cudssMatrixDestroy(p->d_kkt_mat);
+    if (p->d_b_mat)
+      cudssMatrixDestroy(p->d_b_mat);
+    if (p->d_sol_mat)
+      cudssMatrixDestroy(p->d_sol_mat);
+
+    if (p->solver_config)
+      cudssConfigDestroy(p->solver_config);
+    if (p->solver_data && p->handle)
+      cudssDataDestroy(p->handle, p->solver_data);
+    if (p->handle)
+      cudssDestroy(p->handle);
+
+    /* Free CPU resources */
+    if (p->kkt)
+      SCS(cs_spfree)(p->kkt);
+    if (p->sol)
+      scs_free(p->sol);
+    if (p->diag_r_idxs)
+      scs_free(p->diag_r_idxs);
+    if (p->diag_p)
+      scs_free(p->diag_p);
+
+    scs_free(p);
+  }
+}
+
+/* Initialize the linear system solver workspace */
+ScsLinSysWork *scs_init_lin_sys_work(const ScsMatrix *A, const ScsMatrix *P,
+                                     const scs_float *diag_r) {
+  ScsLinSysWork *p = scs_calloc(1, sizeof(ScsLinSysWork));
+  if (!p)
+    return SCS_NULL;
+
+  /* Store problem dimensions */
+  p->n = A->n;
+  p->m = A->m;
+  p->n_plus_m = p->n + p->m;
+
+  /* Allocate CPU memory */
+  p->sol = (scs_float *)scs_malloc(sizeof(scs_float) * p->n_plus_m);
+  if (!p->sol) {
+    scs_free_lin_sys_work(p);
+    return SCS_NULL;
+  }
+
+  p->diag_r_idxs = (scs_int *)scs_calloc(p->n_plus_m, sizeof(scs_int));
+  if (!p->diag_r_idxs) {
+    scs_free_lin_sys_work(p);
+    return SCS_NULL;
+  }
+
+  p->diag_p = (scs_float *)scs_calloc(p->n, sizeof(scs_float));
+  if (!p->diag_p) {
+    scs_free_lin_sys_work(p);
+    return SCS_NULL;
+  }
+
+  /* Form KKT matrix as upper-triangular, CSC */
+  /* Because of symmetry it is equivalent to lower-triangular, CSR */
+  p->kkt = SCS(form_kkt)(A, P, p->diag_p, diag_r, p->diag_r_idxs, 1);
+  if (!p->kkt) {
+    scs_printf("Error in forming KKT matrix");
+    scs_free_lin_sys_work(p);
+    return SCS_NULL;
+  }
+
+  cudssStatus_t status;
+  cudaError_t cuda_error;
+
+  /* Create cuDSS handle */
+  CUDSS_CHECK_ABORT(cudssCreate(&p->handle), p, "cudssCreate");
+  /* Creating cuDSS solver configuration and data objects */
+
+  CUDSS_CHECK_ABORT(cudssConfigCreate(&p->solver_config), p,
+                    "cudssConfigCreate");
+  CUDSS_CHECK_ABORT(cudssDataCreate(p->handle, &p->solver_data), p,
+                    "cudssDataCreate");
+
+  /* Allocate device memory for KKT matrix */
+  scs_int nnz = p->kkt->p[p->n_plus_m];
+
+  CUDA_CHECK_ABORT(cudaMalloc((void **)&p->d_kkt_val, nnz * sizeof(scs_float)),
+                   p, "cudaMalloc: kkt_val");
+  CUDA_CHECK_ABORT(cudaMalloc((void **)&p->d_kkt_row_ptr,
+                              (p->n_plus_m + 1) * sizeof(scs_int)),
+                   p, "cudaMalloc: kkt_row_ptr");
+  CUDA_CHECK_ABORT(
+      cudaMalloc((void **)&p->d_kkt_col_ind, nnz * sizeof(scs_int)), p,
+      "cudaMalloc: kkt_col_ind");
+
+  /* Copy KKT matrix to device */
+  /* Note: we treat column pointers (p->kkt->p) as row pointers on the device */
+  CUDA_CHECK_ABORT(cudaMemcpy(p->d_kkt_val, p->kkt->x, nnz * sizeof(scs_float),
+                              cudaMemcpyHostToDevice),
+                   p, "cudaMemcpy: kkt_val");
+  CUDA_CHECK_ABORT(cudaMemcpy(p->d_kkt_row_ptr, p->kkt->p,
+                              (p->kkt->n + 1) * sizeof(scs_int),
+                              cudaMemcpyHostToDevice),
+                   p, "cudaMemcpy: kkt_row_ptr");
+  CUDA_CHECK_ABORT(cudaMemcpy(p->d_kkt_col_ind, p->kkt->i,
+                              nnz * sizeof(scs_int), cudaMemcpyHostToDevice),
+                   p, "cudaMemcpy: kkt_col_ind");
+
+  /* Create kkt matrix descriptor */
+  /* We pass the kkt matrix as symmetric, lower triangular */
+  cudssMatrixType_t mtype = CUDSS_MTYPE_SYMMETRIC;
+  cudssMatrixViewType_t mview = CUDSS_MVIEW_LOWER;
+  cudssIndexBase_t base = CUDSS_BASE_ZERO;
+  CUDSS_CHECK_ABORT(cudssMatrixCreateCsr(
+                        &p->d_kkt_mat, p->kkt->m, p->kkt->n, nnz,
+                        p->d_kkt_row_ptr, NULL, p->d_kkt_col_ind, p->d_kkt_val,
+                        SCS_CUDA_INDEX, SCS_CUDA_FLOAT, mtype, mview, base),
+                    p, "cudssMatrixCreateCsr");
+
+  /* Allocate device memory for vectors */
+  CUDA_CHECK_ABORT(
+      cudaMalloc((void **)&p->d_b, p->n_plus_m * sizeof(scs_float)), p,
+      "cudaMalloc: b");
+  CUDA_CHECK_ABORT(
+      cudaMalloc((void **)&p->d_sol, p->n_plus_m * sizeof(scs_float)), p,
+      "cudaMalloc: sol");
+
+  /* Create RHS and solution matrix descriptors */
+  scs_int nrhs = 1;
+  CUDSS_CHECK_ABORT(cudssMatrixCreateDn(&p->d_b_mat, p->n_plus_m, nrhs,
+                                        p->n_plus_m, p->d_b, SCS_CUDA_FLOAT,
+                                        CUDSS_LAYOUT_COL_MAJOR),
+                    p, "cudssMatrixCreateDn: b");
+  CUDSS_CHECK_ABORT(cudssMatrixCreateDn(&p->d_sol_mat, p->n_plus_m, nrhs,
+                                        p->n_plus_m, p->d_sol, SCS_CUDA_FLOAT,
+                                        CUDSS_LAYOUT_COL_MAJOR),
+                    p, "cudssMatrixCreateDn: sol");
+
+  /* Symbolic factorization */
+  CUDSS_CHECK_ABORT(cudssExecute(p->handle, CUDSS_PHASE_ANALYSIS,
+                                 p->solver_config, p->solver_data, p->d_kkt_mat,
+                                 p->d_sol_mat, p->d_b_mat),
+                    p, "cudssExecute: analysis");
+
+  /* Numerical Factorization */
+  CUDSS_CHECK_ABORT(cudssExecute(p->handle, CUDSS_PHASE_FACTORIZATION,
+                                 p->solver_config, p->solver_data, p->d_kkt_mat,
+                                 p->d_sol_mat, p->d_b_mat),
+                    p, "cudssExecute: factorization");
+
+  return p;
+}
+
+/* Solve the linear system for a given RHS b */
+scs_int scs_solve_lin_sys(ScsLinSysWork *p, scs_float *b, const scs_float *ws,
+                          scs_float tol) {
+  /* Copy right-hand side to device */
+  cudaError_t custatus = cudaMemcpy(p->d_b, b, p->n_plus_m * sizeof(scs_float),
+                                    cudaMemcpyHostToDevice);
+  if (custatus != cudaSuccess) {
+    scs_printf("scs_solve_lin_sys: Error copying `b` side to device: %d\n",
+               (int)custatus);
+    return custatus;
+  }
+
+  // is this really needed?
+  cudssMatrixSetValues(p->d_b_mat, p->d_b);
+
+  /* Solve the system */
+  cudssStatus_t status =
+      cudssExecute(p->handle, CUDSS_PHASE_SOLVE, p->solver_config,
+                   p->solver_data, p->d_kkt_mat, p->d_sol_mat, p->d_b_mat);
+
+  if (status != CUDSS_STATUS_SUCCESS) {
+    scs_printf("scs_solve_lin_sys: Error during solve: %d\n", (int)status);
+    return status;
+  }
+
+  /* Copy solution back to host */
+  custatus = cudaMemcpy(b, p->d_sol, p->n_plus_m * sizeof(scs_float),
+                        cudaMemcpyDeviceToHost);
+  if (custatus != cudaSuccess) {
+    scs_printf("scs_solve_lin_sys: Error copying d_sol to host: %d\n",
+               (int)custatus);
+    return custatus;
+  }
+
+  return 0; /* Success */
+}
+
+/* Update the KKT matrix when R changes */
+void scs_update_lin_sys_diag_r(ScsLinSysWork *p, const scs_float *diag_r) {
+  scs_int i;
+
+  /* Update KKT matrix on CPU */
+  for (i = 0; i < p->n; ++i) {
+    /* top left is R_x + P */
+    p->kkt->x[p->diag_r_idxs[i]] = p->diag_p[i] + diag_r[i];
+  }
+  for (i = p->n; i < p->n + p->m; ++i) {
+    /* bottom right is -R_y */
+    p->kkt->x[p->diag_r_idxs[i]] = -diag_r[i];
+  }
+
+  /* Copy updated values to device */
+  cudaError_t custatus = cudaMemcpy(p->d_kkt_val, p->kkt->x,
+                                    p->kkt->p[p->n_plus_m] * sizeof(scs_float),
+                                    cudaMemcpyHostToDevice);
+  if (custatus != cudaSuccess) {
+    scs_printf(
+        "scs_update_lin_sys_diag_r: Error copying kkt->x to device: %d\n",
+        (int)custatus);
+    return;
+  }
+
+  /* Update the matrix values in cuDSS */
+  cudssStatus_t status;
+  status = cudssMatrixSetCsrPointers(p->d_kkt_mat, p->d_kkt_row_ptr, NULL,
+                                     p->d_kkt_col_ind, p->d_kkt_val);
+  if (status != CUDSS_STATUS_SUCCESS) {
+    scs_printf(
+        "scs_update_lin_sys_diag_r: Error updating kkt matrix on device: %d\n",
+        (int)status);
+    return;
+  }
+
+  /* Perform Refactorization with the updated matrix */
+  status =
+      cudssExecute(p->handle, CUDSS_PHASE_REFACTORIZATION, p->solver_config,
+                   p->solver_data, p->d_kkt_mat, p->d_sol_mat, p->d_b_mat);
+  if (status != CUDSS_STATUS_SUCCESS) {
+    scs_printf("scs_update_lin_sys_diag_r: Error during re-factorization: %d\n",
+               (int)status);
+    return;
+  }
+}
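The file above builds the KKT matrix once on the CPU as upper-triangular CSC and then registers the very same arrays with cuDSS as a lower-triangular CSR matrix (see the "we treat column pointers (p->kkt->p) as row pointers on the device" comment). That works because, for a symmetric matrix, the CSC arrays of the upper triangle and the CSR arrays of the lower triangle are identical. A minimal standalone sketch of that identity; the 3x3 matrix and the variable names are illustrative only, not taken from the package:

#include <stdio.h>

/* Symmetric A = [4 1 0; 1 3 2; 0 2 5].
 * csc_* stores the upper triangle of A column-by-column (CSC);
 * csr_* stores the lower triangle of A row-by-row (CSR).
 * Because A equals its transpose, the two sets of arrays coincide, which is
 * why an upper-triangular CSC KKT matrix can be described to a CSR-based
 * solver as lower triangular without any conversion. */
int main(void) {
  int csc_col_ptr[4] = {0, 1, 3, 5};
  int csc_row_ind[5] = {0, 0, 1, 1, 2};
  double csc_val[5] = {4, 1, 3, 2, 5};

  int csr_row_ptr[4] = {0, 1, 3, 5};
  int csr_col_ind[5] = {0, 0, 1, 1, 2};
  double csr_val[5] = {4, 1, 3, 2, 5};

  int i, same = 1;
  for (i = 0; i < 4; ++i)
    same = same && (csc_col_ptr[i] == csr_row_ptr[i]);
  for (i = 0; i < 5; ++i)
    same = same && (csc_row_ind[i] == csr_col_ind[i]) &&
           (csc_val[i] == csr_val[i]);
  printf("upper-CSC arrays == lower-CSR arrays: %s\n", same ? "yes" : "no");
  return 0;
}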
data/vendor/scs/linsys/cudss/direct/private.h
ADDED
@@ -0,0 +1,63 @@
+#ifndef PRIV_H_GUARD
+#define PRIV_H_GUARD
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef SFLOAT
+#define SCS_CUDA_FLOAT CUDA_R_64F
+#else
+#define SCS_CUDA_FLOAT CUDA_R_32F
+#endif
+
+#ifndef DLONG
+#define SCS_CUDA_INDEX CUDA_R_32I
+#else
+#define SCS_CUDA_INDEX CUDA_R_64I
+#endif
+
+#include "csparse.h"
+#include "linsys.h"
+#include <cuda_runtime.h>
+#include <cudss.h>
+
+struct SCS_LIN_SYS_WORK {
+  /* General problem dimensions */
+  scs_int n;        /* number of QP variables */
+  scs_int m;        /* number of QP constraints */
+  scs_int n_plus_m; /* dimension of the linear system */
+
+  /* CPU matrices and vectors */
+  ScsMatrix *kkt; /* KKT matrix in CSR format */
+  scs_float *sol; /* solution to the KKT system */
+
+  /* cuDSS handle and descriptors */
+  cudssHandle_t handle;    /* cuDSS library handle */
+  cudssMatrix_t d_kkt_mat; /* cuDSS matrix descriptors */
+  cudssMatrix_t d_b_mat;
+  cudssMatrix_t d_sol_mat;
+
+  /* Device memory for KKT matrix */
+  scs_float *d_kkt_val;   /* device copy of KKT values */
+  scs_int *d_kkt_row_ptr; /* device copy of KKT row pointers */
+  scs_int *d_kkt_col_ind; /* device copy of KKT column indices */
+
+  /* Device memory for vectors */
+  scs_float *d_b;   /* device copy of right-hand side */
+  scs_float *d_sol; /* device copy of solution */
+
+  /* These are required for matrix updates */
+  scs_int *diag_r_idxs; /* indices where R appears in the KKT matrix */
+  scs_float *diag_p;    /* Diagonal values of P */
+
+  /* cuDSS configuration */
+  cudssConfig_t solver_config; /* cuDSS solver handle */
+  cudssData_t solver_data;     /* cuDSS data handle */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
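For context, SCS defines scs_float as double unless SFLOAT is set and scs_int as a 32-bit int unless DLONG is set, which is what the SCS_CUDA_FLOAT/SCS_CUDA_INDEX mappings above mirror. A tiny sanity-check sketch for the default build (uses C11 _Static_assert; purely illustrative, not part of the package):

/* Default build: SFLOAT and DLONG are both undefined. */
typedef double scs_float_default; /* passed to cuDSS as CUDA_R_64F */
typedef int scs_int_default;      /* passed to cuDSS as CUDA_R_32I */

_Static_assert(sizeof(scs_float_default) == 8, "CUDA_R_64F is an 8-byte real");
_Static_assert(sizeof(scs_int_default) == 4, "CUDA_R_32I is a 4-byte index");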
data/vendor/scs/linsys/gpu/indirect/private.c
CHANGED
@@ -147,28 +147,24 @@ static void mat_vec(ScsLinSysWork *p, const scs_float *x, scs_float *y) {

   if (p->Pg) {
     /* y = R_x * x + P x */
-    SCS(accum_by_p_gpu)
-    (p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
-     &p->buffer);
+    SCS(accum_by_p_gpu)(p->Pg, p->dn_vec_n, p->dn_vec_n_p, p->cusparse_handle,
+                        &p->buffer_size, &p->buffer);
   }

   /* z = Ax */
 #if GPU_TRANSPOSE_MAT > 0
-  SCS(accum_by_atrans_gpu)
-  (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
-   &p->buffer);
+  SCS(accum_by_atrans_gpu)(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+                           &p->buffer_size, &p->buffer);
 #else
-  SCS(accum_by_a_gpu)
-  (p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
-   &p->buffer);
+  SCS(accum_by_a_gpu)(p->Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+                      &p->buffer_size, &p->buffer);
 #endif
   /* z = R_y^{-1} A x */
   scale_by_diag(p->cublas_handle, p->inv_r_y_gpu, z, p->m);

   /* y += A'z => y = R_x * x + P x + A' R_y^{-1} Ax */
-  SCS(accum_by_atrans_gpu)
-  (p->Ag, p->dn_vec_m, p->dn_vec_n_p, p->cusparse_handle, &p->buffer_size,
-   &p->buffer);
+  SCS(accum_by_atrans_gpu)(p->Ag, p->dn_vec_m, p->dn_vec_n_p,
+                           p->cusparse_handle, &p->buffer_size, &p->buffer);
 }

 /* P comes in upper triangular, expand to full
@@ -488,9 +484,8 @@ scs_int scs_solve_lin_sys(ScsLinSysWork *p, scs_float *b, const scs_float *s,
   cusparseDnVecSetValues(p->dn_vec_m, (void *)tmp_m); /* R * ry */
   cusparseDnVecSetValues(p->dn_vec_n, (void *)bg); /* rx */
   /* bg[:n] = rx + A' R ry */
-  SCS(accum_by_atrans_gpu)
-  (Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle, &p->buffer_size,
-   &p->buffer);
+  SCS(accum_by_atrans_gpu)(Ag, p->dn_vec_m, p->dn_vec_n, p->cusparse_handle,
+                           &p->buffer_size, &p->buffer);

   /* set max_iters to 10 * n (though in theory n is enough for any tol) */
   max_iters = 10 * Ag->n;
@@ -506,13 +501,11 @@ scs_int scs_solve_lin_sys(ScsLinSysWork *p, scs_float *b, const scs_float *s,

   /* b[n:] = Ax - ry */
 #if GPU_TRANSPOSE_MAT > 0
-  SCS(accum_by_atrans_gpu)
-  (p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
-   &p->buffer);
+  SCS(accum_by_atrans_gpu)(p->Agt, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+                           &p->buffer_size, &p->buffer);
 #else
-  SCS(accum_by_a_gpu)
-  (Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle, &p->buffer_size,
-   &p->buffer);
+  SCS(accum_by_a_gpu)(Ag, p->dn_vec_n, p->dn_vec_m, p->cusparse_handle,
+                      &p->buffer_size, &p->buffer);
 #endif

   /* bg[n:] = R_y^{-1} bg[n:] = R_y^{-1} (Ax - ry) = y */
data/vendor/scs/scs.mk
CHANGED
@@ -8,7 +8,6 @@ endif
 # For cross-compiling with mingw use these.
 #CC = i686-w64-mingw32-gcc -m32
 #CC = x86_64-w64-mingw32-gcc-4.8
-CUCC = $(CC) #Don't need to use nvcc, since using cuda blas APIs

 # For GPU must add cuda libs to path, e.g.
 # export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH
@@ -53,13 +52,19 @@ endif
 #TODO: check if this works for all platforms:
 ifeq ($(CUDA_PATH), )
 CUDA_PATH=/usr/local/cuda
+CUCC = $(CUDA_PATH)/bin/nvcc
 endif
+
 CULDFLAGS = -L$(CUDA_PATH)/lib -L$(CUDA_PATH)/lib64 -lcudart -lcublas -lcusparse
 CUDAFLAGS = $(CFLAGS) -I$(CUDA_PATH)/include -Ilinsys/gpu -Wno-c++11-long-long # turn off annoying long-long warnings in cuda header files

+CUDSS_FLAGS = -I$(CUDSS_PATH)/include -I$(CUDA_PATH)/include
+CUDSS_LDFLAGS = $(CULDFLAGS) -L$(CUDSS_PATH)/lib -lcudss
+
 # Add on default CFLAGS
 OPT = -O3
-override CFLAGS += -g -Wall -Wwrite-strings -pedantic -funroll-loops -Wstrict-prototypes -I. -Iinclude -Ilinsys $(OPT)
+INCLUDE = -I. -Iinclude -Ilinsys
+override CFLAGS += -g -Wall -Wwrite-strings -pedantic -funroll-loops -Wstrict-prototypes $(INCLUDE) $(OPT) -Werror=incompatible-pointer-types
 ifneq ($(ISWINDOWS), 1)
 override CFLAGS += -fPIC
 endif
@@ -70,6 +75,7 @@ INDIRSRC = $(LINSYS)/cpu/indirect
 GPUDIR = $(LINSYS)/gpu/direct
 GPUINDIR = $(LINSYS)/gpu/indirect
 MKLSRC = $(LINSYS)/mkl/direct
+CUDSSSRC = $(LINSYS)/cudss/direct

 EXTSRC = $(LINSYS)/external

@@ -174,6 +180,15 @@ ifneq ($(USE_LAPACK), 0)
 endif
 endif

+############ SPECTRAL CONES ############
+USE_SPECTRAL_CONES = 0
+ifneq ($(USE_SPECTRAL_CONES), 0)
+ifeq ($(USE_LAPACK), 0)
+$(error USE_SPECTRAL_CONES requires USE_LAPACK to be enabled)
+endif
+CUSTOM_FLAGS += -DUSE_SPECTRAL_CONES
+endif
+
 MATLAB_MEX_FILE = 0
 ifneq ($(MATLAB_MEX_FILE), 0)
 CUSTOM_FLAGS += -DMATLAB_MEX_FILE=$(MATLAB_MEX_FILE) # matlab mex
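A note on the USE_SPECTRAL_CONES switch added above: it defaults to 0 and the build errors out if LAPACK is disabled. Assuming the usual make command-line override convention, an invocation such as `make USE_LAPACK=1 USE_SPECTRAL_CONES=1` would add -DUSE_SPECTRAL_CONES to CUSTOM_FLAGS; the accompanying Makefile changes (+53 lines, not shown in this section) are presumably what compile the new src/spectral_cones sources under that flag.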
data/vendor/scs/src/aa.c
CHANGED
@@ -189,9 +189,8 @@ static void set_m(AaWork *a, aa_int len) {
   blas_int blen = (blas_int)len;
   aa_float onef = 1.0, zerof = 0.0, r;
   /* if len < mem this only uses len cols */
-  BLAS(gemm)
-  ("Trans", "No", &blen, &blen, &bdim, &onef, a->type1 ? a->S : a->Y, &bdim,
-   a->Y, &bdim, &zerof, a->M, &blen);
+  BLAS(gemm)("Trans", "No", &blen, &blen, &bdim, &onef, a->type1 ? a->S : a->Y,
+             &bdim, a->Y, &bdim, &zerof, a->M, &blen);
   if (a->regularization > 0) {
     r = compute_regularization(a, len);
     for (i = 0; i < len; ++i) {
|
|
287
286
|
aa_float onef = 1.0, neg_onef = -1.0;
|
288
287
|
aa_float one_m_relaxation = 1. - a->relaxation;
|
289
288
|
/* x_work = x - S * work */
|
290
|
-
BLAS(gemv)
|
291
|
-
|
292
|
-
a->x_work, &one);
|
289
|
+
BLAS(gemv)("NoTrans", &bdim, &blen, &neg_onef, a->S, &bdim, a->work, &one,
|
290
|
+
&onef, a->x_work, &one);
|
293
291
|
/* f = relaxation * f */
|
294
292
|
BLAS(scal)(&bdim, &a->relaxation, f, &one);
|
295
293
|
/* f += (1 - relaxation) * x_work */
|
@@ -306,9 +304,8 @@ static aa_float solve(aa_float *f, AaWork *a, aa_int len) {
|
|
306
304
|
aa_float onef = 1.0, zerof = 0.0, neg_onef = -1.0, aa_norm;
|
307
305
|
|
308
306
|
/* work = S'g or Y'g */
|
309
|
-
BLAS(gemv)
|
310
|
-
|
311
|
-
&zerof, a->work, &one);
|
307
|
+
BLAS(gemv)("Trans", &bdim, &blen, &onef, a->type1 ? a->S : a->Y, &bdim, a->g,
|
308
|
+
&one, &zerof, a->work, &one);
|
312
309
|
|
313
310
|
/* work = M \ work, where update_accel_params has set M = S'Y or M = Y'Y */
|
314
311
|
BLAS(gesv)(&blen, &one, a->M, &blen, a->ipiv, a->work, &blen, &info);
|
@@ -335,9 +332,8 @@ static aa_float solve(aa_float *f, AaWork *a, aa_int len) {
|
|
335
332
|
/* if solve was successful compute new point */
|
336
333
|
|
337
334
|
/* first set f -= D * work */
|
338
|
-
BLAS(gemv)
|
339
|
-
|
340
|
-
&one);
|
335
|
+
BLAS(gemv)("NoTrans", &bdim, &blen, &neg_onef, a->D, &bdim, a->work, &one,
|
336
|
+
&onef, f, &one);
|
341
337
|
|
342
338
|
/* if relaxation is not 1 then need to incorporate */
|
343
339
|
if (a->relaxation != 1.0) {
|