scs 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +12 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +98 -0
  5. data/ext/scs/extconf.rb +29 -0
  6. data/lib/scs.rb +17 -0
  7. data/lib/scs/ffi.rb +117 -0
  8. data/lib/scs/solver.rb +173 -0
  9. data/lib/scs/version.rb +3 -0
  10. data/vendor/scs/LICENSE.txt +21 -0
  11. data/vendor/scs/Makefile +164 -0
  12. data/vendor/scs/README.md +222 -0
  13. data/vendor/scs/include/aa.h +56 -0
  14. data/vendor/scs/include/cones.h +46 -0
  15. data/vendor/scs/include/ctrlc.h +33 -0
  16. data/vendor/scs/include/glbopts.h +177 -0
  17. data/vendor/scs/include/linalg.h +26 -0
  18. data/vendor/scs/include/linsys.h +64 -0
  19. data/vendor/scs/include/normalize.h +18 -0
  20. data/vendor/scs/include/rw.h +17 -0
  21. data/vendor/scs/include/scs.h +161 -0
  22. data/vendor/scs/include/scs_blas.h +51 -0
  23. data/vendor/scs/include/util.h +65 -0
  24. data/vendor/scs/linsys/amatrix.c +305 -0
  25. data/vendor/scs/linsys/amatrix.h +36 -0
  26. data/vendor/scs/linsys/amatrix.o +0 -0
  27. data/vendor/scs/linsys/cpu/direct/private.c +366 -0
  28. data/vendor/scs/linsys/cpu/direct/private.h +26 -0
  29. data/vendor/scs/linsys/cpu/direct/private.o +0 -0
  30. data/vendor/scs/linsys/cpu/indirect/private.c +256 -0
  31. data/vendor/scs/linsys/cpu/indirect/private.h +31 -0
  32. data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
  33. data/vendor/scs/linsys/external/amd/LICENSE.txt +934 -0
  34. data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +469 -0
  35. data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +254 -0
  36. data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
  37. data/vendor/scs/linsys/external/amd/amd.h +400 -0
  38. data/vendor/scs/linsys/external/amd/amd_1.c +180 -0
  39. data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
  40. data/vendor/scs/linsys/external/amd/amd_2.c +1842 -0
  41. data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
  42. data/vendor/scs/linsys/external/amd/amd_aat.c +184 -0
  43. data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
  44. data/vendor/scs/linsys/external/amd/amd_control.c +64 -0
  45. data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
  46. data/vendor/scs/linsys/external/amd/amd_defaults.c +37 -0
  47. data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
  48. data/vendor/scs/linsys/external/amd/amd_dump.c +179 -0
  49. data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
  50. data/vendor/scs/linsys/external/amd/amd_global.c +16 -0
  51. data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
  52. data/vendor/scs/linsys/external/amd/amd_info.c +119 -0
  53. data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
  54. data/vendor/scs/linsys/external/amd/amd_internal.h +304 -0
  55. data/vendor/scs/linsys/external/amd/amd_order.c +199 -0
  56. data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
  57. data/vendor/scs/linsys/external/amd/amd_post_tree.c +120 -0
  58. data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
  59. data/vendor/scs/linsys/external/amd/amd_postorder.c +206 -0
  60. data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
  61. data/vendor/scs/linsys/external/amd/amd_preprocess.c +118 -0
  62. data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
  63. data/vendor/scs/linsys/external/amd/amd_valid.c +92 -0
  64. data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
  65. data/vendor/scs/linsys/external/amd/changes +11 -0
  66. data/vendor/scs/linsys/external/qdldl/LICENSE +201 -0
  67. data/vendor/scs/linsys/external/qdldl/README.md +120 -0
  68. data/vendor/scs/linsys/external/qdldl/changes +4 -0
  69. data/vendor/scs/linsys/external/qdldl/qdldl.c +298 -0
  70. data/vendor/scs/linsys/external/qdldl/qdldl.h +177 -0
  71. data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
  72. data/vendor/scs/linsys/external/qdldl/qdldl_types.h +21 -0
  73. data/vendor/scs/linsys/gpu/gpu.c +41 -0
  74. data/vendor/scs/linsys/gpu/gpu.h +85 -0
  75. data/vendor/scs/linsys/gpu/indirect/private.c +304 -0
  76. data/vendor/scs/linsys/gpu/indirect/private.h +36 -0
  77. data/vendor/scs/scs.mk +181 -0
  78. data/vendor/scs/src/aa.c +224 -0
  79. data/vendor/scs/src/aa.o +0 -0
  80. data/vendor/scs/src/cones.c +802 -0
  81. data/vendor/scs/src/cones.o +0 -0
  82. data/vendor/scs/src/ctrlc.c +77 -0
  83. data/vendor/scs/src/ctrlc.o +0 -0
  84. data/vendor/scs/src/linalg.c +84 -0
  85. data/vendor/scs/src/linalg.o +0 -0
  86. data/vendor/scs/src/normalize.c +93 -0
  87. data/vendor/scs/src/normalize.o +0 -0
  88. data/vendor/scs/src/rw.c +167 -0
  89. data/vendor/scs/src/rw.o +0 -0
  90. data/vendor/scs/src/scs.c +978 -0
  91. data/vendor/scs/src/scs.o +0 -0
  92. data/vendor/scs/src/scs_version.c +5 -0
  93. data/vendor/scs/src/scs_version.o +0 -0
  94. data/vendor/scs/src/util.c +196 -0
  95. data/vendor/scs/src/util.o +0 -0
  96. data/vendor/scs/test/data/small_random_socp +0 -0
  97. data/vendor/scs/test/minunit.h +13 -0
  98. data/vendor/scs/test/problem_utils.h +93 -0
  99. data/vendor/scs/test/problems/rob_gauss_cov_est.h +85 -0
  100. data/vendor/scs/test/problems/small_lp.h +50 -0
  101. data/vendor/scs/test/problems/small_random_socp.h +33 -0
  102. data/vendor/scs/test/random_socp_prob.c +171 -0
  103. data/vendor/scs/test/run_from_file.c +69 -0
  104. data/vendor/scs/test/run_tests +2 -0
  105. data/vendor/scs/test/run_tests.c +32 -0
  106. metadata +203 -0
data/vendor/scs/linsys/external/qdldl/qdldl.h
@@ -0,0 +1,177 @@
+ #ifndef QDLDL_H
+ #define QDLDL_H
+
+ // Include qdldl type options
+ #include "qdldl_types.h"
+
+ # ifdef __cplusplus
+ extern "C" {
+ # endif // ifdef __cplusplus
+
+ /**
+  * Compute the elimination tree for a quasidefinite matrix
+  * in compressed sparse column form, where the input matrix is
+  * assumed to contain data for the upper triangular part of A only,
+  * and there are no duplicate indices.
+  *
+  * Returns an elimination tree for the factorization A = LDL^T and a
+  * count of the nonzeros in each column of L that are strictly below the
+  * diagonal.
+  *
+  * Does not use MALLOC. It is assumed that the arrays work, Lnz, and
+  * etree will be allocated with a number of elements equal to n.
+  *
+  * The data in (n,Ap,Ai) are from a square matrix A in CSC format, and
+  * should include the upper triangular part of A only.
+  *
+  * This function is only intended for factorisation of QD matrices specified
+  * by their upper triangular part. An error is returned if any column has
+  * data below the diagonal or is completely empty.
+  *
+  * For matrices with a non-empty column but a zero on the corresponding diagonal,
+  * this function will *not* return an error, as it may still be possible to factor
+  * such a matrix in LDL form. No promises are made in this case though...
+  *
+  * @param n      number of columns in CSC matrix A (assumed square)
+  * @param Ap     column pointers (size n+1) for columns of A
+  * @param Ai     row indices of A. Has Ap[n] elements
+  * @param work   work vector (size n) (no meaning on return)
+  * @param Lnz    count of nonzeros in each column of L (size n) below diagonal
+  * @param etree  elimination tree (size n)
+  * @return total sum of Lnz (i.e. total nonzeros in L below diagonal). Returns
+  *         -1 if the input does not have triu structure or has an empty
+  *         column.
+  *
+  */
+
+ QDLDL_int QDLDL_etree(const QDLDL_int n,
+                       const QDLDL_int* Ap,
+                       const QDLDL_int* Ai,
+                       QDLDL_int* work,
+                       QDLDL_int* Lnz,
+                       QDLDL_int* etree);
+
+ /**
+  * Compute an LDL decomposition for a quasidefinite matrix
+  * in compressed sparse column form, where the input matrix is
+  * assumed to contain data for the upper triangular part of A only,
+  * and there are no duplicate indices.
+  *
+  * Returns factors L, D and Dinv = 1./D.
+  *
+  * Does not use MALLOC. It is assumed that L will be a compressed
+  * sparse column matrix with data (Ln,Lp,Li) with sufficient space
+  * allocated, with a number of nonzeros equal to the count given
+  * as a return value by QDLDL_etree
+  *
+  * @param n      number of columns in L and A (both square)
+  * @param Ap     column pointers (size n+1) for columns of A
+  * @param Ai     row indices of A. Has Ap[n] elements
+  * @param Ln     number of columns in CSC matrix L
+  * @param Lp     column pointers (size Ln+1) for columns of L
+  * @param Li     row indices of L. Has Lp[Ln] elements
+  * @param D      vectorized factor D. Length is n
+  * @param Dinv   reciprocal of D. Length is n
+  * @param Lnz    count of nonzeros in each column of L below diagonal,
+  *               as given by QDLDL_etree (not modified)
+  * @param etree  elimination tree as given by QDLDL_etree (not modified)
+  * @param bwork  working array of bools. Length is n
+  * @param iwork  working array of integers. Length is 3*n
+  * @param fwork  working array of floats. Length is n
+  * @return Returns a count of the number of positive elements
+  *         in D. Returns -1 and exits immediately if any element
+  *         of D evaluates exactly to zero (matrix is not quasidefinite
+  *         or otherwise LDL factorisable)
+  *
+  */
+
+ QDLDL_int QDLDL_factor(const QDLDL_int n,
+                        const QDLDL_int* Ap,
+                        const QDLDL_int* Ai,
+                        const QDLDL_float* Ax,
+                        QDLDL_int* Lp,
+                        QDLDL_int* Li,
+                        QDLDL_float* Lx,
+                        QDLDL_float* D,
+                        QDLDL_float* Dinv,
+                        const QDLDL_int* Lnz,
+                        const QDLDL_int* etree,
+                        QDLDL_bool* bwork,
+                        QDLDL_int* iwork,
+                        QDLDL_float* fwork);
+
+ /**
+  * Solves LDL'x = b
+  *
+  * It is assumed that L will be a compressed
+  * sparse column matrix with data (Ln,Lp,Li).
+  *
+  * @param n     number of columns in L (both square)
+  * @param Ln    number of columns in CSC matrix L
+  * @param Lp    column pointers (size Ln+1) for columns of L
+  * @param Li    row indices of L. Has Lp[Ln] elements
+  * @param Dinv  reciprocal of D. Length is n
+  * @param x     initialized to b. Equal to x on return
+  *
+  */
+ void QDLDL_solve(const QDLDL_int n,
+                  const QDLDL_int* Lp,
+                  const QDLDL_int* Li,
+                  const QDLDL_float* Lx,
+                  const QDLDL_float* Dinv,
+                  QDLDL_float* x);
+
+ /**
+  * Solves (L+I)x = b
+  *
+  * It is assumed that L will be a compressed
+  * sparse column matrix with data (Ln,Lp,Li).
+  *
+  * @param n     number of columns in L (both square)
+  * @param Ln    number of columns in CSC matrix L
+  * @param Lp    column pointers (size Ln+1) for columns of L
+  * @param Li    row indices of L. Has Lp[Ln] elements
+  * @param Dinv  reciprocal of D. Length is n
+  * @param x     initialized to b. Equal to x on return
+  *
+  */
+
+ void QDLDL_Lsolve(const QDLDL_int n,
+                   const QDLDL_int* Lp,
+                   const QDLDL_int* Li,
+                   const QDLDL_float* Lx,
+                   QDLDL_float* x);
+
+ /**
+  * Solves (L+I)'x = b
+  *
+  * It is assumed that L will be a compressed
+  * sparse column matrix with data (Ln,Lp,Li).
+  *
+  * @param n     number of columns in L (both square)
+  * @param Ln    number of columns in CSC matrix L
+  * @param Lp    column pointers (size Ln+1) for columns of L
+  * @param Li    row indices of L. Has Lp[Ln] elements
+  * @param Dinv  reciprocal of D. Length is n
+  * @param x     initialized to b. Equal to x on return
+  *
+  */
+
+ void QDLDL_Ltsolve(const QDLDL_int n,
+                    const QDLDL_int* Lp,
+                    const QDLDL_int* Li,
+                    const QDLDL_float* Lx,
+                    QDLDL_float* x);
+
+ # ifdef __cplusplus
+ }
+ # endif // ifdef __cplusplus
+
+ #endif // ifndef QDLDL_H
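The three declarations above form a two-pass pipeline: QDLDL_etree for the symbolic analysis, QDLDL_factor for the numeric factorization, and QDLDL_solve for the triangular solves. A minimal usage sketch, assuming the vendored include path (so QDLDL_int/QDLDL_float resolve to scs_int/scs_float via glbopts.h); the 2x2 matrix and the fixed buffer sizes are illustrative, not part of the package:

/* qdldl_example.c: two-pass QDLDL usage sketch (illustrative only).
   Solves A x = b for the quasidefinite A = [4 1; 1 -3], supplied as its
   upper triangle in CSC form. */
#include <stdio.h>
#include "qdldl.h"

int main(void) {
  const QDLDL_int n = 2;
  QDLDL_int   Ap[] = {0, 1, 3};        /* column pointers (upper triangle) */
  QDLDL_int   Ai[] = {0, 0, 1};        /* row indices */
  QDLDL_float Ax[] = {4.0, 1.0, -3.0}; /* values */

  QDLDL_int etree[2], Lnz[2], iwork[3 * 2];
  QDLDL_bool bwork[2];
  QDLDL_float fwork[2], D[2], Dinv[2];

  /* pass 1: elimination tree; returns nnz(L) below the diagonal, -1 on error */
  QDLDL_int sum_Lnz = QDLDL_etree(n, Ap, Ai, iwork, Lnz, etree);
  if (sum_Lnz < 0) return 1; /* not upper-triangular, or an empty column */

  /* pass 2: numeric factorization A = L D L'; for this matrix sum_Lnz == 1,
     so fixed-size L buffers suffice (in general allocate sum_Lnz entries) */
  QDLDL_int Lp[3], Li[1];
  QDLDL_float Lx[1];
  if (QDLDL_factor(n, Ap, Ai, Ax, Lp, Li, Lx, D, Dinv, Lnz, etree, bwork,
                   iwork, fwork) < 0) return 1; /* zero pivot in D */

  /* in-place solve: x enters holding b and exits holding the solution */
  QDLDL_float x[] = {1.0, 2.0};
  QDLDL_solve(n, Lp, Li, Lx, Dinv, x);
  printf("x = [%g, %g]\n", (double)x[0], (double)x[1]); /* ~[0.3846, -0.5385] */
  return 0;
}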
data/vendor/scs/linsys/external/qdldl/qdldl_types.h
@@ -0,0 +1,21 @@
+ #ifndef QDLDL_TYPES_H
+ # define QDLDL_TYPES_H
+
+ #include "glbopts.h"
+
+ # ifdef __cplusplus
+ extern "C" {
+ # endif /* ifdef __cplusplus */
+
+ // QDLDL integer and float types
+
+ #define QDLDL_int scs_int
+ #define QDLDL_float scs_float
+ #define QDLDL_bool scs_int
+
+ # ifdef __cplusplus
+ }
+ # endif /* ifdef __cplusplus */
+
+ #endif /* ifndef QDLDL_TYPES_H */
+
data/vendor/scs/linsys/gpu/gpu.c
@@ -0,0 +1,41 @@
+ #include "gpu.h"
+
+ void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
+                                scs_float *y, cusparseHandle_t cusparse_handle) {
+   /* y += A'*x
+      x and y MUST be on GPU already
+   */
+   const scs_float onef = 1.0;
+   CUSPARSE(csrmv)
+   (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, Ag->n, Ag->m, Ag->Annz,
+    &onef, Ag->descr, Ag->x, Ag->p, Ag->i, x, &onef, y);
+ }
+
+ void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
+                           scs_float *y, cusparseHandle_t cusparse_handle) {
+   /* y += A*x
+      x and y MUST be on GPU already
+   */
+   const scs_float onef = 1.0;
+   /* The A matrix idx pointers must be ORDERED */
+   CUSPARSE(csrmv)
+   (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, Ag->n, Ag->m, Ag->Annz, &onef,
+    Ag->descr, Ag->x, Ag->p, Ag->i, x, &onef, y);
+ }
+
+ void SCS(free_gpu_matrix)(ScsGpuMatrix *A) {
+   cudaFree(A->x);
+   cudaFree(A->i);
+   cudaFree(A->p);
+   cusparseDestroyMatDescr(A->descr);
+ }
+
+ void SCS(normalize_a)(ScsMatrix *A, const ScsSettings *stgs, const ScsCone *k,
+                       ScsScaling *scal) {
+   SCS(_normalize_a)(A, stgs, k, scal);
+ }
+
+ void SCS(un_normalize_a)(ScsMatrix *A, const ScsSettings *stgs,
+                          const ScsScaling *scal) {
+   SCS(_un_normalize_a)(A, stgs, scal);
+ }
data/vendor/scs/linsys/gpu/gpu.h
@@ -0,0 +1,85 @@
+ #ifndef SCSGPU_H_GUARD
+ #define SCSGPU_H_GUARD
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #include <cublas_v2.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include <cusparse.h>
+
+ #include "amatrix.h"
+ #include "glbopts.h"
+ #include "linalg.h"
+ #include "linsys.h"
+ #include "scs.h"
+ #include "util.h"
+
+ #define CUDA_CHECK_ERR                                                      \
+   do {                                                                      \
+     cudaError_t err = cudaGetLastError();                                   \
+     if (err != cudaSuccess) {                                               \
+       printf("%s:%d:%s\n ERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,   \
+              cudaGetErrorString(err));                                      \
+     }                                                                       \
+   } while (0)
+
+ #ifndef EXTRA_VERBOSE
+ #ifndef SFLOAT
+ #define CUBLAS(x) cublasD##x
+ #define CUSPARSE(x) cusparseD##x
+ #else
+ #define CUBLAS(x) cublasS##x
+ #define CUSPARSE(x) cusparseS##x
+ #endif
+ #else
+ #ifndef SFLOAT
+ #define CUBLAS(x) \
+   CUDA_CHECK_ERR; \
+   cublasD##x
+ #define CUSPARSE(x) \
+   CUDA_CHECK_ERR;   \
+   cusparseD##x
+ #else
+ #define CUBLAS(x) \
+   CUDA_CHECK_ERR; \
+   cublasS##x
+ #define CUSPARSE(x) \
+   CUDA_CHECK_ERR;   \
+   cusparseS##x
+ #endif
+ #endif
+
+ /*
+  CUDA matrix routines only for CSR, not CSC matrices:
+    CSC         CSR         GPU    Mult
+    A  (m x n)  A' (n x m)  Ag     accum_by_a_trans_gpu
+    A' (n x m)  A  (m x n)  Agt    accum_by_a_gpu
+ */
+
+ /* this struct defines the data matrix A on GPU */
+ typedef struct SCS_GPU_A_DATA_MATRIX {
+   /* A is supplied in column compressed format */
+   scs_float *x; /* A values, size: NNZ A */
+   scs_int *i;   /* A row index, size: NNZ A */
+   scs_int *p;   /* A column pointer, size: n+1 */
+   scs_int m, n; /* m rows, n cols */
+   scs_int Annz; /* num non-zeros in A matrix */
+   /* CUDA */
+   cusparseMatDescr_t descr;
+ } ScsGpuMatrix;
+
+ void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *A, const scs_float *x,
+                                scs_float *y, cusparseHandle_t cusparse_handle);
+
+ void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *A, const scs_float *x,
+                           scs_float *y, cusparseHandle_t cusparse_handle);
+
+ void SCS(free_gpu_matrix)(ScsGpuMatrix *A);
+
+ #ifdef __cplusplus
+ }
+ #endif
+ #endif
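The CSC/CSR table above is the central trick of this backend: the CSC arrays of A, handed to cuSPARSE as if they were CSR, describe A' with no conversion at all, so y += A'x needs no transpose op and y += Ax uses CUSPARSE_OPERATION_TRANSPOSE (or the pre-transposed Agt). A standalone sketch of the same duality, assuming the legacy cusparse<t>csrmv API this vendored code targets (removed in newer CUDA toolkits); the 3x2 matrix and all names here are illustrative:

/* csc_trick_example.c: the CSC arrays of A (3x2), read as CSR, describe
   A' (2x3), so a NON_TRANSPOSE csrmv computes y += A'x (illustrative only). */
#include <stdio.h>
#include <cuda_runtime.h>
#include <cusparse.h>

int main(void) {
  /* A = [1 2; 0 3; 4 0], m = 3 rows, n = 2 cols, in CSC form: */
  const int m = 3, n = 2, nnz = 4;
  int    Ap[] = {0, 2, 4};       /* column pointers */
  int    Ai[] = {0, 2, 0, 1};    /* row indices */
  double Ax[] = {1.0, 4.0, 2.0, 3.0};
  double x[]  = {1.0, 1.0, 1.0}; /* length m */
  double y[]  = {0.0, 0.0};      /* length n, accumulates A'x */

  int *d_Ap, *d_Ai;
  double *d_Ax, *d_x, *d_y;
  cudaMalloc((void **)&d_Ap, sizeof(Ap));
  cudaMalloc((void **)&d_Ai, sizeof(Ai));
  cudaMalloc((void **)&d_Ax, sizeof(Ax));
  cudaMalloc((void **)&d_x, sizeof(x));
  cudaMalloc((void **)&d_y, sizeof(y));
  cudaMemcpy(d_Ap, Ap, sizeof(Ap), cudaMemcpyHostToDevice);
  cudaMemcpy(d_Ai, Ai, sizeof(Ai), cudaMemcpyHostToDevice);
  cudaMemcpy(d_Ax, Ax, sizeof(Ax), cudaMemcpyHostToDevice);
  cudaMemcpy(d_x, x, sizeof(x), cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, y, sizeof(y), cudaMemcpyHostToDevice);

  cusparseHandle_t handle;
  cusparseCreate(&handle);
  cusparseMatDescr_t descr;
  cusparseCreateMatDescr(&descr);
  cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

  const double one = 1.0;
  /* the arrays describe A' (n x m) in CSR, so no transpose op is needed */
  cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, m, nnz, &one,
                 descr, d_Ax, d_Ap, d_Ai, d_x, &one, d_y);

  cudaMemcpy(y, d_y, sizeof(y), cudaMemcpyDeviceToHost);
  printf("A'x = [%g, %g]\n", y[0], y[1]); /* expect [5, 5] */

  cusparseDestroyMatDescr(descr);
  cusparseDestroy(handle);
  cudaFree(d_Ap); cudaFree(d_Ai); cudaFree(d_Ax); cudaFree(d_x); cudaFree(d_y);
  return 0;
}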
data/vendor/scs/linsys/gpu/indirect/private.c
@@ -0,0 +1,304 @@
+ #include "private.h"
+
+ #define CG_BEST_TOL 1e-9
+ #define CG_MIN_TOL 1e-1
+
+ /* do not use within pcg, reuses memory */
+ void SCS(accum_by_atrans)(const ScsMatrix *A, ScsLinSysWork *p,
+                           const scs_float *x, scs_float *y) {
+   scs_float *v_m = p->tmp_m;
+   scs_float *v_n = p->r;
+   cudaMemcpy(v_m, x, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
+   cudaMemcpy(v_n, y, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   SCS(_accum_by_atrans_gpu)(p->Ag, v_m, v_n, p->cusparse_handle);
+   cudaMemcpy(y, v_n, A->n * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ }
+
+ /* do not use within pcg, reuses memory */
+ void SCS(accum_by_a)(const ScsMatrix *A, ScsLinSysWork *p, const scs_float *x,
+                      scs_float *y) {
+   scs_float *v_m = p->tmp_m;
+   scs_float *v_n = p->r;
+   cudaMemcpy(v_n, x, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   cudaMemcpy(v_m, y, A->m * sizeof(scs_float), cudaMemcpyHostToDevice);
+ #if GPU_TRANSPOSE_MAT > 0
+   SCS(_accum_by_atrans_gpu)(p->Agt, v_n, v_m, p->cusparse_handle);
+ #else
+   SCS(_accum_by_a_gpu)(p->Ag, v_n, v_m, p->cusparse_handle);
+ #endif
+   cudaMemcpy(y, v_m, A->m * sizeof(scs_float), cudaMemcpyDeviceToHost);
+ }
+
+ char *SCS(get_lin_sys_method)(const ScsMatrix *A, const ScsSettings *stgs) {
+   char *str = (char *)scs_malloc(sizeof(char) * 128);
+   sprintf(str, "sparse-indirect GPU, nnz in A = %li, CG tol ~ 1/iter^(%2.2f)",
+           (long)A->p[A->n], stgs->cg_rate);
+   return str;
+ }
+
+ char *SCS(get_lin_sys_summary)(ScsLinSysWork *p, const ScsInfo *info) {
+   char *str = (char *)scs_malloc(sizeof(char) * 128);
+   sprintf(str,
+           "\tLin-sys: avg # CG iterations: %2.2f, avg solve time: %1.2es\n",
+           (scs_float)p->tot_cg_its / (info->iter + 1),
+           p->total_solve_time / (info->iter + 1) / 1e3);
+   p->tot_cg_its = 0;
+   p->total_solve_time = 0;
+   return str;
+ }
+
+ void SCS(free_lin_sys_work)(ScsLinSysWork *p) {
+   if (p) {
+     cudaFree(p->p);
+     cudaFree(p->r);
+     cudaFree(p->Gp);
+     cudaFree(p->bg);
+     cudaFree(p->tmp_m);
+     cudaFree(p->z);
+     cudaFree(p->M);
+     if (p->Ag) {
+       SCS(free_gpu_matrix)(p->Ag);
+       scs_free(p->Ag);
+     }
+     if (p->Agt) {
+       SCS(free_gpu_matrix)(p->Agt);
+       scs_free(p->Agt);
+     }
+     cusparseDestroy(p->cusparse_handle);
+     cublasDestroy(p->cublas_handle);
+     /* Don't reset because it interferes with other GPU programs. */
+     /* cudaDeviceReset(); */
+     scs_free(p);
+   }
+ }
+
+ /* y = (RHO_X * I + A'A)x */
+ static void mat_vec(const ScsGpuMatrix *A, const ScsSettings *s,
+                     ScsLinSysWork *p, const scs_float *x, scs_float *y) {
+   /* x and y MUST already be loaded to GPU */
+   scs_float *tmp_m = p->tmp_m; /* temp memory */
+   cudaMemset(tmp_m, 0, A->m * sizeof(scs_float));
+   SCS(_accum_by_a_gpu)(A, x, tmp_m, p->cusparse_handle);
+   cudaMemset(y, 0, A->n * sizeof(scs_float));
+   SCS(_accum_by_atrans_gpu)(A, tmp_m, y, p->cusparse_handle);
+   CUBLAS(axpy)(p->cublas_handle, A->n, &(s->rho_x), x, 1, y, 1);
+ }
+
+ /* M = inv ( diag ( RHO_X * I + A'A ) ) */
+ static void get_preconditioner(const ScsMatrix *A, const ScsSettings *stgs,
+                                ScsLinSysWork *p) {
+   scs_int i;
+   scs_float *M = (scs_float *)scs_malloc(A->n * sizeof(scs_float));
+
+ #if EXTRA_VERBOSE > 0
+   scs_printf("getting pre-conditioner\n");
+ #endif
+
+   for (i = 0; i < A->n; ++i) {
+     M[i] = 1 / (stgs->rho_x +
+                 SCS(norm_sq)(&(A->x[A->p[i]]), A->p[i + 1] - A->p[i]));
+     /* M[i] = 1; */
+   }
+   cudaMemcpy(p->M, M, A->n * sizeof(scs_float), cudaMemcpyHostToDevice);
+   scs_free(M);
+
+ #if EXTRA_VERBOSE > 0
+   scs_printf("finished getting pre-conditioner\n");
+ #endif
+ }
+
+ ScsLinSysWork *SCS(init_lin_sys_work)(const ScsMatrix *A,
+                                       const ScsSettings *stgs) {
+   cudaError_t err;
+   ScsLinSysWork *p = (ScsLinSysWork *)scs_calloc(1, sizeof(ScsLinSysWork));
+   ScsGpuMatrix *Ag = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+
+   p->cublas_handle = 0;
+   p->cusparse_handle = 0;
+
+   p->total_solve_time = 0;
+   p->tot_cg_its = 0;
+
+   /* Get handle to the CUBLAS context */
+   cublasCreate(&p->cublas_handle);
+
+   /* Get handle to the CUSPARSE context */
+   cusparseCreate(&p->cusparse_handle);
+
+   Ag->n = A->n;
+   Ag->m = A->m;
+   Ag->Annz = A->p[A->n];
+   Ag->descr = 0;
+   /* Matrix description */
+   cusparseCreateMatDescr(&Ag->descr);
+   cusparseSetMatType(Ag->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+   cusparseSetMatIndexBase(Ag->descr, CUSPARSE_INDEX_BASE_ZERO);
+   p->Ag = Ag;
+   p->Agt = SCS_NULL;
+
+   cudaMalloc((void **)&Ag->i, (A->p[A->n]) * sizeof(scs_int));
+   cudaMalloc((void **)&Ag->p, (A->n + 1) * sizeof(scs_int));
+   cudaMalloc((void **)&Ag->x, (A->p[A->n]) * sizeof(scs_float));
+
+   cudaMalloc((void **)&p->p, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->r, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->Gp, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->bg, (A->n + A->m) * sizeof(scs_float));
+   cudaMalloc((void **)&p->tmp_m,
+              A->m * sizeof(scs_float)); /* intermediate result */
+   cudaMalloc((void **)&p->z, A->n * sizeof(scs_float));
+   cudaMalloc((void **)&p->M, A->n * sizeof(scs_float));
+
+   cudaMemcpy(Ag->i, A->i, (A->p[A->n]) * sizeof(scs_int),
+              cudaMemcpyHostToDevice);
+   cudaMemcpy(Ag->p, A->p, (A->n + 1) * sizeof(scs_int), cudaMemcpyHostToDevice);
+   cudaMemcpy(Ag->x, A->x, (A->p[A->n]) * sizeof(scs_float),
+              cudaMemcpyHostToDevice);
+
+   get_preconditioner(A, stgs, p);
+
+ #if GPU_TRANSPOSE_MAT > 0
+   p->Agt = (ScsGpuMatrix *)scs_malloc(sizeof(ScsGpuMatrix));
+   p->Agt->n = A->m;
+   p->Agt->m = A->n;
+   p->Agt->Annz = A->p[A->n];
+   p->Agt->descr = 0;
+   /* Matrix description */
+   cusparseCreateMatDescr(&p->Agt->descr);
+   cusparseSetMatType(p->Agt->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+   cusparseSetMatIndexBase(p->Agt->descr, CUSPARSE_INDEX_BASE_ZERO);
+
+   cudaMalloc((void **)&p->Agt->i, (A->p[A->n]) * sizeof(scs_int));
+   cudaMalloc((void **)&p->Agt->p, (A->m + 1) * sizeof(scs_int));
+   cudaMalloc((void **)&p->Agt->x, (A->p[A->n]) * sizeof(scs_float));
+   /* transpose Ag into Agt for faster multiplies */
+   /* TODO: memory intensive, could perform transpose in CPU and copy to GPU */
+   CUSPARSE(csr2csc)
+   (p->cusparse_handle, A->n, A->m, A->p[A->n], Ag->x, Ag->p, Ag->i, p->Agt->x,
+    p->Agt->i, p->Agt->p, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);
+ #endif
+
+   err = cudaGetLastError();
+   if (err != cudaSuccess) {
+     printf("%s:%d:%s\nERROR_CUDA: %s\n", __FILE__, __LINE__, __func__,
+            cudaGetErrorString(err));
+     SCS(free_lin_sys_work)(p);
+     return SCS_NULL;
+   }
+   return p;
+ }
+
+ static void apply_pre_conditioner(cublasHandle_t cublas_handle, scs_float *M,
+                                   scs_float *z, scs_float *r, scs_int n) {
+   cudaMemcpy(z, r, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+   CUBLAS(tbmv)
+   (cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n,
+    0, M, 1, z, 1);
+ }
+
+ /* solves (I+A'A)x = b, s warm start, solution stored in bg (on GPU) */
+ static scs_int pcg(const ScsGpuMatrix *A, const ScsSettings *stgs,
+                    ScsLinSysWork *pr, const scs_float *s, scs_float *bg,
+                    scs_int max_its, scs_float tol) {
+   scs_int i, n = A->n;
+   scs_float alpha, nrm_r, p_gp, neg_alpha, beta, ipzr, ipzr_old;
+   scs_float onef = 1.0, neg_onef = -1.0;
+   scs_float *p = pr->p;   /* cg direction */
+   scs_float *Gp = pr->Gp; /* updated CG direction */
+   scs_float *r = pr->r;   /* cg residual */
+   scs_float *z = pr->z;   /* preconditioned */
+   scs_float *M = pr->M;   /* preconditioner */
+   cublasHandle_t cublas_handle = pr->cublas_handle;
+
+   if (s == SCS_NULL) {
+     cudaMemcpy(r, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+     cudaMemset(bg, 0, n * sizeof(scs_float));
+   } else {
+     /* p contains bg temporarily */
+     cudaMemcpy(p, bg, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+     /* bg contains s */
+     cudaMemcpy(bg, s, n * sizeof(scs_float), cudaMemcpyHostToDevice);
+     mat_vec(A, stgs, pr, bg, r);
+     CUBLAS(axpy)(cublas_handle, n, &neg_onef, p, 1, r, 1);
+     CUBLAS(scal)(cublas_handle, n, &neg_onef, r, 1);
+   }
+
+   /* for some reason nrm2 is VERY slow */
+   /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
+   CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
+   nrm_r = SQRTF(nrm_r);
+   /* check to see if we need to run CG at all */
+   if (nrm_r < MIN(tol, 1e-18)) {
+     return 0;
+   }
+
+   apply_pre_conditioner(cublas_handle, M, z, r, n);
+   CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+   /* put z in p, replacing temp mem */
+   cudaMemcpy(p, z, n * sizeof(scs_float), cudaMemcpyDeviceToDevice);
+
+   for (i = 0; i < max_its; ++i) {
+     mat_vec(A, stgs, pr, p, Gp);
+
+     CUBLAS(dot)(cublas_handle, n, p, 1, Gp, 1, &p_gp);
+
+     alpha = ipzr / p_gp;
+     neg_alpha = -alpha;
+
+     CUBLAS(axpy)(cublas_handle, n, &alpha, p, 1, bg, 1);
+     CUBLAS(axpy)(cublas_handle, n, &neg_alpha, Gp, 1, r, 1);
+
+     /* for some reason nrm2 is VERY slow */
+     /* CUBLAS(nrm2)(cublas_handle, n, r, 1, &nrm_r); */
+     CUBLAS(dot)(cublas_handle, n, r, 1, r, 1, &nrm_r);
+     nrm_r = SQRTF(nrm_r);
+     if (nrm_r < tol) {
+       i++;
+       break;
+     }
+     ipzr_old = ipzr;
+     apply_pre_conditioner(cublas_handle, M, z, r, n);
+     CUBLAS(dot)(cublas_handle, n, r, 1, z, 1, &ipzr);
+
+     beta = ipzr / ipzr_old;
+     CUBLAS(scal)(cublas_handle, n, &beta, p, 1);
+     CUBLAS(axpy)(cublas_handle, n, &onef, z, 1, p, 1);
+   }
+ #if EXTRA_VERBOSE > 0
+   scs_printf("tol: %.4e, resid: %.4e, iters: %li\n", tol, nrm_r, (long)i + 1);
+ #endif
+   return i;
+ }
+
+ scs_int SCS(solve_lin_sys)(const ScsMatrix *A, const ScsSettings *stgs,
+                            ScsLinSysWork *p, scs_float *b, const scs_float *s,
+                            scs_int iter) {
+   scs_int cg_its;
+   SCS(timer) linsys_timer;
+   scs_float *bg = p->bg;
+   scs_float neg_onef = -1.0;
+   ScsGpuMatrix *Ag = p->Ag;
+   scs_float cg_tol =
+       SCS(norm)(b, Ag->n) *
+       (iter < 0 ? CG_BEST_TOL
+                 : CG_MIN_TOL / POWF((scs_float)iter + 1., stgs->cg_rate));
+   SCS(tic)(&linsys_timer);
+   /* all on GPU */
+   cudaMemcpy(bg, b, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyHostToDevice);
+   SCS(_accum_by_atrans_gpu)(Ag, &(bg[Ag->n]), bg, p->cusparse_handle);
+   /* solves (I+A'A)x = b, s warm start, solution stored in b */
+   cg_its = pcg(p->Ag, stgs, p, s, bg, Ag->n, MAX(cg_tol, CG_BEST_TOL));
+   CUBLAS(scal)(p->cublas_handle, Ag->m, &neg_onef, &(bg[Ag->n]), 1);
+   SCS(_accum_by_a_gpu)(Ag, bg, &(bg[Ag->n]), p->cusparse_handle);
+   cudaMemcpy(b, bg, (Ag->n + Ag->m) * sizeof(scs_float), cudaMemcpyDeviceToHost);
+
+   if (iter >= 0) {
+     p->tot_cg_its += cg_its;
+   }
+
+   p->total_solve_time += SCS(tocq)(&linsys_timer);
+ #if EXTRA_VERBOSE > 0
+   scs_printf("linsys solve time: %1.2es\n", SCS(tocq)(&linsys_timer) / 1e3);
+ #endif
+   return 0;
+ }
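Reading solve_lin_sys together with pcg: the routine block-eliminates the indirect system [rho_x * I, A'; A, -I][x; y] = [b1; b2], first forming b1 + A'b2 (the accum_by_atrans call), then CG-solving (rho_x * I + A'A)x = b1 + A'b2 to tolerance cg_tol ~ ||b1|| * CG_MIN_TOL / (iter+1)^cg_rate, and finally recovering y = Ax - b2 (the scal by -1 followed by accum_by_a). Below is a plain-C reference of the same Jacobi-preconditioned CG recurrence, with a dense matvec standing in for the cuSPARSE kernels; pcg_ref, MAX_N, and the 2x2 system are illustrative assumptions, not SCS code:

/* pcg_ref.c: CPU reference for the preconditioned CG loop above. */
#include <math.h>
#include <stdio.h>

#define MAX_N 16 /* sketch-only cap so buffers can live on the stack */

/* Solves G x = b for symmetric positive definite G (dense, row-major),
   with the Jacobi preconditioner M = inv(diag(G)), mirroring pcg(). */
static int pcg_ref(int n, const double *G, const double *b, double *x,
                   int max_its, double tol) {
  double r[MAX_N], z[MAX_N], p[MAX_N], Gp[MAX_N], M[MAX_N];
  double ipzr = 0.0;
  int i, j, k;
  for (i = 0; i < n; ++i) {
    M[i] = 1.0 / G[i * n + i]; /* cf. get_preconditioner() */
    x[i] = 0.0;                /* cold start: r = b - G*0 = b */
    r[i] = b[i];
    z[i] = M[i] * r[i];
    p[i] = z[i];
    ipzr += r[i] * z[i];
  }
  for (k = 0; k < max_its; ++k) {
    double pGp = 0.0, nrm_r = 0.0, alpha, beta, ipzr_old;
    for (i = 0; i < n; ++i) { /* Gp = G * p (mat_vec() on the GPU) */
      Gp[i] = 0.0;
      for (j = 0; j < n; ++j) Gp[i] += G[i * n + j] * p[j];
      pGp += p[i] * Gp[i];
    }
    alpha = ipzr / pGp;
    for (i = 0; i < n; ++i) {
      x[i] += alpha * p[i];  /* axpy onto the iterate */
      r[i] -= alpha * Gp[i]; /* axpy onto the residual */
      nrm_r += r[i] * r[i];
    }
    if (sqrt(nrm_r) < tol) return k + 1;
    ipzr_old = ipzr;
    ipzr = 0.0;
    for (i = 0; i < n; ++i) { /* re-apply the preconditioner */
      z[i] = M[i] * r[i];
      ipzr += r[i] * z[i];
    }
    beta = ipzr / ipzr_old;
    for (i = 0; i < n; ++i) p[i] = z[i] + beta * p[i]; /* scal + axpy */
  }
  return k;
}

int main(void) {
  const double G[] = {4.0, 1.0, 1.0, 3.0}; /* SPD 2x2 */
  const double b[] = {1.0, 2.0};
  double x[2];
  int its = pcg_ref(2, G, b, x, 100, 1e-9);
  printf("its = %d, x = [%g, %g]\n", its, x[0], x[1]); /* ~[1/11, 7/11] */
  return 0;
}

In exact arithmetic CG converges in at most n steps, so this 2x2 system finishes in at most two iterations; the GPU version instead caps iterations at Ag->n and loosens tol early on, since approximate solves suffice for the outer SCS iterations.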