RubyGems - scs - Versions diffs - 0.2.2 → 0.3.2 - Mend

scs 0.2.2 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (103) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +16 -0
data/LICENSE.txt +18 -18
data/README.md +19 -14
data/lib/scs/ffi.rb +31 -20
data/lib/scs/solver.rb +32 -9
data/lib/scs/version.rb +1 -1
data/vendor/scs/CITATION.cff +39 -0
data/vendor/scs/CMakeLists.txt +320 -0
data/vendor/scs/Makefile +32 -23
data/vendor/scs/README.md +9 -218
data/vendor/scs/include/aa.h +67 -23
data/vendor/scs/include/cones.h +22 -19
data/vendor/scs/include/glbopts.h +107 -79
data/vendor/scs/include/linalg.h +3 -4
data/vendor/scs/include/linsys.h +58 -44
data/vendor/scs/include/normalize.h +6 -5
data/vendor/scs/include/rw.h +8 -2
data/vendor/scs/include/scs.h +257 -141
data/vendor/scs/include/scs_types.h +34 -0
data/vendor/scs/include/scs_work.h +83 -0
data/vendor/scs/include/util.h +3 -15
data/vendor/scs/linsys/cpu/direct/private.c +241 -232
data/vendor/scs/linsys/cpu/direct/private.h +13 -7
data/vendor/scs/linsys/cpu/indirect/private.c +194 -118
data/vendor/scs/linsys/cpu/indirect/private.h +7 -4
data/vendor/scs/linsys/csparse.c +87 -0
data/vendor/scs/linsys/csparse.h +34 -0
data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +6 -6
data/vendor/scs/linsys/external/amd/SuiteSparse_config.h +6 -1
data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
data/vendor/scs/linsys/external/amd/amd_order.c +5 -5
data/vendor/scs/linsys/external/qdldl/changes +2 -0
data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
data/vendor/scs/linsys/gpu/gpu.c +58 -21
data/vendor/scs/linsys/gpu/gpu.h +70 -35
data/vendor/scs/linsys/gpu/indirect/private.c +394 -157
data/vendor/scs/linsys/gpu/indirect/private.h +27 -12
data/vendor/scs/linsys/scs_matrix.c +478 -0
data/vendor/scs/linsys/scs_matrix.h +70 -0
data/vendor/scs/scs.mk +14 -10
data/vendor/scs/src/aa.c +394 -110
data/vendor/scs/src/cones.c +497 -359
data/vendor/scs/src/ctrlc.c +15 -5
data/vendor/scs/src/linalg.c +107 -26
data/vendor/scs/src/normalize.c +30 -72
data/vendor/scs/src/rw.c +202 -27
data/vendor/scs/src/scs.c +769 -571
data/vendor/scs/src/scs_version.c +11 -3
data/vendor/scs/src/util.c +37 -106
data/vendor/scs/test/minunit.h +22 -8
data/vendor/scs/test/problem_utils.h +180 -25
data/vendor/scs/test/problems/degenerate.h +130 -0
data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
data/vendor/scs/test/problems/random_prob +0 -0
data/vendor/scs/test/problems/random_prob.h +45 -0
data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
data/vendor/scs/test/problems/small_lp.h +14 -13
data/vendor/scs/test/problems/small_qp.h +352 -0
data/vendor/scs/test/problems/test_validation.h +43 -0
data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
data/vendor/scs/test/random_socp_prob.c +54 -53
data/vendor/scs/test/rng.h +109 -0
data/vendor/scs/test/run_from_file.c +20 -11
data/vendor/scs/test/run_tests.c +35 -2
metadata +29 -98
data/vendor/scs/linsys/amatrix.c +0 -305
data/vendor/scs/linsys/amatrix.h +0 -36
data/vendor/scs/linsys/amatrix.o +0 -0
data/vendor/scs/linsys/cpu/direct/private.o +0 -0
data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
data/vendor/scs/src/aa.o +0 -0
data/vendor/scs/src/cones.o +0 -0
data/vendor/scs/src/ctrlc.o +0 -0
data/vendor/scs/src/linalg.o +0 -0
data/vendor/scs/src/normalize.o +0 -0
data/vendor/scs/src/rw.o +0 -0
data/vendor/scs/src/scs.o +0 -0
data/vendor/scs/src/scs_version.o +0 -0
data/vendor/scs/src/util.o +0 -0
data/vendor/scs/test/data/small_random_socp +0 -0
data/vendor/scs/test/problems/small_random_socp.h +0 -33
data/vendor/scs/test/run_tests +0 -2

data/vendor/scs/linsys/external/qdldl/changes CHANGED Viewed

@@ -1,3 +1,5 @@
+Last qdldl commit: a00d500906621fbf014b39e42a3304d1143eb65f
 flatten into one dir
 create qdldl_types.h from template file
 add 'include "glbopts.h"' to qdldl_types.h

data/vendor/scs/linsys/external/qdldl/qdldl.c CHANGED Viewed

@@ -1,35 +1,9 @@
 #include "qdldl.h"
-#include "ctrlc.h"
 #define QDLDL_UNKNOWN (-1)
 #define QDLDL_USED (1)
 #define QDLDL_UNUSED (0)
-// //DEBUG
-// #include <stdio.h>
-// void qdprint_arrayi(const QDLDL_int* data, QDLDL_int n,char* varName){
-//   QDLDL_int i;
-//   printf("%s = [",varName);
-//   for(i=0; i< n; i++){
-//     printf("%lli,",data[i]);
-//   }
-//   printf("]\n");
-// }
-// void qdprint_arrayf(const QDLDL_float* data, QDLDL_int n, char* varName){
-//   QDLDL_int i;
-//   printf("%s = [",varName);
-//   for(i=0; i< n; i++){
-//     printf("%.3g,",data[i]);
-//   }
-//   printf("]\n");
-// }
-// // END DEBUG
 /* Compute the elimination tree for a quasidefinite matrix
    in compressed sparse column form.
 */
@@ -41,7 +15,7 @@ QDLDL_int QDLDL_etree(const QDLDL_int  n,
                       QDLDL_int* Lnz,
                       QDLDL_int* etree){
-  QDLDL_int sumLnz = 0;
+  QDLDL_int sumLnz;
   QDLDL_int i,j,p;
@@ -76,8 +50,19 @@ QDLDL_int QDLDL_etree(const QDLDL_int  n,
   }
   //compute the total nonzeros in L.  This much
-  //space is required to store Li and Lx
-  for(i = 0; i < n; i++){sumLnz += Lnz[i];}
+  //space is required to store Li and Lx.  Return
+  //error code -2 if the nonzero count will overflow
+  //its integer type.
+  sumLnz  = 0;
+  for(i = 0; i < n; i++){
+    if(sumLnz > QDLDL_INT_MAX - Lnz[i]){
+      sumLnz = -2;
+      break;
+    }
+    else{
+      sumLnz += Lnz[i];
+    }
+  }
   return sumLnz;
 }
@@ -139,10 +124,6 @@ QDLDL_int QDLDL_factor(const QDLDL_int    n,
   //Start from 1 here. The upper LH corner is trivially 0
   //in L b/c we are only computing the subdiagonal elements
   for(k = 1; k < n; k++){
-    if(scs_is_interrupted()) {
-      scs_printf("interrupt detected in factorization\n");
-      return -1;
-    }
     //NB : For each k, we compute a solution to
     //y = L(0:(k-1),0:k-1))\b, where b is the kth
@@ -258,11 +239,12 @@ void QDLDL_Lsolve(const QDLDL_int    n,
                   const QDLDL_float* Lx,
                   QDLDL_float* x){
-QDLDL_int i,j;
+  QDLDL_int i,j;
   for(i = 0; i < n; i++){
-      for(j = Lp[i]; j < Lp[i+1]; j++){
-          x[Li[j]] -= Lx[j]*x[i];
-      }
+    QDLDL_float val = x[i];
+    for(j = Lp[i]; j < Lp[i+1]; j++){
+      x[Li[j]] -= Lx[j]*val;
+    }
   }
 }
@@ -273,11 +255,13 @@ void QDLDL_Ltsolve(const QDLDL_int    n,
                    const QDLDL_float* Lx,
                    QDLDL_float* x){
-QDLDL_int i,j;
+  QDLDL_int i,j;
   for(i = n-1; i>=0; i--){
-      for(j = Lp[i]; j < Lp[i+1]; j++){
-          x[i] -= Lx[j]*x[Li[j]];
-      }
+    QDLDL_float val = x[i];
+    for(j = Lp[i]; j < Lp[i+1]; j++){
+      val -= Lx[j]*x[Li[j]];
+    }
+    x[i] = val;
   }
 }
@@ -289,10 +273,9 @@ void QDLDL_solve(const QDLDL_int       n,
                     const QDLDL_float* Dinv,
                     QDLDL_float* x){
-QDLDL_int i;
-QDLDL_Lsolve(n,Lp,Li,Lx,x);
-for(i = 0; i < n; i++) x[i] *= Dinv[i];
-QDLDL_Ltsolve(n,Lp,Li,Lx,x);
+  QDLDL_int i;
+  QDLDL_Lsolve(n,Lp,Li,Lx,x);
+  for(i = 0; i < n; i++) x[i] *= Dinv[i];
+  QDLDL_Ltsolve(n,Lp,Li,Lx,x);
 }

data/vendor/scs/linsys/external/qdldl/qdldl.h CHANGED Viewed

@@ -32,19 +32,17 @@ extern "C" {
   * this function will *not* return an error, as it may still be possible to factor
   * such a matrix in LDL form.   No promises are made in this case though...
   *
-  * @param   n     number of columns in CSC matrix A (assumed square)
+  * @param  n      number of columns in CSC matrix A (assumed square)
   * @param  Ap     column pointers (size n+1) for columns of A
   * @param  Ai     row indices of A.  Has Ap[n] elements
   * @param  work   work vector (size n) (no meaning on return)
   * @param  Lnz    count of nonzeros in each column of L (size n) below diagonal
   * @param  etree  elimination tree (size n)
-  * @return total  sum of Lnz (i.e. total nonzeros in L below diagonal). Returns
-  *                -1 if the input does not have triu structure or has an empty
-  *                column.
-  *
+  * @return total  sum of Lnz (i.e. total nonzeros in L below diagonal).
+  *                Returns -1 if the input is not triu or has an empty column.
+  *                Returns -2 if the return value overflows QDLDL_int.
   *
 */
  QDLDL_int QDLDL_etree(const QDLDL_int   n,
                        const QDLDL_int* Ap,
                        const QDLDL_int* Ai,
@@ -52,6 +50,7 @@ extern "C" {
                        QDLDL_int* Lnz,
                        QDLDL_int* etree);
 /**
   * Compute an LDL decomposition for a quasidefinite matrix
   * in compressed sparse column form, where the input matrix is
@@ -61,21 +60,22 @@ extern "C" {
   * Returns factors L, D and Dinv = 1./D.
   *
   * Does not use MALLOC.  It is assumed that L will be a compressed
-  * sparse column matrix with data (Ln,Lp,Li)  with sufficient space
+  * sparse column matrix with data (n,Lp,Li,Lx)  with sufficient space
   * allocated, with a number of nonzeros equal to the count given
-  * as a return value by osqp_ldl_etree
-  *
-  * @param   n     number of columns in L and A (both square)
-  * @param  Ap     column pointers (size n+1) for columns of A
-  * @param  Ai     row indices of A.  Has Ap[n] elements
-  * @param  Ln     number of columns in CSC matrix L
-  * @param  Lp     column pointers (size Ln+1) for columns of L
-  * @param  Li     row indices of L.  Has Lp[Ln] elements
+  * as a return value by QDLDL_etree
+  *
+  * @param  n      number of columns in L and A (both square)
+  * @param  Ap     column pointers (size n+1) for columns of A (not modified)
+  * @param  Ai     row indices of A.  Has Ap[n] elements (not modified)
+  * @param  Ax     data of A.  Has Ap[n] elements (not modified)
+  * @param  Lp     column pointers (size n+1) for columns of L
+  * @param  Li     row indices of L.  Has Lp[n] elements
+  * @param  Lx     data of L.  Has Lp[n] elements
   * @param  D      vectorized factor D.  Length is n
   * @param  Dinv   reciprocal of D.  Length is n
   * @param  Lnz    count of nonzeros in each column of L below diagonal,
-  *                as given by osqp_ldl_etree (not modified)
-  * @param  etree  elimination tree as as given by osqp_ldl_etree (not modified)
+  *                as given by QDLDL_etree (not modified)
+  * @param  etree  elimination tree as given by QDLDL_etree (not modified)
   * @param  bwork  working array of bools. Length is n
   * @param  iwork  working array of integers. Length is 3*n
   * @param  fwork  working array of floats. Length is n
@@ -85,8 +85,6 @@ extern "C" {
   *                or otherwise LDL factorisable)
   *
 */
 QDLDL_int QDLDL_factor(const QDLDL_int    n,
                   const QDLDL_int*   Ap,
                   const QDLDL_int*   Ai,
@@ -107,16 +105,15 @@ QDLDL_int QDLDL_factor(const QDLDL_int    n,
   * Solves LDL'x = b
   *
   * It is assumed that L will be a compressed
-  * sparse column matrix with data (Ln,Lp,Li).
+  * sparse column matrix with data (n,Lp,Li,Lx).
   *
-  * @param   n     number of columns in L (both square)
-  * @param  Ln     number of columns in CSC matrix L
-  * @param  Lp     column pointers (size Ln+1) for columns of L
-  * @param  Li     row indices of L.  Has Lp[Ln] elements
+  * @param  n      number of columns in L
+  * @param  Lp     column pointers (size n+1) for columns of L
+  * @param  Li     row indices of L.  Has Lp[n] elements
+  * @param  Lx     data of L.  Has Lp[n] elements
   * @param  Dinv   reciprocal of D.  Length is n
   * @param  x      initialized to b.  Equal to x on return
   *
-  *
 */
 void QDLDL_solve(const QDLDL_int    n,
                  const QDLDL_int*   Lp,
@@ -130,40 +127,35 @@ void QDLDL_solve(const QDLDL_int    n,
  * Solves (L+I)x = b
  *
  * It is assumed that L will be a compressed
- * sparse column matrix with data (Ln,Lp,Li).
+ * sparse column matrix with data (n,Lp,Li,Lx).
  *
- * @param   n     number of columns in L (both square)
- * @param  Ln     number of columns in CSC matrix L
- * @param  Lp     column pointers (size Ln+1) for columns of L
- * @param  Li     row indices of L.  Has Lp[Ln] elements
- * @param  Dinv   reciprocal of D.  Length is n
+ * @param  n      number of columns in L
+ * @param  Lp     column pointers (size n+1) for columns of L
+ * @param  Li     row indices of L.  Has Lp[n] elements
+ * @param  Lx     data of L.  Has Lp[n] elements
  * @param  x      initialized to b.  Equal to x on return
  *
- *
 */
 void QDLDL_Lsolve(const QDLDL_int    n,
                   const QDLDL_int*   Lp,
                   const QDLDL_int*   Li,
                   const QDLDL_float* Lx,
                   QDLDL_float* x);
 /**
  * Solves (L+I)'x = b
  *
  * It is assumed that L will be a compressed
- * sparse column matrix with data (Ln,Lp,Li).
+ * sparse column matrix with data (n,Lp,Li,Lx).
  *
- * @param   n     number of columns in L (both square)
- * @param  Ln     number of columns in CSC matrix L
- * @param  Lp     column pointers (size Ln+1) for columns of L
- * @param  Li     row indices of L.  Has Lp[Ln] elements
- * @param  Dinv   reciprocal of D.  Length is n
+ * @param  n      number of columns in L
+ * @param  Lp     column pointers (size n+1) for columns of L
+ * @param  Li     row indices of L.  Has Lp[n] elements
+ * @param  Lx     data of L.  Has Lp[n] elements
  * @param  x      initialized to b.  Equal to x on return
  *
- *
 */
 void QDLDL_Ltsolve(const QDLDL_int    n,
                    const QDLDL_int*   Lp,
                    const QDLDL_int*   Li,

data/vendor/scs/linsys/external/qdldl/qdldl_types.h CHANGED Viewed

@@ -1,18 +1,26 @@
 #ifndef QDLDL_TYPES_H
 # define QDLDL_TYPES_H
-#include "glbopts.h"
 # ifdef __cplusplus
 extern "C" {
 # endif /* ifdef __cplusplus */
-// QDLDL integer and float types
+#include "glbopts.h"
+#include <limits.h> //for INT_MAX / LLONG_MAX used by QDLDL_INT_MAX
+/* QDLDL integer and float types */
 #define QDLDL_int scs_int
 #define QDLDL_float scs_float
 #define QDLDL_bool scs_int
+/* Maximum value of the signed type QDLDL_int */
+#ifdef DLONG
+#define QDLDL_INT_MAX LLONG_MAX
+#else
+#define QDLDL_INT_MAX INT_MAX
+#endif
 # ifdef __cplusplus
 }
 # endif /* ifdef __cplusplus */

data/vendor/scs/linsys/gpu/gpu.c CHANGED Viewed

@@ -1,41 +1,78 @@
 #include "gpu.h"
-void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
-                               scs_float *y, cusparseHandle_t cusparse_handle) {
+void SCS(accum_by_atrans_gpu)(const ScsGpuMatrix *Ag,
+                              const cusparseDnVecDescr_t x,
+                              cusparseDnVecDescr_t y,
+                              cusparseHandle_t cusparse_handle,
+                              size_t *buffer_size, void **buffer) {
   /* y += A'*x
      x and y MUST be on GPU already
   */
   const scs_float onef = 1.0;
-  CUSPARSE(csrmv)
-  (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, Ag->n, Ag->m, Ag->Annz,
-   &onef, Ag->descr, Ag->x, Ag->p, Ag->i, x, &onef, y);
+  size_t new_buffer_size = 0;
+  CUSPARSE_GEN(SpMV_bufferSize)
+  (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &onef, Ag->descr, x,
+   &onef, y, SCS_CUDA_FLOAT, SCS_CSRMV_ALG, &new_buffer_size);
+  if (new_buffer_size > *buffer_size) {
+    if (*buffer != SCS_NULL) {
+      cudaFree(*buffer);
+    }
+    cudaMalloc(buffer, *buffer_size);
+    *buffer_size = new_buffer_size;
+  }
+  CUSPARSE_GEN(SpMV)
+  (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &onef, Ag->descr, x,
+   &onef, y, SCS_CUDA_FLOAT, SCS_CSRMV_ALG, buffer);
 }
-void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *Ag, const scs_float *x,
-                          scs_float *y, cusparseHandle_t cusparse_handle) {
+/* this is slow, use trans routine if possible */
+void SCS(accum_by_a_gpu)(const ScsGpuMatrix *Ag, const cusparseDnVecDescr_t x,
+                         cusparseDnVecDescr_t y,
+                         cusparseHandle_t cusparse_handle, size_t *buffer_size,
+                         void **buffer) {
   /* y += A*x
      x and y MUST be on GPU already
    */
   const scs_float onef = 1.0;
+  size_t new_buffer_size = 0;
   /* The A matrix idx pointers must be ORDERED */
-  CUSPARSE(csrmv)
-  (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, Ag->n, Ag->m, Ag->Annz, &onef,
-   Ag->descr, Ag->x, Ag->p, Ag->i, x, &onef, y);
+  CUSPARSE_GEN(SpMV_bufferSize)
+  (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, &onef, Ag->descr, x, &onef, y,
+   SCS_CUDA_FLOAT, SCS_CSRMV_ALG, &new_buffer_size);
+  if (new_buffer_size > *buffer_size) {
+    if (*buffer != SCS_NULL) {
+      cudaFree(*buffer);
+    }
+    cudaMalloc(buffer, *buffer_size);
+    *buffer_size = new_buffer_size;
+  }
+  CUSPARSE_GEN(SpMV)
+  (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, &onef, Ag->descr, x, &onef, y,
+   SCS_CUDA_FLOAT, SCS_CSRMV_ALG, buffer);
+}
+/* This assumes that P has been made full (ie not triangular) and uses the
+ * fact that the GPU is faster for general sparse matrices than for symmetric
+ */
+/* y += P*x
+   x and y MUST be on GPU already
+ */
+void SCS(accum_by_p_gpu)(const ScsGpuMatrix *Pg, const cusparseDnVecDescr_t x,
+                         cusparseDnVecDescr_t y,
+                         cusparseHandle_t cusparse_handle, size_t *buffer_size,
+                         void **buffer) {
+  SCS(accum_by_atrans_gpu)(Pg, x, y, cusparse_handle, buffer_size, buffer);
 }
 void SCS(free_gpu_matrix)(ScsGpuMatrix *A) {
   cudaFree(A->x);
   cudaFree(A->i);
   cudaFree(A->p);
-  cusparseDestroyMatDescr(A->descr);
-}
-void SCS(normalize_a)(ScsMatrix *A, const ScsSettings *stgs, const ScsCone *k,
-                      ScsScaling *scal) {
-  SCS(_normalize_a)(A, stgs, k, scal);
-}
-void SCS(un_normalize_a)(ScsMatrix *A, const ScsSettings *stgs,
-                         const ScsScaling *scal) {
-  SCS(_un_normalize_a)(A, stgs, scal);
+  cusparseDestroySpMat(A->descr);
 }

data/vendor/scs/linsys/gpu/gpu.h CHANGED Viewed

@@ -1,57 +1,82 @@
-#ifndef SCSGPU_H_GUARD
-#define SCSGPU_H_GUARD
+#ifndef SCS_GPU_H_GUARD
+#define SCS_GPU_H_GUARD
 #ifdef __cplusplus
 extern "C" {
 #endif
-#include <cublas_v2.h>
+/* TODO: Do we need this?
 #include <cuda.h>
+*/
+#include <cublas_v2.h>
 #include <cuda_runtime_api.h>
 #include <cusparse.h>
-#include "amatrix.h"
 #include "glbopts.h"
 #include "linalg.h"
 #include "linsys.h"
 #include "scs.h"
+#include "scs_matrix.h"
 #include "util.h"
-#define CUDA_CHECK_ERR                                                    \
-  do {                                                                    \
-    cudaError_t err = cudaGetLastError();                                 \
-    if (err != cudaSuccess) {                                             \
-      printf("%s:%d:%s\n ERROR_CUDA: %s\n", __FILE__, __LINE__, __func__, \
-             cudaGetErrorString(err));                                    \
-    }                                                                     \
+#define CUDA_CHECK_ERR                                                         \
+  do {                                                                         \
+    cudaDeviceSynchronize();                                                   \
+    cudaError_t err = cudaGetLastError();                                      \
+    if (err != cudaSuccess) {                                                  \
+      scs_printf("%s:%d:%s\n ERROR_CUDA (#): %s\n", __FILE__, __LINE__,        \
+                 __func__, cudaGetErrorString(err));                           \
+    }                                                                          \
   } while (0)
-#ifndef EXTRA_VERBOSE
+#if VERBOSITY == 0
 #ifndef SFLOAT
 #define CUBLAS(x) cublasD##x
-#define CUSPARSE(x) cusparseD##x
+#define CUBLASI(x) cublasId##x
 #else
 #define CUBLAS(x) cublasS##x
-#define CUSPARSE(x) cusparseS##x
+#define CUBLASI(x) cublasIs##x
 #endif
+#define CUSPARSE_GEN(x) cusparse##x
 #else
 #ifndef SFLOAT
-#define CUBLAS(x) \
-  CUDA_CHECK_ERR; \
+#define CUBLAS(x)                                                              \
+  CUDA_CHECK_ERR;                                                              \
   cublasD##x
-#define CUSPARSE(x) \
-  CUDA_CHECK_ERR;   \
-  cusparseD##x
+#define CUBLASI(x)                                                             \
+  CUDA_CHECK_ERR;                                                              \
+  cublasId##x
 #else
-#define CUBLAS(x) \
-  CUDA_CHECK_ERR; \
+#define CUBLAS(x)                                                              \
+  CUDA_CHECK_ERR;                                                              \
   cublasS##x
-#define CUSPARSE(x) \
-  CUDA_CHECK_ERR;   \
-  cusparseS##x
+#define CUBLASI(x)                                                             \
+  CUDA_CHECK_ERR;                                                              \
+  cublasIs##x
 #endif
+#define CUSPARSE_GEN(x)                                                        \
+  CUDA_CHECK_ERR;                                                              \
+  cusparse##x
 #endif
+#ifndef SFLOAT
+#define SCS_CUDA_FLOAT CUDA_R_64F
+#else
+#define SCS_CUDA_FLOAT CUDA_R_32F
+#endif
+#ifndef DLONG
+#define SCS_CUSPARSE_INDEX CUSPARSE_INDEX_32I
+#else
+#define SCS_CUSPARSE_INDEX CUSPARSE_INDEX_64I
+#endif
+#define SCS_CSRMV_ALG CUSPARSE_CSRMV_ALG1
+#define SCS_CSR2CSC_ALG CUSPARSE_CSR2CSC_ALG1
 /*
  CUDA matrix routines only for CSR, not CSC matrices:
     CSC             CSR             GPU     Mult
@@ -59,23 +84,33 @@ extern "C" {
     A'(n x m)       A  (m x n)      Agt     accum_by_a_gpu
 */
-/* this struct defines the data matrix A on GPU */
-typedef struct SCS_GPU_A_DATA_MATRIX {
+/* this struct defines the data matrix on GPU */
+typedef struct SCS_GPU_DATA_MATRIX {
   /* A is supplied in column compressed format */
-  scs_float *x; /* A values, size: NNZ A */
-  scs_int *i;   /* A row index, size: NNZ A */
-  scs_int *p;   /* A column pointer, size: n+1 */
+  scs_float *x; /* values, size: NNZ */
+  scs_int *i;   /* row index, size: NNZ */
+  scs_int *p;   /* column pointer, size: n+1 */
   scs_int m, n; /* m rows, n cols */
-  scs_int Annz; /* num non-zeros in A matrix */
+  scs_int nnz;  /* num non-zeros in matrix */
   /* CUDA */
-  cusparseMatDescr_t descr;
+  cusparseSpMatDescr_t descr;
 } ScsGpuMatrix;
-void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *A, const scs_float *x,
-                               scs_float *y, cusparseHandle_t cusparse_handle);
+void SCS(accum_by_atrans_gpu)(const ScsGpuMatrix *A,
+                              const cusparseDnVecDescr_t x,
+                              cusparseDnVecDescr_t y,
+                              cusparseHandle_t cusparse_handle,
+                              size_t *buffer_size, void **buffer);
+void SCS(accum_by_a_gpu)(const ScsGpuMatrix *A, const cusparseDnVecDescr_t x,
+                         cusparseDnVecDescr_t y,
+                         cusparseHandle_t cusparse_handle, size_t *buffer_size,
+                         void **buffer);
-void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *A, const scs_float *x,
-                          scs_float *y, cusparseHandle_t cusparse_handle);
+void SCS(accum_by_p_gpu)(const ScsGpuMatrix *P, const cusparseDnVecDescr_t x,
+                         cusparseDnVecDescr_t y,
+                         cusparseHandle_t cusparse_handle, size_t *buffer_size,
+                         void **buffer);
 void SCS(free_gpu_matrix)(ScsGpuMatrix *A);