RubyGems - scs - Versions diffs - 0.2.3 → 0.3.0 - Mend

scs 0.2.3 → 0.3.0

Files changed (100) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/README.md +11 -6
data/lib/scs/ffi.rb +30 -13
data/lib/scs/solver.rb +32 -9
data/lib/scs/version.rb +1 -1
data/vendor/scs/CITATION.cff +39 -0
data/vendor/scs/CMakeLists.txt +7 -8
data/vendor/scs/Makefile +24 -15
data/vendor/scs/README.md +5 -263
data/vendor/scs/include/aa.h +67 -23
data/vendor/scs/include/cones.h +17 -17
data/vendor/scs/include/glbopts.h +98 -32
data/vendor/scs/include/linalg.h +2 -4
data/vendor/scs/include/linsys.h +58 -44
data/vendor/scs/include/normalize.h +3 -3
data/vendor/scs/include/rw.h +8 -2
data/vendor/scs/include/scs.h +293 -133
data/vendor/scs/include/util.h +3 -15
data/vendor/scs/linsys/cpu/direct/private.c +220 -224
data/vendor/scs/linsys/cpu/direct/private.h +13 -7
data/vendor/scs/linsys/cpu/direct/private.o +0 -0
data/vendor/scs/linsys/cpu/indirect/private.c +177 -110
data/vendor/scs/linsys/cpu/indirect/private.h +8 -4
data/vendor/scs/linsys/cpu/indirect/private.o +0 -0
data/vendor/scs/linsys/csparse.c +87 -0
data/vendor/scs/linsys/csparse.h +34 -0
data/vendor/scs/linsys/csparse.o +0 -0
data/vendor/scs/linsys/external/amd/SuiteSparse_config.c +1 -1
data/vendor/scs/linsys/external/amd/SuiteSparse_config.o +0 -0
data/vendor/scs/linsys/external/amd/amd_1.o +0 -0
data/vendor/scs/linsys/external/amd/amd_2.o +0 -0
data/vendor/scs/linsys/external/amd/amd_aat.o +0 -0
data/vendor/scs/linsys/external/amd/amd_control.o +0 -0
data/vendor/scs/linsys/external/amd/amd_defaults.o +0 -0
data/vendor/scs/linsys/external/amd/amd_dump.o +0 -0
data/vendor/scs/linsys/external/amd/amd_global.o +0 -0
data/vendor/scs/linsys/external/amd/amd_info.o +0 -0
data/vendor/scs/linsys/external/amd/amd_internal.h +1 -1
data/vendor/scs/linsys/external/amd/amd_order.o +0 -0
data/vendor/scs/linsys/external/amd/amd_post_tree.o +0 -0
data/vendor/scs/linsys/external/amd/amd_postorder.o +0 -0
data/vendor/scs/linsys/external/amd/amd_preprocess.o +0 -0
data/vendor/scs/linsys/external/amd/amd_valid.o +0 -0
data/vendor/scs/linsys/external/qdldl/changes +2 -0
data/vendor/scs/linsys/external/qdldl/qdldl.c +29 -46
data/vendor/scs/linsys/external/qdldl/qdldl.h +33 -41
data/vendor/scs/linsys/external/qdldl/qdldl.o +0 -0
data/vendor/scs/linsys/external/qdldl/qdldl_types.h +11 -3
data/vendor/scs/linsys/gpu/gpu.c +31 -33
data/vendor/scs/linsys/gpu/gpu.h +48 -31
data/vendor/scs/linsys/gpu/indirect/private.c +338 -232
data/vendor/scs/linsys/gpu/indirect/private.h +23 -14
data/vendor/scs/linsys/scs_matrix.c +498 -0
data/vendor/scs/linsys/scs_matrix.h +70 -0
data/vendor/scs/linsys/scs_matrix.o +0 -0
data/vendor/scs/scs.mk +13 -9
data/vendor/scs/src/aa.c +384 -109
data/vendor/scs/src/aa.o +0 -0
data/vendor/scs/src/cones.c +440 -353
data/vendor/scs/src/cones.o +0 -0
data/vendor/scs/src/ctrlc.c +15 -5
data/vendor/scs/src/ctrlc.o +0 -0
data/vendor/scs/src/linalg.c +84 -28
data/vendor/scs/src/linalg.o +0 -0
data/vendor/scs/src/normalize.c +22 -64
data/vendor/scs/src/normalize.o +0 -0
data/vendor/scs/src/rw.c +160 -21
data/vendor/scs/src/rw.o +0 -0
data/vendor/scs/src/scs.c +767 -563
data/vendor/scs/src/scs.o +0 -0
data/vendor/scs/src/scs_indir.o +0 -0
data/vendor/scs/src/scs_version.c +9 -3
data/vendor/scs/src/scs_version.o +0 -0
data/vendor/scs/src/util.c +37 -106
data/vendor/scs/src/util.o +0 -0
data/vendor/scs/test/minunit.h +17 -8
data/vendor/scs/test/problem_utils.h +176 -14
data/vendor/scs/test/problems/degenerate.h +130 -0
data/vendor/scs/test/problems/hs21_tiny_qp.h +124 -0
data/vendor/scs/test/problems/hs21_tiny_qp_rw.h +116 -0
data/vendor/scs/test/problems/infeasible_tiny_qp.h +100 -0
data/vendor/scs/test/problems/qafiro_tiny_qp.h +199 -0
data/vendor/scs/test/problems/random_prob +0 -0
data/vendor/scs/test/problems/random_prob.h +45 -0
data/vendor/scs/test/problems/rob_gauss_cov_est.h +188 -31
data/vendor/scs/test/problems/small_lp.h +13 -14
data/vendor/scs/test/problems/test_fails.h +43 -0
data/vendor/scs/test/problems/unbounded_tiny_qp.h +82 -0
data/vendor/scs/test/random_socp_prob.c +54 -53
data/vendor/scs/test/rng.h +109 -0
data/vendor/scs/test/run_from_file.c +19 -10
data/vendor/scs/test/run_tests.c +27 -3
metadata +20 -8
data/vendor/scs/linsys/amatrix.c +0 -305
data/vendor/scs/linsys/amatrix.h +0 -36
data/vendor/scs/linsys/amatrix.o +0 -0
data/vendor/scs/test/data/small_random_socp +0 -0
data/vendor/scs/test/problems/small_random_socp.h +0 -33
data/vendor/scs/test/run_tests +0 -2

data/vendor/scs/linsys/external/qdldl/qdldl.c CHANGED Viewed

@@ -1,35 +1,9 @@
 #include "qdldl.h"
-#include "ctrlc.h"
 #define QDLDL_UNKNOWN (-1)
 #define QDLDL_USED (1)
 #define QDLDL_UNUSED (0)
-// //DEBUG
-// #include <stdio.h>
-// void qdprint_arrayi(const QDLDL_int* data, QDLDL_int n,char* varName){
-//   QDLDL_int i;
-//   printf("%s = [",varName);
-//   for(i=0; i< n; i++){
-//     printf("%lli,",data[i]);
-//   }
-//   printf("]\n");
-// }
-// void qdprint_arrayf(const QDLDL_float* data, QDLDL_int n, char* varName){
-//   QDLDL_int i;
-//   printf("%s = [",varName);
-//   for(i=0; i< n; i++){
-//     printf("%.3g,",data[i]);
-//   }
-//   printf("]\n");
-// }
-// // END DEBUG
 /* Compute the elimination tree for a quasidefinite matrix
    in compressed sparse column form.
 */
@@ -41,7 +15,7 @@ QDLDL_int QDLDL_etree(const QDLDL_int  n,
                       QDLDL_int* Lnz,
                       QDLDL_int* etree){
-  QDLDL_int sumLnz = 0;
+  QDLDL_int sumLnz;
   QDLDL_int i,j,p;
@@ -76,8 +50,19 @@ QDLDL_int QDLDL_etree(const QDLDL_int  n,
   }
   //compute the total nonzeros in L.  This much
-  //space is required to store Li and Lx
-  for(i = 0; i < n; i++){sumLnz += Lnz[i];}
+  //space is required to store Li and Lx.  Return
+  //error code -2 if the nonzero count will overflow
+  //its integer type.
+  sumLnz  = 0;
+  for(i = 0; i < n; i++){
+    if(sumLnz > QDLDL_INT_MAX - Lnz[i]){
+      sumLnz = -2;
+      break;
+    }
+    else{
+      sumLnz += Lnz[i];
+    }
+  }
   return sumLnz;
 }
@@ -139,10 +124,6 @@ QDLDL_int QDLDL_factor(const QDLDL_int    n,
   //Start from 1 here. The upper LH corner is trivially 0
   //in L b/c we are only computing the subdiagonal elements
   for(k = 1; k < n; k++){
-    if(scs_is_interrupted()) {
-      scs_printf("interrupt detected in factorization\n");
-      return -1;
-    }
     //NB : For each k, we compute a solution to
     //y = L(0:(k-1),0:k-1))\b, where b is the kth
@@ -258,11 +239,12 @@ void QDLDL_Lsolve(const QDLDL_int    n,
                   const QDLDL_float* Lx,
                   QDLDL_float* x){
-QDLDL_int i,j;
+  QDLDL_int i,j;
   for(i = 0; i < n; i++){
-      for(j = Lp[i]; j < Lp[i+1]; j++){
-          x[Li[j]] -= Lx[j]*x[i];
-      }
+    QDLDL_float val = x[i];
+    for(j = Lp[i]; j < Lp[i+1]; j++){
+      x[Li[j]] -= Lx[j]*val;
+    }
   }
 }
@@ -273,11 +255,13 @@ void QDLDL_Ltsolve(const QDLDL_int    n,
                    const QDLDL_float* Lx,
                    QDLDL_float* x){
-QDLDL_int i,j;
+  QDLDL_int i,j;
   for(i = n-1; i>=0; i--){
-      for(j = Lp[i]; j < Lp[i+1]; j++){
-          x[i] -= Lx[j]*x[Li[j]];
-      }
+    QDLDL_float val = x[i];
+    for(j = Lp[i]; j < Lp[i+1]; j++){
+      val -= Lx[j]*x[Li[j]];
+    }
+    x[i] = val;
   }
 }
@@ -289,10 +273,9 @@ void QDLDL_solve(const QDLDL_int       n,
                     const QDLDL_float* Dinv,
                     QDLDL_float* x){
-QDLDL_int i;
-QDLDL_Lsolve(n,Lp,Li,Lx,x);
-for(i = 0; i < n; i++) x[i] *= Dinv[i];
-QDLDL_Ltsolve(n,Lp,Li,Lx,x);
+  QDLDL_int i;
+  QDLDL_Lsolve(n,Lp,Li,Lx,x);
+  for(i = 0; i < n; i++) x[i] *= Dinv[i];
+  QDLDL_Ltsolve(n,Lp,Li,Lx,x);
 }

data/vendor/scs/linsys/external/qdldl/qdldl.h CHANGED Viewed

@@ -32,19 +32,17 @@ extern "C" {
   * this function will *not* return an error, as it may still be possible to factor
   * such a matrix in LDL form.   No promises are made in this case though...
   *
-  * @param   n     number of columns in CSC matrix A (assumed square)
+  * @param  n      number of columns in CSC matrix A (assumed square)
   * @param  Ap     column pointers (size n+1) for columns of A
   * @param  Ai     row indices of A.  Has Ap[n] elements
   * @param  work   work vector (size n) (no meaning on return)
   * @param  Lnz    count of nonzeros in each column of L (size n) below diagonal
   * @param  etree  elimination tree (size n)
-  * @return total  sum of Lnz (i.e. total nonzeros in L below diagonal). Returns
-  *                -1 if the input does not have triu structure or has an empty
-  *                column.
-  *
+  * @return total  sum of Lnz (i.e. total nonzeros in L below diagonal).
+  *                Returns -1 if the input is not triu or has an empty column.
+  *                Returns -2 if the return value overflows QDLDL_int.
   *
 */
  QDLDL_int QDLDL_etree(const QDLDL_int   n,
                        const QDLDL_int* Ap,
                        const QDLDL_int* Ai,
@@ -52,6 +50,7 @@ extern "C" {
                        QDLDL_int* Lnz,
                        QDLDL_int* etree);
 /**
   * Compute an LDL decomposition for a quasidefinite matrix
   * in compressed sparse column form, where the input matrix is
@@ -61,21 +60,22 @@ extern "C" {
   * Returns factors L, D and Dinv = 1./D.
   *
   * Does not use MALLOC.  It is assumed that L will be a compressed
-  * sparse column matrix with data (Ln,Lp,Li)  with sufficient space
+  * sparse column matrix with data (n,Lp,Li,Lx)  with sufficient space
   * allocated, with a number of nonzeros equal to the count given
-  * as a return value by osqp_ldl_etree
-  *
-  * @param   n     number of columns in L and A (both square)
-  * @param  Ap     column pointers (size n+1) for columns of A
-  * @param  Ai     row indices of A.  Has Ap[n] elements
-  * @param  Ln     number of columns in CSC matrix L
-  * @param  Lp     column pointers (size Ln+1) for columns of L
-  * @param  Li     row indices of L.  Has Lp[Ln] elements
+  * as a return value by QDLDL_etree
+  *
+  * @param  n      number of columns in L and A (both square)
+  * @param  Ap     column pointers (size n+1) for columns of A (not modified)
+  * @param  Ai     row indices of A.  Has Ap[n] elements (not modified)
+  * @param  Ax     data of A.  Has Ap[n] elements (not modified)
+  * @param  Lp     column pointers (size n+1) for columns of L
+  * @param  Li     row indices of L.  Has Lp[n] elements
+  * @param  Lx     data of L.  Has Lp[n] elements
   * @param  D      vectorized factor D.  Length is n
   * @param  Dinv   reciprocal of D.  Length is n
   * @param  Lnz    count of nonzeros in each column of L below diagonal,
-  *                as given by osqp_ldl_etree (not modified)
-  * @param  etree  elimination tree as as given by osqp_ldl_etree (not modified)
+  *                as given by QDLDL_etree (not modified)
+  * @param  etree  elimination tree as given by QDLDL_etree (not modified)
   * @param  bwork  working array of bools. Length is n
   * @param  iwork  working array of integers. Length is 3*n
   * @param  fwork  working array of floats. Length is n
@@ -85,8 +85,6 @@ extern "C" {
   *                or otherwise LDL factorisable)
   *
 */
 QDLDL_int QDLDL_factor(const QDLDL_int    n,
                   const QDLDL_int*   Ap,
                   const QDLDL_int*   Ai,
@@ -107,16 +105,15 @@ QDLDL_int QDLDL_factor(const QDLDL_int    n,
   * Solves LDL'x = b
   *
   * It is assumed that L will be a compressed
-  * sparse column matrix with data (Ln,Lp,Li).
+  * sparse column matrix with data (n,Lp,Li,Lx).
   *
-  * @param   n     number of columns in L (both square)
-  * @param  Ln     number of columns in CSC matrix L
-  * @param  Lp     column pointers (size Ln+1) for columns of L
-  * @param  Li     row indices of L.  Has Lp[Ln] elements
+  * @param  n      number of columns in L
+  * @param  Lp     column pointers (size n+1) for columns of L
+  * @param  Li     row indices of L.  Has Lp[n] elements
+  * @param  Lx     data of L.  Has Lp[n] elements
   * @param  Dinv   reciprocal of D.  Length is n
   * @param  x      initialized to b.  Overwritten with the solution x on return
   *
-  *
 */
 void QDLDL_solve(const QDLDL_int    n,
                  const QDLDL_int*   Lp,
@@ -130,40 +127,35 @@ void QDLDL_solve(const QDLDL_int    n,
  * Solves (L+I)x = b
  *
  * It is assumed that L will be a compressed
- * sparse column matrix with data (Ln,Lp,Li).
+ * sparse column matrix with data (n,Lp,Li,Lx).
  *
- * @param   n     number of columns in L (both square)
- * @param  Ln     number of columns in CSC matrix L
- * @param  Lp     column pointers (size Ln+1) for columns of L
- * @param  Li     row indices of L.  Has Lp[Ln] elements
- * @param  Dinv   reciprocal of D.  Length is n
+ * @param  n      number of columns in L
+ * @param  Lp     column pointers (size n+1) for columns of L
+ * @param  Li     row indices of L.  Has Lp[n] elements
+ * @param  Lx     data of L.  Has Lp[n] elements
  * @param  x      initialized to b.  Overwritten with the solution x on return
  *
- *
 */
 void QDLDL_Lsolve(const QDLDL_int    n,
                   const QDLDL_int*   Lp,
                   const QDLDL_int*   Li,
                   const QDLDL_float* Lx,
                   QDLDL_float* x);
 /**
  * Solves (L+I)'x = b
  *
  * It is assumed that L will be a compressed
- * sparse column matrix with data (Ln,Lp,Li).
+ * sparse column matrix with data (n,Lp,Li,Lx).
  *
- * @param   n     number of columns in L (both square)
- * @param  Ln     number of columns in CSC matrix L
- * @param  Lp     column pointers (size Ln+1) for columns of L
- * @param  Li     row indices of L.  Has Lp[Ln] elements
- * @param  Dinv   reciprocal of D.  Length is n
+ * @param  n      number of columns in L
+ * @param  Lp     column pointers (size n+1) for columns of L
+ * @param  Li     row indices of L.  Has Lp[n] elements
+ * @param  Lx     data of L.  Has Lp[n] elements
  * @param  x      initialized to b.  Overwritten with the solution x on return
  *
- *
 */
 void QDLDL_Ltsolve(const QDLDL_int    n,
                    const QDLDL_int*   Lp,
                    const QDLDL_int*   Li,

data/vendor/scs/linsys/external/qdldl/qdldl.o CHANGED Viewed

Binary file

data/vendor/scs/linsys/external/qdldl/qdldl_types.h CHANGED Viewed

@@ -1,18 +1,26 @@
 #ifndef QDLDL_TYPES_H
 # define QDLDL_TYPES_H
-#include "glbopts.h"
 # ifdef __cplusplus
 extern "C" {
 # endif /* ifdef __cplusplus */
-// QDLDL integer and float types
+#include "glbopts.h"
+#include <limits.h> //for INT_MAX and LLONG_MAX, used to define QDLDL_INT_MAX
+/* QDLDL integer and float types */
 #define QDLDL_int scs_int
 #define QDLDL_float scs_float
 #define QDLDL_bool scs_int
+/* Maximum value of the signed type QDLDL_int */
+#ifdef DLONG
+#define QDLDL_INT_MAX LLONG_MAX
+#else
+#define QDLDL_INT_MAX INT_MAX
+#endif
 # ifdef __cplusplus
 }
 # endif /* ifdef __cplusplus */

data/vendor/scs/linsys/gpu/gpu.c CHANGED Viewed

@@ -1,8 +1,10 @@
 #include "gpu.h"
-void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *Ag, const cusparseDnVecDescr_t x,
-                               cusparseDnVecDescr_t y, cusparseHandle_t cusparse_handle,
-                               size_t *buffer_size, void **buffer) {
+void SCS(accum_by_atrans_gpu)(const ScsGpuMatrix *Ag,
+                              const cusparseDnVecDescr_t x,
+                              cusparseDnVecDescr_t y,
+                              cusparseHandle_t cusparse_handle,
+                              size_t *buffer_size, void **buffer) {
   /* y += A'*x
      x and y MUST be on GPU already
   */
@@ -10,10 +12,8 @@ void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *Ag, const cusparseDnVecDescr_
   size_t new_buffer_size = 0;
   CUSPARSE_GEN(SpMV_bufferSize)
-  (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-    &onef, Ag->descr, x, &onef, y,
-    SCS_CUDA_FLOAT, SCS_CSRMV_ALG,
-    &new_buffer_size);
+  (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &onef, Ag->descr, x,
+   &onef, y, SCS_CUDA_FLOAT, SCS_CSRMV_ALG, &new_buffer_size);
   if (new_buffer_size > *buffer_size) {
     if (*buffer != SCS_NULL) {
@@ -24,15 +24,15 @@ void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *Ag, const cusparseDnVecDescr_
   }
   CUSPARSE_GEN(SpMV)
-  (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-    &onef, Ag->descr, x, &onef, y,
-    SCS_CUDA_FLOAT, SCS_CSRMV_ALG,
-    buffer);
+  (cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &onef, Ag->descr, x,
+   &onef, y, SCS_CUDA_FLOAT, SCS_CSRMV_ALG, buffer);
 }
-void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *Ag, const cusparseDnVecDescr_t x,
-                          cusparseDnVecDescr_t y, cusparseHandle_t cusparse_handle,
-                          size_t *buffer_size, void **buffer) {
+/* this is slow, use trans routine if possible */
+void SCS(accum_by_a_gpu)(const ScsGpuMatrix *Ag, const cusparseDnVecDescr_t x,
+                         cusparseDnVecDescr_t y,
+                         cusparseHandle_t cusparse_handle, size_t *buffer_size,
+                         void **buffer) {
   /* y += A*x
      x and y MUST be on GPU already
    */
@@ -40,12 +40,9 @@ void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *Ag, const cusparseDnVecDescr_t x,
   size_t new_buffer_size = 0;
   /* The A matrix idx pointers must be ORDERED */
   CUSPARSE_GEN(SpMV_bufferSize)
-  (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE,
-    &onef, Ag->descr, x, &onef, y,
-    SCS_CUDA_FLOAT, SCS_CSRMV_ALG,
-    &new_buffer_size);
+  (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, &onef, Ag->descr, x, &onef, y,
+   SCS_CUDA_FLOAT, SCS_CSRMV_ALG, &new_buffer_size);
   if (new_buffer_size > *buffer_size) {
     if (*buffer != SCS_NULL) {
@@ -56,10 +53,21 @@ void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *Ag, const cusparseDnVecDescr_t x,
   }
   CUSPARSE_GEN(SpMV)
-  (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE,
-    &onef, Ag->descr, x, &onef, y,
-    SCS_CUDA_FLOAT, SCS_CSRMV_ALG,
-    buffer);
+  (cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, &onef, Ag->descr, x, &onef, y,
+   SCS_CUDA_FLOAT, SCS_CSRMV_ALG, buffer);
+}
+/* This assumes that P has been made full (ie not triangular) and uses the
+ * fact that the GPU is faster for general sparse matrices than for symmetric
+ */
+/* y += P*x
+   x and y MUST be on GPU already
+ */
+void SCS(accum_by_p_gpu)(const ScsGpuMatrix *Pg, const cusparseDnVecDescr_t x,
+                         cusparseDnVecDescr_t y,
+                         cusparseHandle_t cusparse_handle, size_t *buffer_size,
+                         void **buffer) {
+  SCS(accum_by_atrans_gpu)(Pg, x, y, cusparse_handle, buffer_size, buffer);
 }
 void SCS(free_gpu_matrix)(ScsGpuMatrix *A) {
@@ -68,13 +76,3 @@ void SCS(free_gpu_matrix)(ScsGpuMatrix *A) {
   cudaFree(A->p);
   cusparseDestroySpMat(A->descr);
 }
-void SCS(normalize_a)(ScsMatrix *A, const ScsSettings *stgs, const ScsCone *k,
-                      ScsScaling *scal) {
-  SCS(_normalize_a)(A, stgs, k, scal);
-}
-void SCS(un_normalize_a)(ScsMatrix *A, const ScsSettings *stgs,
-                         const ScsScaling *scal) {
-  SCS(_un_normalize_a)(A, stgs, scal);
-}

data/vendor/scs/linsys/gpu/gpu.h CHANGED Viewed

@@ -10,49 +10,58 @@ extern "C" {
 #include <cuda_runtime_api.h>
 #include <cusparse.h>
-#include "amatrix.h"
 #include "glbopts.h"
 #include "linalg.h"
 #include "linsys.h"
 #include "scs.h"
+#include "scs_matrix.h"
 #include "util.h"
-#define CUDA_CHECK_ERR                                                    \
-  do {                                                                    \
-    cudaError_t err = cudaGetLastError();                                 \
-    if (err != cudaSuccess) {                                             \
-      printf("%s:%d:%s\n ERROR_CUDA: %s\n", __FILE__, __LINE__, __func__, \
-             cudaGetErrorString(err));                                    \
-    }                                                                     \
+#define CUDA_CHECK_ERR                                                         \
+  do {                                                                         \
+    cudaDeviceSynchronize();                                                   \
+    cudaError_t err = cudaGetLastError();                                      \
+    if (err != cudaSuccess) {                                                  \
+      scs_printf("%s:%d:%s\n ERROR_CUDA (#): %s\n", __FILE__, __LINE__,        \
+                 __func__, cudaGetErrorString(err));                           \
+    }                                                                          \
   } while (0)
-#ifndef EXTRA_VERBOSE
+#if VERBOSITY == 0
 #ifndef SFLOAT
 #define CUBLAS(x) cublasD##x
+#define CUBLASI(x) cublasId##x
 #define CUSPARSE(x) cusparseD##x
 #else
 #define CUBLAS(x) cublasS##x
+#define CUBLASI(x) cublasIs##x
 #define CUSPARSE(x) cusparseS##x
 #endif
 #define CUSPARSE_GEN(x) cusparse##x
 #else
 #ifndef SFLOAT
-#define CUBLAS(x) \
-  CUDA_CHECK_ERR; \
+#define CUBLAS(x)                                                              \
+  CUDA_CHECK_ERR;                                                              \
   cublasD##x
-#define CUSPARSE(x) \
-  CUDA_CHECK_ERR;   \
+#define CUBLASI(x)                                                             \
+  CUDA_CHECK_ERR;                                                              \
+  cublasId##x
+#define CUSPARSE(x)                                                            \
+  CUDA_CHECK_ERR;                                                              \
   cusparseD##x
 #else
-#define CUBLAS(x) \
-  CUDA_CHECK_ERR; \
+#define CUBLAS(x)                                                              \
+  CUDA_CHECK_ERR;                                                              \
   cublasS##x
-#define CUSPARSE(x) \
-  CUDA_CHECK_ERR;   \
+#define CUBLASI(x)                                                             \
+  CUDA_CHECK_ERR;                                                              \
+  cublasIs##x
+#define CUSPARSE(x)                                                            \
+  CUDA_CHECK_ERR;                                                              \
   cusparseS##x
 #endif
-#define CUSPARSE_GEN(x) \
-  CUDA_CHECK_ERR;       \
+#define CUSPARSE_GEN(x)                                                        \
+  CUDA_CHECK_ERR;                                                              \
   cusparse##x
 #endif
@@ -78,25 +87,33 @@ extern "C" {
     A'(n x m)       A  (m x n)      Agt     accum_by_a_gpu
 */
-/* this struct defines the data matrix A on GPU */
-typedef struct SCS_GPU_A_DATA_MATRIX {
+/* this struct defines the data matrix on GPU */
+typedef struct SCS_GPU_DATA_MATRIX {
   /* A is supplied in column compressed format */
-  scs_float *x; /* A values, size: NNZ A */
-  scs_int *i;   /* A row index, size: NNZ A */
-  scs_int *p;   /* A column pointer, size: n+1 */
+  scs_float *x; /* values, size: NNZ */
+  scs_int *i;   /* row index, size: NNZ */
+  scs_int *p;   /* column pointer, size: n+1 */
   scs_int m, n; /* m rows, n cols */
-  scs_int Annz; /* num non-zeros in A matrix */
+  scs_int nnz;  /* num non-zeros in matrix */
   /* CUDA */
   cusparseSpMatDescr_t descr;
 } ScsGpuMatrix;
-void SCS(_accum_by_atrans_gpu)(const ScsGpuMatrix *A, const cusparseDnVecDescr_t x,
-                               cusparseDnVecDescr_t y, cusparseHandle_t cusparse_handle,
-                               size_t *buffer_size, void **buffer);
+void SCS(accum_by_atrans_gpu)(const ScsGpuMatrix *A,
+                              const cusparseDnVecDescr_t x,
+                              cusparseDnVecDescr_t y,
+                              cusparseHandle_t cusparse_handle,
+                              size_t *buffer_size, void **buffer);
-void SCS(_accum_by_a_gpu)(const ScsGpuMatrix *A, const cusparseDnVecDescr_t x,
-                          cusparseDnVecDescr_t y, cusparseHandle_t cusparse_handle,
-                          size_t *buffer_size, void **buffer);
+void SCS(accum_by_a_gpu)(const ScsGpuMatrix *A, const cusparseDnVecDescr_t x,
+                         cusparseDnVecDescr_t y,
+                         cusparseHandle_t cusparse_handle, size_t *buffer_size,
+                         void **buffer);
+void SCS(accum_by_p_gpu)(const ScsGpuMatrix *P, const cusparseDnVecDescr_t x,
+                         cusparseDnVecDescr_t y,
+                         cusparseHandle_t cusparse_handle, size_t *buffer_size,
+                         void **buffer);
 void SCS(free_gpu_matrix)(ScsGpuMatrix *A);