RubyGems - nmatrix - Versions diffs - 0.0.4 → 0.0.5 - Mend

nmatrix 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

checksums.yaml +7 -0
data/History.txt +68 -2
data/Manifest.txt +1 -0
data/README.rdoc +8 -7
data/Rakefile +13 -2
data/ext/nmatrix/data/complex.h +19 -1
data/ext/nmatrix/data/data.h +8 -0
data/ext/nmatrix/data/ruby_object.h +1 -0
data/ext/nmatrix/extconf.rb +6 -4
data/ext/nmatrix/nmatrix.cpp +97 -35
data/ext/nmatrix/nmatrix.h +2 -0
data/ext/nmatrix/ruby_constants.cpp +11 -1
data/ext/nmatrix/ruby_constants.h +6 -1
data/ext/nmatrix/storage/dense.cpp +2 -2
data/ext/nmatrix/storage/yale.cpp +303 -49
data/ext/nmatrix/storage/yale.h +3 -0
data/ext/nmatrix/util/math.cpp +112 -0
data/ext/nmatrix/util/math.h +372 -72
data/lib/nmatrix/blas.rb +55 -9
data/lib/nmatrix/nmatrix.rb +315 -2
data/lib/nmatrix/nvector.rb +156 -95
data/lib/nmatrix/version.rb +1 -1
data/lib/nmatrix/yale_functions.rb +112 -0
data/spec/blas_spec.rb +11 -0
data/spec/elementwise_spec.rb +4 -1
data/spec/io_spec.rb +8 -0
data/spec/lapack_spec.rb +37 -15
data/spec/leakcheck.rb +16 -0
data/spec/math_spec.rb +6 -2
data/spec/nmatrix_spec.rb +209 -3
data/spec/nmatrix_yale_spec.rb +55 -0
data/spec/nvector_spec.rb +33 -14
data/spec/slice_spec.rb +26 -17
data/spec/spec_helper.rb +17 -0
metadata +60 -45
data/ext/nmatrix/new_extconf.rb +0 -55

data/ext/nmatrix/storage/yale.h CHANGED Viewed

@@ -102,6 +102,9 @@ extern "C" {
   void*	nm_yale_storage_ref(STORAGE* s, SLICE* slice);
   char  nm_yale_storage_set(STORAGE* storage, SLICE* slice, void* v);
+  //char  nm_yale_storage_vector_insert(YALE_STORAGE* s, size_t pos, size_t* js, void* vals, size_t n, bool struct_only, nm::dtype_t dtype, nm::itype_t itype);
+  //void  nm_yale_storage_increment_ia_after(YALE_STORAGE* s, size_t ija_size, size_t i, size_t n);
   size_t  nm_yale_storage_get_size(const YALE_STORAGE* storage);
   ///////////

data/ext/nmatrix/util/math.cpp CHANGED Viewed

@@ -127,6 +127,8 @@ extern "C" {
   #include <clapack.h>
 #endif
+  static VALUE nm_cblas_nrm2(VALUE self, VALUE n, VALUE x, VALUE incx);
+  static VALUE nm_cblas_asum(VALUE self, VALUE n, VALUE x, VALUE incx);
   static VALUE nm_cblas_rot(VALUE self, VALUE n, VALUE x, VALUE incx, VALUE y, VALUE incy, VALUE c, VALUE s);
   static VALUE nm_cblas_rotg(VALUE self, VALUE ab);
@@ -307,6 +309,8 @@ void nm_math_init_blas() {
   cNMatrix_BLAS = rb_define_module_under(cNMatrix, "BLAS");
+  rb_define_singleton_method(cNMatrix_BLAS, "cblas_nrm2", (METHOD)nm_cblas_nrm2, 3);
+  rb_define_singleton_method(cNMatrix_BLAS, "cblas_asum", (METHOD)nm_cblas_asum, 3);
   rb_define_singleton_method(cNMatrix_BLAS, "cblas_rot",  (METHOD)nm_cblas_rot,  7);
   rb_define_singleton_method(cNMatrix_BLAS, "cblas_rotg", (METHOD)nm_cblas_rotg, 1);
@@ -515,6 +519,114 @@ static VALUE nm_cblas_rot(VALUE self, VALUE n, VALUE x, VALUE incx, VALUE y, VAL
 }
+/*
+ * Call any of the cblas_xnrm2 functions as directly as possible.
+ *
+ * xNRM2 is a BLAS level 1 routine which calculates the 2-norm of an n-vector x.
+ *
+ * Arguments:
+ *  * n     :: length of x, must be at least 0
+ *  * x     :: pointer to first entry of input vector
+ *  * incx  :: stride of x, must be POSITIVE (ATLAS says non-zero, but 3.8.4 code only allows positive)
+ *
+ * You probably don't want to call this function. Instead, why don't you try nrm2, which is more flexible
+ * with its arguments?
+ *
+ * This function does almost no type checking. Seriously, be really careful when you call it! There's no exception
+ * handling, so you can easily crash Ruby!
+ */
+static VALUE nm_cblas_nrm2(VALUE self, VALUE n, VALUE x, VALUE incx) {
+  // Kernel table indexed by dtype. The five leading slots are left empty on purpose: no 2-norm kernel is
+  // offered for the integer dtypes.
+  static void (*ttable[nm::NUM_DTYPES])(const int N, const void* X, const int incX, void* sum) = {
+      NULL, NULL, NULL, NULL, NULL, // no help for integers
+      nm::math::cblas_nrm2<float32_t,float32_t>,
+      nm::math::cblas_nrm2<float64_t,float64_t>,
+      nm::math::cblas_nrm2<float32_t,nm::Complex64>,
+      nm::math::cblas_nrm2<float64_t,nm::Complex128>,
+      nm::math::cblas_nrm2<nm::Rational32,nm::Rational32>,
+      nm::math::cblas_nrm2<nm::Rational64,nm::Rational64>,
+      nm::math::cblas_nrm2<nm::Rational128,nm::Rational128>,
+      nm::math::cblas_nrm2<nm::RubyObject,nm::RubyObject>
+  };
+  const nm::dtype_t dtype = NM_DTYPE(x);
+  // Guard clause: bail out before dispatching when the table has no kernel for this dtype.
+  if (ttable[dtype] == NULL) {
+    rb_raise(nm_eDataTypeError, "this vector operation undefined for integer vectors");
+    return Qnil;
+  }
+  // Complex input yields a real norm of matching precision; every other dtype returns its own type.
+  nm::dtype_t rdtype;
+  switch (dtype) {
+    case nm::COMPLEX64:   rdtype = nm::FLOAT32;  break;
+    case nm::COMPLEX128:  rdtype = nm::FLOAT64;  break;
+    default:              rdtype = dtype;        break;
+  }
+  // Scratch buffer sized for exactly one element of the return dtype.
+  void* Result = ALLOCA_N(char, DTYPE_SIZES[rdtype]);
+  ttable[dtype](FIX2INT(n), NM_STORAGE_DENSE(x)->elements, FIX2INT(incx), Result);
+  return rubyobj_from_cval(Result, rdtype).rval;
+}
+/*
+ * Call any of the cblas_xasum functions as directly as possible.
+ *
+ * xASUM is a BLAS level 1 routine which calculates the sum of absolute values of the entries
+ * of a vector x.
+ *
+ * Arguments:
+ *  * n     :: length of x, must be at least 0
+ *  * x     :: pointer to first entry of input vector
+ *  * incx  :: stride of x, must be POSITIVE (ATLAS says non-zero, but 3.8.4 code only allows positive)
+ *
+ * You probably don't want to call this function. Instead, why don't you try asum, which is more flexible
+ * with its arguments?
+ *
+ * This function does almost no type checking. Seriously, be really careful when you call it! There's no exception
+ * handling, so you can easily crash Ruby!
+ */
+static VALUE nm_cblas_asum(VALUE self, VALUE n, VALUE x, VALUE incx) {
+  // Kernel table indexed by dtype; unlike the 2-norm, every dtype has an entry here.
+  static void (*ttable[nm::NUM_DTYPES])(const int N, const void* X, const int incX, void* sum) = {
+      nm::math::cblas_asum<uint8_t,uint8_t>,
+      nm::math::cblas_asum<int8_t,int8_t>,
+      nm::math::cblas_asum<int16_t,int16_t>,
+      nm::math::cblas_asum<int32_t,int32_t>,
+      nm::math::cblas_asum<int64_t,int64_t>,
+      nm::math::cblas_asum<float32_t,float32_t>,
+      nm::math::cblas_asum<float64_t,float64_t>,
+      nm::math::cblas_asum<float32_t,nm::Complex64>,
+      nm::math::cblas_asum<float64_t,nm::Complex128>,
+      nm::math::cblas_asum<nm::Rational32,nm::Rational32>,
+      nm::math::cblas_asum<nm::Rational64,nm::Rational64>,
+      nm::math::cblas_asum<nm::Rational128,nm::Rational128>,
+      nm::math::cblas_asum<nm::RubyObject,nm::RubyObject>
+  };
+  const nm::dtype_t dtype = NM_DTYPE(x);
+  // Complex input yields a real sum of matching precision; every other dtype returns its own type.
+  nm::dtype_t rdtype;
+  switch (dtype) {
+    case nm::COMPLEX64:   rdtype = nm::FLOAT32;  break;
+    case nm::COMPLEX128:  rdtype = nm::FLOAT64;  break;
+    default:              rdtype = dtype;        break;
+  }
+  // Scratch buffer sized for exactly one element of the return dtype.
+  void* Result = ALLOCA_N(char, DTYPE_SIZES[rdtype]);
+  ttable[dtype](FIX2INT(n), NM_STORAGE_DENSE(x)->elements, FIX2INT(incx), Result);
+  return rubyobj_from_cval(Result, rdtype).rval;
+}
 /* Call any of the cblas_xgemm functions as directly as possible.
  *
  * The cblas_xgemm functions (dgemm, sgemm, cgemm, and zgemm) define the following operation:

data/ext/nmatrix/util/math.h CHANGED Viewed

@@ -1026,33 +1026,31 @@ inline bool gemv(const enum CBLAS_TRANSPOSE Trans, const int M, const int N, con
 // Yale: numeric matrix multiply c=a*b
 template <typename DType, typename IType>
-inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const DType* a, const bool diaga,
+inline void numbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const DType* a, const bool diaga,
             const IType* ib, const IType* jb, const DType* b, const bool diagb, IType* ic, IType* jc, DType* c, const bool diagc) {
-  IType next[m];
-  DType sums[m];
+  const unsigned int max_lmn = std::max(std::max(m, n), l);
+  IType next[max_lmn];
+  DType sums[max_lmn];
   DType v;
   IType head, length, temp, ndnz = 0;
-  IType jj_start, jj_end, kk_start, kk_end;
-  IType i, j, k, kk, jj;
   IType minmn = std::min(m,n);
+  IType minlm = std::min(l,m);
-  for (i = 0; i < m; ++i) { // initialize scratch arrays
-    next[i] = std::numeric_limits<IType>::max();
-    sums[i] = 0;
+  for (IType idx = 0; idx < max_lmn; ++idx) { // initialize scratch arrays
+    next[idx] = std::numeric_limits<IType>::max();
+    sums[idx] = 0;
   }
-  for (i = 0; i < n; ++i) { // walk down the rows
+  for (IType i = 0; i < n; ++i) { // walk down the rows
     head = std::numeric_limits<IType>::max()-1; // head gets assigned as whichever column of B's row j we last visited
     length = 0;
-    jj_start = ia[i];
-    jj_end   = ia[i+1];
+    for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // walk through entries in each row
+      IType j;
-    for (jj = jj_start; jj <= jj_end; ++jj) { // walk through entries in each row
-      if (jj == jj_end) { // if we're in the last entry for this row:
+      if (jj == ia[i+1]) { // if we're in the last entry for this row:
         if (!diaga || i >= minmn) continue;
         j   = i;      // if it's a new Yale matrix, and last entry, get the diagonal position (j) and entry (ajj)
         v   = a[i];
@@ -1061,12 +1059,12 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
         v   = a[jj];
       }
-      kk_start = ib[j];   // Find the first entry of row j of matrix B
-      kk_end   = ib[j+1];
-      for (kk = kk_start; kk <= kk_end; ++kk) {
+      for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) {
+        IType k;
-        if (kk == kk_end) { // Get the column id for that entry
-          if (!diagb || j >= minmn) continue;
+        if (kk == ib[j+1]) { // Get the column id for that entry
+          if (!diagb || j >= minlm) continue;
           k  = j;
           sums[k] += v*b[k];
         } else {
@@ -1079,10 +1077,10 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
           head    = k;
           ++length;
         }
-      }
-    }
+      } // end of kk loop
+    } // end of jj loop
-    for (jj = 0; jj < length; ++jj) {
+    for (IType jj = 0; jj < length; ++jj) {
       if (sums[head] != 0) {
         if (diagc && head == i) {
           c[head] = sums[head];
@@ -1105,22 +1103,64 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
 } /* numbmm_ */
+/*
+template <typename DType, typename IType>
+inline void new_yale_matrix_multiply(const unsigned int m, const IType* ija, const DType* a, const IType* ijb, const DType* b, YALE_STORAGE* c_storage) {
+  unsigned int n = c_storage->shape[0],
+               l = c_storage->shape[1];
+  // Create a working vector of dimension max(m,l,n) and initial value IType::max():
+  std::vector<IType> mask(std::max(std::max(m,l),n), std::numeric_limits<IType>::max());
+  for (IType i = 0; i < n; ++i) { // A.rows.each_index do |i|
+    IType j, k;
+    size_t ndnz;
+    for (IType jj = ija[i]; jj <= ija[i+1]; ++jj) { // walk through column pointers for row i of A
+      j = (jj == ija[i+1]) ? i : ija[jj];   // Get the current column index (handle diagonals last)
+      if (j >= m) {
+        if (j == ija[jj]) rb_raise(rb_eIndexError, "ija array for left-hand matrix contains an out-of-bounds column index %u at position %u", jj, j);
+        else              break;
+      }
+      for (IType kk = ijb[j]; kk <= ijb[j+1]; ++kk) { // walk through column pointers for row j of B
+        if (j >= m) continue; // first of all, does B *have* a row j?
+        k = (kk == ijb[j+1]) ? j : ijb[kk];   // Get the current column index (handle diagonals last)
+        if (k >= l) {
+          if (k == ijb[kk]) rb_raise(rb_eIndexError, "ija array for right-hand matrix contains an out-of-bounds column index %u at position %u", kk, k);
+          else              break;
+        }
+        if (mask[k] == )
+      }
+    }
+  }
+}
+*/
 // Yale: Symbolic matrix multiply c=a*b
 template <typename IType>
-inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const bool diaga,
+inline size_t symbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const bool diaga,
             const IType* ib, const IType* jb, const bool diagb, IType* ic, const bool diagc) {
-  IType mask[m];
-  IType j, k, ndnz = n; /* Local variables */
+  unsigned int max_lmn = std::max(std::max(m,n), l);
+  IType mask[max_lmn];  // INDEX in the SMMP paper.
+  IType j, k; /* Local variables */
+  size_t ndnz = n;
+  for (IType idx = 0; idx < max_lmn; ++idx)
+    mask[idx] = std::numeric_limits<IType>::max();
-  for (j = 0; j < m; ++j)
-    mask[j] = std::numeric_limits<IType>::max();
-  if (diagc)  ic[0] = n+1;
-  else        ic[0] = 0;
+  if (ic) { // Only write to ic if it's supplied; otherwise, we're just counting.
+    if (diagc)  ic[0] = n+1;
+    else        ic[0] = 0;
+  }
   IType minmn = std::min(m,n);
+  IType minlm = std::min(l,m);
   for (IType i = 0; i < n; ++i) { // MAIN LOOP: through rows
@@ -1132,9 +1172,9 @@ inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia,
         j = i;
       } else j = ja[jj];
-      for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns of row J in matrix B.
+      for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns K of row J in matrix B.
         if (kk == ib[j+1]) {
-          if (!diagb || j >= minmn) continue;
+          if (!diagb || j >= minlm) continue;
           k = j;
         } else k = jb[kk];
@@ -1145,65 +1185,138 @@ inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia,
       }
     }
-    if (diagc && !mask[i]) --ndnz;
+    if (diagc && mask[i] == std::numeric_limits<IType>::max()) --ndnz;
-    ic[i+1] = ndnz;
+    if (ic) ic[i+1] = ndnz;
   }
-} /* symbmm_ */
+  return ndnz;
+} /* symbmm_ */
-//TODO: More efficient sorting algorithm than selection sort would be nice, probably.
-// Remember, we're dealing with unique keys, which simplifies things.
-// Doesn't have to be in-place, since we probably just multiplied and that wasn't in-place.
-template <typename DType, typename IType>
-inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
-  IType jj, min, min_jj;
-  DType temp_val;
-  for (size_t i = 0; i < n; ++i) {
-    // No need to sort if there are 0 or 1 entries in the row
-    if (ia[i+1] - ia[i] < 2) continue;
+// In-place quicksort (from Wikipedia) -- called by smmp_sort_columns, below. All functions are inclusive of left, right.
+namespace smmp_sort {
+  const size_t THRESHOLD = 4;  // switch to insertion sort for 4 elements or fewer
-    for (IType jj_start = ia[i]; jj_start < ia[i+1]; ++jj_start) {
+  template <typename DType, typename IType>
+  void print_array(DType* vals, IType* array, IType left, IType right) {
+    // Debug aid: writes "column:value" pairs for positions left..right (inclusive) to stderr on one line.
+    IType pos = left;
+    while (pos <= right) {
+      std::cerr << array[pos] << ":" << vals[pos] << "  ";
+      ++pos;
+    }
+    std::cerr << std::endl;
+  }
-      // If the previous min is just current-1, this key/value pair is already in sorted order.
-      // This follows from the unique condition on our column keys.
-      if (jj_start > ia[i] && min+1 == ja[jj_start]) {
-        min    = ja[jj_start];
-        continue;
+  template <typename DType, typename IType>
+  IType partition(DType* vals, IType* array, IType left, IType right, IType pivot) { // splits [left, right] around array[pivot]; returns the pivot's final index
+    IType pivotJ = array[pivot];
+    DType pivotV = vals[pivot];
+    // Park the pivot pair in the rightmost slot so the scan below never moves it
+    array[pivot] = array[right];
+    vals[pivot]  = vals[right];
+    array[right] = pivotJ;
+    vals[right]  = pivotV;
+    IType store = left; // next slot for a pair whose column index is <= the pivot's
+    for (IType idx = left; idx < right; ++idx) {
+      if (array[idx] <= pivotJ) {
+        // Swap the pairs at idx and store, keeping array and vals in step
+        std::swap(array[idx], array[store]);
+        std::swap(vals[idx],  vals[store]);
+        ++store;
       }
+    }
-      // find the minimum key (column index) between jj_start and ia[i+1]
-      min    = ja[jj_start];
-      min_jj = jj_start;
-      for (jj = jj_start+1; jj < ia[i+1]; ++jj) {
-        if (ja[jj] < min) {
-          min_jj = jj;
-          min    = ja[jj];
-        }
+    std::swap(array[store], array[right]); // move the parked pivot pair into its final position
+    std::swap(vals[store],  vals[right]);
+    return store;
+  }
+  // Recommended to use the median of left, right, and mid for the pivot.
+  template <typename IType>
+  IType median(IType a, IType b, IType c) {
+    // Middle value of the three arguments, resolved with at most three comparisons.
+    if (a < b) return (b < c) ? b : ((a < c) ? c : a);  // orderings: a b c / a c b / c a b
+    return (a < c) ? a : ((b < c) ? c : b);             // orderings: b a c / b c a / c b a
+  }
+  // Insertion sort is more efficient than quicksort for small N
+  template <typename DType, typename IType>
+  void insertion_sort(DType* vals, IType* array, IType left, IType right) { // sorts array[left..right] (inclusive) ascending, moving vals in step
+    for (IType idx = left; idx <= right; ++idx) {
+      IType col_to_insert = array[idx]; // pair being placed into the already-ordered run to its left
+      DType val_to_insert = vals[idx];
+      IType hole_pos = idx;
+      for (; hole_pos > left && col_to_insert < array[hole_pos-1]; --hole_pos) {
+        array[hole_pos] = array[hole_pos - 1];  // shift the larger column index up
+        vals[hole_pos]  = vals[hole_pos - 1];   // value goes along with it
       }
-      // if min is already first, skip this iteration
-      if (min_jj == jj_start) continue;
+      array[hole_pos] = col_to_insert; // drop the saved pair into the hole that opened up
+      vals[hole_pos]  = val_to_insert;
+    }
+  }
-      for (jj = jj_start; jj < ia[i+1]; ++jj) {
-        // swap minimum key/value pair with key/value pair in the first position.
-        if (min_jj != jj) {
-          // min already = ja[min_jj], so use this as temp_key
-          temp_val = a[min_jj];
+  // Sorts array[left..right] (inclusive) in place by column index, carrying vals along with it.
+  // Short ranges are handed to insertion_sort; longer ones are partitioned and sorted recursively.
+  template <typename DType, typename IType>
+  void quicksort(DType* vals, IType* array, IType left, IType right) {
-          ja[min_jj] = ja[jj];
-          a[min_jj] = a[jj];
+    if (left < right) {
+      if (right - left < THRESHOLD) {
+        insertion_sort(vals, array, left, right);
+      } else {
+        // In this branch right - left >= THRESHOLD, so the median of the three indices is the midpoint.
+        IType pivot = median(left, right, (IType)(((unsigned long)left + (unsigned long)right) / 2));
+        pivot = partition(vals, array, left, right, pivot);
-          ja[jj] = min;
-          a[jj] = temp_val;
-        }
+        // recursively sort elements smaller than the pivot; skipped when the pivot landed on `left`, because
+        // pivot-1 would wrap around for an unsigned IType when left == 0 and send the sort out of bounds
+        if (pivot > left)  quicksort<DType,IType>(vals, array, left, (IType)(pivot-1));
+        // recursively sort elements at least as big as the pivot; skipped when the pivot landed on `right`
+        if (pivot < right) quicksort<DType,IType>(vals, array, (IType)(pivot+1), right);
       }
     }
   }
+}; // end of namespace smmp_sort
+/*
+ * For use following symbmm and numbmm. Sorts the matrix entries in each row according to the column index.
+ * This utilizes quicksort, which is an in-place unstable sort (since there are no duplicate entries, we don't care
+ * about stability).
+ *
+ * TODO: It might be worthwhile to do a test for free memory, and if available, use an unstable sort that isn't in-place.
+ *
+ * TODO: It's actually probably possible to write an even faster sort, since symbmm/numbmm are not producing a random
+ * ordering. If someone is doing a lot of Yale matrix multiplication, it might benefit them to consider even insertion
+ * sort.
+ */
+template <typename DType, typename IType>
+inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) { // sorts each of the n rows' (ja, a) pairs by column index
+  for (size_t i = 0; i < n; ++i) { // row i occupies positions ia[i] .. ia[i+1]-1
+    if (ia[i+1] - ia[i] < 2) continue; // no need to sort rows containing zero or one elements.
+    else if (ia[i+1] - ia[i] <= smmp_sort::THRESHOLD) {
+      smmp_sort::insertion_sort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for small rows
+    } else {
+      smmp_sort::quicksort<DType, IType>(a, ja, ia[i], ia[i+1]-1);      // faster for large rows (and may call insertion_sort as well)
+    }
+  }
 }
 /*
  * Transposes a generic Yale matrix (old or new). Specify new by setting diaga = true.
  *
@@ -2025,7 +2138,194 @@ inline void rot(const int N, Complex128* X, const int incX, Complex128* Y, const
 template <typename DType, typename CSDType>
 inline void cblas_rot(const int N, void* X, const int incX, void* Y, const int incY, const void* c, const void* s) {
-  rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY, *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
+  rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY, // X and Y are reinterpreted as DType arrays
+                       *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s)); // c and s are read as single CSDType values
+}
+/*
+ * Level 1 BLAS routine which returns the 2-norm of an n-vector x.
+ *
+ * Based on input types, these are the valid return types:
+ *    int -> int
+ *    float -> float or double
+ *    double -> double
+ *    complex64 -> float or double
+ *    complex128 -> double
+ *    rational -> rational
+ */
+template <typename ReturnDType, typename DType>
+ReturnDType nrm2(const int N, const DType* X, const int incX) {
+  // The norm is accumulated as scale * sqrt(ssq), where scale tracks the largest magnitude seen so far;
+  // dividing by it before squaring keeps the intermediate squares from overflowing.
+  const DType ONE = 1, ZERO = 0;
+  typename LongDType<DType>::type scale = 0, ssq = 1, absxi, temp;
+  if ((N < 1) || (incX < 1))    return ZERO;
+  else if (N == 1)              return std::abs(X[0]);
+  for (int i = 0; i < N; ++i) {
+    // Zero entries add nothing to the norm. They must be skipped: while scale is still 0 (leading zeros,
+    // or an all-zero vector) the else-branch below would otherwise compute 0/0.
+    if (X[i*incX] == ZERO) continue;
+    absxi = std::abs(X[i*incX]);
+    if (scale < absxi) {
+      // New largest magnitude: rescale the running sum to the new scale.
+      temp  = scale / absxi;
+      scale = absxi;
+      ssq   = ONE + ssq * (temp * temp);
+    } else {
+      temp = absxi / scale;
+      ssq += temp * temp;
+    }
+  }
+  return scale * std::sqrt( ssq );
+}
+#ifdef HAVE_CBLAS_H
+template <>
+inline float nrm2(const int N, const float* X, const int incX) { // float vector -> float: forwards to cblas_snrm2
+  return cblas_snrm2(N, X, incX);
+}
+template <>
+inline double nrm2(const int N, const double* X, const int incX) { // double vector -> double: forwards to cblas_dnrm2
+  return cblas_dnrm2(N, X, incX);
+}
+template <>
+inline float nrm2(const int N, const Complex64* X, const int incX) { // Complex64 vector -> float: forwards to cblas_scnrm2
+  return cblas_scnrm2(N, X, incX);
+}
+template <>
+inline double nrm2(const int N, const Complex128* X, const int incX) { // Complex128 vector -> double: forwards to cblas_dznrm2
+  return cblas_dznrm2(N, X, incX);
+}
+#else
+// Folds one complex entry (real part xr, imaginary part xi) into a running scaled sum of squares, where the
+// norm so far equals scale * sqrt(ssq). Both scale and ssq are updated in place.
+template <typename FloatDType>
+static inline void nrm2_complex_helper(const FloatDType& xr, const FloatDType& xi, double& scale, double& ssq) {
+  const double parts[2] = { std::abs(xr), std::abs(xi) };
+  for (int c = 0; c < 2; ++c) {
+    const double absx = parts[c];
+    // A zero component adds nothing, and skipping it avoids 0/0 while scale is still 0.
+    if (absx == 0.0) continue;
+    if (scale < absx) {
+      // New largest magnitude: rescale the running sum to the new scale.
+      const double temp = scale / absx;
+      scale = absx;
+      ssq   = 1.0 + ssq * (temp * temp);
+    } else {
+      const double temp = absx / scale;
+      ssq += temp * temp;
+    }
+  }
+}
+// Fallback (no CBLAS) 2-norm of a Complex64 vector: each entry contributes its real and imaginary parts.
+// Marked inline because this full specialization is defined in a header.
+template <>
+inline float nrm2(const int N, const Complex64* X, const int incX) {
+  double scale = 0, ssq = 1;
+  if ((N < 1) || (incX < 1))    return 0.0;
+  for (int i = 0; i < N; ++i) {
+    // The helper must accumulate into ssq, which the return expression reads.
+    nrm2_complex_helper<float>(X[i*incX].r, X[i*incX].i, scale, ssq);
+  }
+  return scale * std::sqrt( ssq );
+}
+// Fallback (no CBLAS) 2-norm of a Complex128 vector: each entry contributes its real and imaginary parts.
+// Marked inline because this full specialization is defined in a header.
+template <>
+inline double nrm2(const int N, const Complex128* X, const int incX) {
+  double scale = 0, ssq = 1;
+  if ((N < 1) || (incX < 1))    return 0.0;
+  for (int i = 0; i < N; ++i) {
+    // The helper must accumulate into ssq, which the return expression reads.
+    nrm2_complex_helper<double>(X[i*incX].r, X[i*incX].i, scale, ssq);
+  }
+  return scale * std::sqrt( ssq );
+}
+#endif
+template <typename ReturnDType, typename DType>
+inline void cblas_nrm2(const int N, const void* X, const int incX, void* result) {
+  // Untyped entry point: view X as a DType array, compute the norm, and store it through result.
+  const DType* typed_x   = reinterpret_cast<const DType*>(X);
+  ReturnDType* typed_out = reinterpret_cast<ReturnDType*>(result);
+  *typed_out = nrm2<ReturnDType, DType>(N, typed_x, incX);
+}
+/*
+ * Level 1 BLAS routine which sums the absolute values of a vector's contents. If the vector consists of complex values,
+ * the routine sums the absolute values of the real and imaginary components as well.
+ *
+ * So, based on input types, these are the valid return types:
+ *    int -> int
+ *    float -> float or double
+ *    double -> double
+ *    complex64 -> float or double
+ *    complex128 -> double
+ *    rational -> rational
+ */
+template <typename ReturnDType, typename DType>
+inline ReturnDType asum(const int N, const DType* X, const int incX) {
+  // Sum of |X[k*incX]| for k in [0, N); a non-positive N or incX yields 0.
+  ReturnDType total = 0;
+  if ((N <= 0) || (incX <= 0)) return total;
+  for (int k = 0; k < N; ++k) {
+    total += std::abs(X[k*incX]);
+  }
+  return total;
+}
+#ifdef HAVE_CBLAS_H
+template <>
+inline float asum(const int N, const float* X, const int incX) { // float vector -> float: forwards to cblas_sasum
+  return cblas_sasum(N, X, incX);
+}
+template <>
+inline double asum(const int N, const double* X, const int incX) { // double vector -> double: forwards to cblas_dasum
+  return cblas_dasum(N, X, incX);
+}
+template <>
+inline float asum(const int N, const Complex64* X, const int incX) { // Complex64 vector -> float: forwards to cblas_scasum
+  return cblas_scasum(N, X, incX);
+}
+template <>
+inline double asum(const int N, const Complex128* X, const int incX) { // Complex128 vector -> double: forwards to cblas_dzasum
+  return cblas_dzasum(N, X, incX);
+}
+#else
+template <>
+inline float asum(const int N, const Complex64* X, const int incX) {
+  // |re| + |im| summed over N entries taken every incX elements; a non-positive N or incX yields 0.
+  float total = 0;
+  if ((N <= 0) || (incX <= 0)) return total;
+  for (int k = 0; k < N; ++k) {
+    const Complex64& entry = X[k*incX];
+    total += std::abs(entry.r) + std::abs(entry.i);
+  }
+  return total;
+}
+template <>
+inline double asum(const int N, const Complex128* X, const int incX) {
+  // |re| + |im| summed over N entries taken every incX elements; a non-positive N or incX yields 0.
+  double total = 0;
+  if ((N <= 0) || (incX <= 0)) return total;
+  for (int k = 0; k < N; ++k) {
+    const Complex128& entry = X[k*incX];
+    total += std::abs(entry.r) + std::abs(entry.i);
+  }
+  return total;
+}
+#endif
+template <typename ReturnDType, typename DType>
+inline void cblas_asum(const int N, const void* X, const int incX, void* sum) {
+  // Untyped entry point: view X as a DType array, compute the absolute sum, and store it through sum.
+  const DType* typed_x   = reinterpret_cast<const DType*>(X);
+  ReturnDType* typed_sum = reinterpret_cast<ReturnDType*>(sum);
+  *typed_sum = asum<ReturnDType, DType>(N, typed_x, incX);
 }