nmatrix 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -102,6 +102,9 @@ extern "C" {
102
102
  void* nm_yale_storage_ref(STORAGE* s, SLICE* slice);
103
103
  char nm_yale_storage_set(STORAGE* storage, SLICE* slice, void* v);
104
104
 
105
+ //char nm_yale_storage_vector_insert(YALE_STORAGE* s, size_t pos, size_t* js, void* vals, size_t n, bool struct_only, nm::dtype_t dtype, nm::itype_t itype);
106
+ //void nm_yale_storage_increment_ia_after(YALE_STORAGE* s, size_t ija_size, size_t i, size_t n);
107
+
105
108
  size_t nm_yale_storage_get_size(const YALE_STORAGE* storage);
106
109
 
107
110
  ///////////
@@ -127,6 +127,8 @@ extern "C" {
127
127
  #include <clapack.h>
128
128
  #endif
129
129
 
130
+ static VALUE nm_cblas_nrm2(VALUE self, VALUE n, VALUE x, VALUE incx);
131
+ static VALUE nm_cblas_asum(VALUE self, VALUE n, VALUE x, VALUE incx);
130
132
  static VALUE nm_cblas_rot(VALUE self, VALUE n, VALUE x, VALUE incx, VALUE y, VALUE incy, VALUE c, VALUE s);
131
133
  static VALUE nm_cblas_rotg(VALUE self, VALUE ab);
132
134
 
@@ -307,6 +309,8 @@ void nm_math_init_blas() {
307
309
 
308
310
  cNMatrix_BLAS = rb_define_module_under(cNMatrix, "BLAS");
309
311
 
312
+ rb_define_singleton_method(cNMatrix_BLAS, "cblas_nrm2", (METHOD)nm_cblas_nrm2, 3);
313
+ rb_define_singleton_method(cNMatrix_BLAS, "cblas_asum", (METHOD)nm_cblas_asum, 3);
310
314
  rb_define_singleton_method(cNMatrix_BLAS, "cblas_rot", (METHOD)nm_cblas_rot, 7);
311
315
  rb_define_singleton_method(cNMatrix_BLAS, "cblas_rotg", (METHOD)nm_cblas_rotg, 1);
312
316
 
@@ -515,6 +519,114 @@ static VALUE nm_cblas_rot(VALUE self, VALUE n, VALUE x, VALUE incx, VALUE y, VAL
515
519
  }
516
520
 
517
521
 
522
+ /*
523
+ * Call any of the cblas_xnrm2 functions as directly as possible.
524
+ *
525
+ * xNRM2 is a BLAS level 1 routine which calculates the 2-norm of an n-vector x.
526
+ *
527
+ * Arguments:
528
+ * * n :: length of x, must be at least 0
529
+ * * x :: pointer to first entry of input vector
530
+ * * incx :: stride of x, must be POSITIVE (ATLAS says non-zero, but 3.8.4 code only allows positive)
531
+ *
532
+ * You probably don't want to call this function. Instead, why don't you try nrm2, which is more flexible
533
+ * with its arguments?
534
+ *
535
+ * This function does almost no type checking. Seriously, be really careful when you call it! There's no exception
536
+ * handling, so you can easily crash Ruby!
537
+ */
538
+ static VALUE nm_cblas_nrm2(VALUE self, VALUE n, VALUE x, VALUE incx) {
539
+
540
+ static void (*ttable[nm::NUM_DTYPES])(const int N, const void* X, const int incX, void* sum) = {
541
+ /* nm::math::cblas_nrm2<uint8_t,uint8_t>,
542
+ nm::math::cblas_nrm2<int8_t,int8_t>,
543
+ nm::math::cblas_nrm2<int16_t,int16_t>,
544
+ nm::math::cblas_nrm2<int32_t,int32_t>, */
545
+ NULL, NULL, NULL, NULL, NULL, // no help for integers
546
+ nm::math::cblas_nrm2<float32_t,float32_t>,
547
+ nm::math::cblas_nrm2<float64_t,float64_t>,
548
+ nm::math::cblas_nrm2<float32_t,nm::Complex64>,
549
+ nm::math::cblas_nrm2<float64_t,nm::Complex128>,
550
+ nm::math::cblas_nrm2<nm::Rational32,nm::Rational32>,
551
+ nm::math::cblas_nrm2<nm::Rational64,nm::Rational64>,
552
+ nm::math::cblas_nrm2<nm::Rational128,nm::Rational128>,
553
+ nm::math::cblas_nrm2<nm::RubyObject,nm::RubyObject>
554
+ };
555
+
556
+ nm::dtype_t dtype = NM_DTYPE(x);
557
+
558
+ if (!ttable[dtype]) {
559
+ rb_raise(nm_eDataTypeError, "this vector operation undefined for integer vectors");
560
+ return Qnil;
561
+
562
+ } else {
563
+ // Determine the return dtype and allocate it
564
+ nm::dtype_t rdtype = dtype;
565
+ if (dtype == nm::COMPLEX64) rdtype = nm::FLOAT32;
566
+ else if (dtype == nm::COMPLEX128) rdtype = nm::FLOAT64;
567
+
568
+ void *Result = ALLOCA_N(char, DTYPE_SIZES[rdtype]);
569
+
570
+ ttable[dtype](FIX2INT(n), NM_STORAGE_DENSE(x)->elements, FIX2INT(incx), Result);
571
+
572
+ return rubyobj_from_cval(Result, rdtype).rval;
573
+ }
574
+ }
575
+
576
+
577
+
578
+ /*
579
+ * Call any of the cblas_xasum functions as directly as possible.
580
+ *
581
+ * xASUM is a BLAS level 1 routine which calculates the sum of absolute values of the entries
582
+ * of a vector x.
583
+ *
584
+ * Arguments:
585
+ * * n :: length of x, must be at least 0
586
+ * * x :: pointer to first entry of input vector
587
+ * * incx :: stride of x, must be POSITIVE (ATLAS says non-zero, but 3.8.4 code only allows positive)
588
+ *
589
+ * You probably don't want to call this function. Instead, why don't you try asum, which is more flexible
590
+ * with its arguments?
591
+ *
592
+ * This function does almost no type checking. Seriously, be really careful when you call it! There's no exception
593
+ * handling, so you can easily crash Ruby!
594
+ */
595
+ static VALUE nm_cblas_asum(VALUE self, VALUE n, VALUE x, VALUE incx) {
596
+
597
+ static void (*ttable[nm::NUM_DTYPES])(const int N, const void* X, const int incX, void* sum) = {
598
+ nm::math::cblas_asum<uint8_t,uint8_t>,
599
+ nm::math::cblas_asum<int8_t,int8_t>,
600
+ nm::math::cblas_asum<int16_t,int16_t>,
601
+ nm::math::cblas_asum<int32_t,int32_t>,
602
+ nm::math::cblas_asum<int64_t,int64_t>,
603
+ nm::math::cblas_asum<float32_t,float32_t>,
604
+ nm::math::cblas_asum<float64_t,float64_t>,
605
+ nm::math::cblas_asum<float32_t,nm::Complex64>,
606
+ nm::math::cblas_asum<float64_t,nm::Complex128>,
607
+ nm::math::cblas_asum<nm::Rational32,nm::Rational32>,
608
+ nm::math::cblas_asum<nm::Rational64,nm::Rational64>,
609
+ nm::math::cblas_asum<nm::Rational128,nm::Rational128>,
610
+ nm::math::cblas_asum<nm::RubyObject,nm::RubyObject>
611
+ };
612
+
613
+ nm::dtype_t dtype = NM_DTYPE(x);
614
+
615
+ // Determine the return dtype and allocate it
616
+ nm::dtype_t rdtype = dtype;
617
+ if (dtype == nm::COMPLEX64) rdtype = nm::FLOAT32;
618
+ else if (dtype == nm::COMPLEX128) rdtype = nm::FLOAT64;
619
+
620
+ void *Result = ALLOCA_N(char, DTYPE_SIZES[rdtype]);
621
+
622
+ ttable[dtype](FIX2INT(n), NM_STORAGE_DENSE(x)->elements, FIX2INT(incx), Result);
623
+
624
+ return rubyobj_from_cval(Result, rdtype).rval;
625
+ }
626
+
627
+
628
+
629
+
518
630
  /* Call any of the cblas_xgemm functions as directly as possible.
519
631
  *
520
632
  * The cblas_xgemm functions (dgemm, sgemm, cgemm, and zgemm) define the following operation:
@@ -1026,33 +1026,31 @@ inline bool gemv(const enum CBLAS_TRANSPOSE Trans, const int M, const int N, con
1026
1026
 
1027
1027
  // Yale: numeric matrix multiply c=a*b
1028
1028
  template <typename DType, typename IType>
1029
- inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const DType* a, const bool diaga,
1029
+ inline void numbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const DType* a, const bool diaga,
1030
1030
  const IType* ib, const IType* jb, const DType* b, const bool diagb, IType* ic, IType* jc, DType* c, const bool diagc) {
1031
- IType next[m];
1032
- DType sums[m];
1031
+ const unsigned int max_lmn = std::max(std::max(m, n), l);
1032
+ IType next[max_lmn];
1033
+ DType sums[max_lmn];
1033
1034
 
1034
1035
  DType v;
1035
1036
 
1036
1037
  IType head, length, temp, ndnz = 0;
1037
- IType jj_start, jj_end, kk_start, kk_end;
1038
- IType i, j, k, kk, jj;
1039
1038
  IType minmn = std::min(m,n);
1039
+ IType minlm = std::min(l,m);
1040
1040
 
1041
- for (i = 0; i < m; ++i) { // initialize scratch arrays
1042
- next[i] = std::numeric_limits<IType>::max();
1043
- sums[i] = 0;
1041
+ for (IType idx = 0; idx < max_lmn; ++idx) { // initialize scratch arrays
1042
+ next[idx] = std::numeric_limits<IType>::max();
1043
+ sums[idx] = 0;
1044
1044
  }
1045
1045
 
1046
- for (i = 0; i < n; ++i) { // walk down the rows
1046
+ for (IType i = 0; i < n; ++i) { // walk down the rows
1047
1047
  head = std::numeric_limits<IType>::max()-1; // head gets assigned as whichever column of B's row j we last visited
1048
1048
  length = 0;
1049
1049
 
1050
- jj_start = ia[i];
1051
- jj_end = ia[i+1];
1050
+ for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // walk through entries in each row
1051
+ IType j;
1052
1052
 
1053
- for (jj = jj_start; jj <= jj_end; ++jj) { // walk through entries in each row
1054
-
1055
- if (jj == jj_end) { // if we're in the last entry for this row:
1053
+ if (jj == ia[i+1]) { // if we're in the last entry for this row:
1056
1054
  if (!diaga || i >= minmn) continue;
1057
1055
  j = i; // if it's a new Yale matrix, and last entry, get the diagonal position (j) and entry (ajj)
1058
1056
  v = a[i];
@@ -1061,12 +1059,12 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
1061
1059
  v = a[jj];
1062
1060
  }
1063
1061
 
1064
- kk_start = ib[j]; // Find the first entry of row j of matrix B
1065
- kk_end = ib[j+1];
1066
- for (kk = kk_start; kk <= kk_end; ++kk) {
1062
+ for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) {
1063
+
1064
+ IType k;
1067
1065
 
1068
- if (kk == kk_end) { // Get the column id for that entry
1069
- if (!diagb || j >= minmn) continue;
1066
+ if (kk == ib[j+1]) { // Get the column id for that entry
1067
+ if (!diagb || j >= minlm) continue;
1070
1068
  k = j;
1071
1069
  sums[k] += v*b[k];
1072
1070
  } else {
@@ -1079,10 +1077,10 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
1079
1077
  head = k;
1080
1078
  ++length;
1081
1079
  }
1082
- }
1083
- }
1080
+ } // end of kk loop
1081
+ } // end of jj loop
1084
1082
 
1085
- for (jj = 0; jj < length; ++jj) {
1083
+ for (IType jj = 0; jj < length; ++jj) {
1086
1084
  if (sums[head] != 0) {
1087
1085
  if (diagc && head == i) {
1088
1086
  c[head] = sums[head];
@@ -1105,22 +1103,64 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
1105
1103
  } /* numbmm_ */
1106
1104
 
1107
1105
 
1106
+ /*
1107
+ template <typename DType, typename IType>
1108
+ inline void new_yale_matrix_multiply(const unsigned int m, const IType* ija, const DType* a, const IType* ijb, const DType* b, YALE_STORAGE* c_storage) {
1109
+ unsigned int n = c_storage->shape[0],
1110
+ l = c_storage->shape[1];
1111
+
1112
+ // Create a working vector of dimension max(m,l,n) and initial value IType::max():
1113
+ std::vector<IType> mask(std::max(std::max(m,l),n), std::numeric_limits<IType>::max());
1114
+
1115
+ for (IType i = 0; i < n; ++i) { // A.rows.each_index do |i|
1116
+
1117
+ IType j, k;
1118
+ size_t ndnz;
1119
+
1120
+ for (IType jj = ija[i]; jj <= ija[i+1]; ++jj) { // walk through column pointers for row i of A
1121
+ j = (jj == ija[i+1]) ? i : ija[jj]; // Get the current column index (handle diagonals last)
1122
+
1123
+ if (j >= m) {
1124
+ if (j == ija[jj]) rb_raise(rb_eIndexError, "ija array for left-hand matrix contains an out-of-bounds column index %u at position %u", jj, j);
1125
+ else break;
1126
+ }
1127
+
1128
+ for (IType kk = ijb[j]; kk <= ijb[j+1]; ++kk) { // walk through column pointers for row j of B
1129
+ if (j >= m) continue; // first of all, does B *have* a row j?
1130
+ k = (kk == ijb[j+1]) ? j : ijb[kk]; // Get the current column index (handle diagonals last)
1131
+
1132
+ if (k >= l) {
1133
+ if (k == ijb[kk]) rb_raise(rb_eIndexError, "ija array for right-hand matrix contains an out-of-bounds column index %u at position %u", kk, k);
1134
+ else break;
1135
+ }
1136
+
1137
+ if (mask[k] == )
1138
+ }
1139
+
1140
+ }
1141
+ }
1142
+ }
1143
+ */
1108
1144
 
1109
1145
  // Yale: Symbolic matrix multiply c=a*b
1110
1146
  template <typename IType>
1111
- inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const bool diaga,
1147
+ inline size_t symbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const bool diaga,
1112
1148
  const IType* ib, const IType* jb, const bool diagb, IType* ic, const bool diagc) {
1113
- IType mask[m];
1114
- IType j, k, ndnz = n; /* Local variables */
1149
+ unsigned int max_lmn = std::max(std::max(m,n), l);
1150
+ IType mask[max_lmn]; // INDEX in the SMMP paper.
1151
+ IType j, k; /* Local variables */
1152
+ size_t ndnz = n;
1115
1153
 
1154
+ for (IType idx = 0; idx < max_lmn; ++idx)
1155
+ mask[idx] = std::numeric_limits<IType>::max();
1116
1156
 
1117
- for (j = 0; j < m; ++j)
1118
- mask[j] = std::numeric_limits<IType>::max();
1119
-
1120
- if (diagc) ic[0] = n+1;
1121
- else ic[0] = 0;
1157
+ if (ic) { // Only write to ic if it's supplied; otherwise, we're just counting.
1158
+ if (diagc) ic[0] = n+1;
1159
+ else ic[0] = 0;
1160
+ }
1122
1161
 
1123
1162
  IType minmn = std::min(m,n);
1163
+ IType minlm = std::min(l,m);
1124
1164
 
1125
1165
  for (IType i = 0; i < n; ++i) { // MAIN LOOP: through rows
1126
1166
 
@@ -1132,9 +1172,9 @@ inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia,
1132
1172
  j = i;
1133
1173
  } else j = ja[jj];
1134
1174
 
1135
- for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns of row J in matrix B.
1175
+ for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns K of row J in matrix B.
1136
1176
  if (kk == ib[j+1]) {
1137
- if (!diagb || j >= minmn) continue;
1177
+ if (!diagb || j >= minlm) continue;
1138
1178
  k = j;
1139
1179
  } else k = jb[kk];
1140
1180
 
@@ -1145,65 +1185,138 @@ inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia,
1145
1185
  }
1146
1186
  }
1147
1187
 
1148
- if (diagc && !mask[i]) --ndnz;
1188
+ if (diagc && mask[i] == std::numeric_limits<IType>::max()) --ndnz;
1149
1189
 
1150
- ic[i+1] = ndnz;
1190
+ if (ic) ic[i+1] = ndnz;
1151
1191
  }
1152
- } /* symbmm_ */
1153
1192
 
1193
+ return ndnz;
1194
+ } /* symbmm_ */
1154
1195
 
1155
- //TODO: More efficient sorting algorithm than selection sort would be nice, probably.
1156
- // Remember, we're dealing with unique keys, which simplifies things.
1157
- // Doesn't have to be in-place, since we probably just multiplied and that wasn't in-place.
1158
- template <typename DType, typename IType>
1159
- inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
1160
- IType jj, min, min_jj;
1161
- DType temp_val;
1162
1196
 
1163
- for (size_t i = 0; i < n; ++i) {
1164
- // No need to sort if there are 0 or 1 entries in the row
1165
- if (ia[i+1] - ia[i] < 2) continue;
1197
+ // In-place quicksort (from Wikipedia) -- called by smmp_sort_columns, below. All functions are inclusive of left, right.
1198
namespace smmp_sort {
  // Ranges with (right - left) below this value are handed to insertion sort.
  const size_t THRESHOLD = 4;

  /*
   * Debugging aid: writes "column:value" pairs for positions left..right
   * (inclusive) to std::cerr, followed by a newline.
   */
  template <typename DType, typename IType>
  void print_array(DType* vals, IType* array, IType left, IType right) {
    for (IType i = left; i <= right; ++i) {
      std::cerr << array[i] << ":" << vals[i] << " ";
    }
    std::cerr << std::endl;
  }

  /*
   * Partition step of quicksort over the inclusive range [left, right].
   *
   * The key at position `pivot` is moved to the end of the range, every key
   * less than or equal to it is swapped to the front, and the pivot key is
   * finally placed between the two groups. Entries of `vals` are moved in
   * lock-step with the keys in `array`.
   *
   * Returns the final position of the pivot key.
   */
  template <typename DType, typename IType>
  IType partition(DType* vals, IType* array, IType left, IType right, IType pivot) {
    const IType pivotJ = array[pivot];

    // Park the pivot pair at the right edge of the range.
    std::swap(array[pivot], array[right]);
    std::swap(vals[pivot],  vals[right]);

    IType store = left;
    for (IType idx = left; idx < right; ++idx) {
      if (array[idx] <= pivotJ) {
        std::swap(array[idx], array[store]);
        std::swap(vals[idx],  vals[store]);
        ++store;
      }
    }

    // Drop the pivot pair into its final slot.
    std::swap(array[store], array[right]);
    std::swap(vals[store],  vals[right]);

    return store;
  }

  /*
   * Returns the median of the three arguments (compared by value).
   */
  template <typename IType>
  IType median(IType a, IType b, IType c) {
    if (a < b) {
      if (b < c) return b; // a b c
      if (a < c) return c; // a c b
      return a;            // c a b
    } else {               // a >= b
      if (a < c) return a; // b a c
      if (b < c) return c; // b c a
      return b;            // c b a
    }
  }

  /*
   * Insertion sort of the inclusive range [left, right], ordering by the keys
   * in `array` and moving the matching entries of `vals` along with them.
   */
  template <typename DType, typename IType>
  void insertion_sort(DType* vals, IType* array, IType left, IType right) {
    for (IType idx = left; idx <= right; ++idx) {
      const IType col_to_insert = array[idx];
      const DType val_to_insert = vals[idx];

      IType hole_pos = idx;
      for (; hole_pos > left && col_to_insert < array[hole_pos-1]; --hole_pos) {
        array[hole_pos] = array[hole_pos - 1]; // shift the larger column index up
        vals[hole_pos]  = vals[hole_pos - 1];  // value goes along with it
      }

      array[hole_pos] = col_to_insert;
      vals[hole_pos]  = val_to_insert;
    }
  }

  /*
   * In-place quicksort of the inclusive range [left, right], ordering by the
   * keys in `array` and keeping `vals` paired with them. Short ranges fall
   * back to insertion_sort.
   */
  template <typename DType, typename IType>
  void quicksort(DType* vals, IType* array, IType left, IType right) {
    if (!(left < right)) return;

    if (right - left < THRESHOLD) {
      insertion_sort(vals, array, left, right);
      return;
    }

    // Midpoint computed without overflowing IType. Note that median() is
    // applied to the *positions*, so for left < right it yields the midpoint.
    const IType mid = left + (right - left) / 2;
    IType pivot = median(left, right, mid);
    pivot = partition(vals, array, left, right, pivot);

    // Sort the keys below the pivot. The guard matters when the pivot ends up
    // at `left`: with an unsigned IType and left == 0, `pivot-1` would wrap
    // around to the largest representable index and run far out of bounds.
    if (pivot > left) quicksort<DType,IType>(vals, array, left, pivot-1);

    // Sort the keys above the pivot.
    quicksort<DType,IType>(vals, array, pivot+1, right);
  }

} // end of namespace smmp_sort
1293
+
1294
+
1295
+ /*
1296
+ * For use following symbmm and numbmm. Sorts the matrix entries in each row according to the column index.
1297
+ * This utilizes quicksort, which is an in-place unstable sort (since there are no duplicate entries, we don't care
1298
+ * about stability).
1299
+ *
1300
+ * TODO: It might be worthwhile to do a test for free memory, and if available, use an unstable sort that isn't in-place.
1301
+ *
1302
+ * TODO: It's actually probably possible to write an even faster sort, since symbmm/numbmm are not producing a random
1303
+ * ordering. If someone is doing a lot of Yale matrix multiplication, it might benefit them to consider even insertion
1304
+ * sort.
1305
+ */
1306
+ template <typename DType, typename IType>
1307
+ inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
1308
+ for (size_t i = 0; i < n; ++i) {
1309
+ if (ia[i+1] - ia[i] < 2) continue; // no need to sort rows containing only one or two elements.
1310
+ else if (ia[i+1] - ia[i] <= smmp_sort::THRESHOLD) {
1311
+ smmp_sort::insertion_sort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for small rows
1312
+ } else {
1313
+ smmp_sort::quicksort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for large rows (and may call insertion_sort as well)
1314
+ }
1315
+ }
1204
1316
  }
1205
1317
 
1206
1318
 
1319
+
1207
1320
  /*
1208
1321
  * Transposes a generic Yale matrix (old or new). Specify new by setting diaga = true.
1209
1322
  *
@@ -2025,7 +2138,194 @@ inline void rot(const int N, Complex128* X, const int incX, Complex128* Y, const
2025
2138
 
2026
2139
  template <typename DType, typename CSDType>
2027
2140
  inline void cblas_rot(const int N, void* X, const int incX, void* Y, const int incY, const void* c, const void* s) {
2028
- rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY, *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
2141
+ rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY,
2142
+ *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
2143
+ }
2144
+
2145
+ /*
2146
+ * Level 1 BLAS routine which returns the 2-norm of an n-vector x.
2147
+ #
2148
+ * Based on input types, these are the valid return types:
2149
+ * int -> int
2150
+ * float -> float or double
2151
+ * double -> double
2152
+ * complex64 -> float or double
2153
+ * complex128 -> double
2154
+ * rational -> rational
2155
+ */
2156
+ template <typename ReturnDType, typename DType>
2157
+ ReturnDType nrm2(const int N, const DType* X, const int incX) {
2158
+ const DType ONE = 1, ZERO = 0;
2159
+ typename LongDType<DType>::type scale = 0, ssq = 1, absxi, temp;
2160
+
2161
+
2162
+ if ((N < 1) || (incX < 1)) return ZERO;
2163
+ else if (N == 1) return std::abs(X[0]);
2164
+
2165
+ for (int i = 0; i < N; ++i) {
2166
+ absxi = std::abs(X[i*incX]);
2167
+ if (scale < absxi) {
2168
+ temp = scale / absxi;
2169
+ scale = absxi;
2170
+ ssq = ONE + ssq * (temp * temp);
2171
+ } else {
2172
+ temp = absxi / scale;
2173
+ ssq += temp * temp;
2174
+ }
2175
+ }
2176
+
2177
+ return scale * std::sqrt( ssq );
2178
+ }
2179
+
2180
+
2181
+ #ifdef HAVE_CBLAS_H
2182
+ template <>
2183
+ inline float nrm2(const int N, const float* X, const int incX) {
2184
+ return cblas_snrm2(N, X, incX);
2185
+ }
2186
+
2187
+ template <>
2188
+ inline double nrm2(const int N, const double* X, const int incX) {
2189
+ return cblas_dnrm2(N, X, incX);
2190
+ }
2191
+
2192
+ template <>
2193
+ inline float nrm2(const int N, const Complex64* X, const int incX) {
2194
+ return cblas_scnrm2(N, X, incX);
2195
+ }
2196
+
2197
+ template <>
2198
+ inline double nrm2(const int N, const Complex128* X, const int incX) {
2199
+ return cblas_dznrm2(N, X, incX);
2200
+ }
2201
+ #else
2202
/*
 * Folds one complex entry, given as its real part xr and imaginary part xi,
 * into a running scaled sum of squares.
 *
 * `scale` holds the largest component magnitude seen so far and `ssq` the sum
 * of squares relative to it, so the norm accumulated so far is
 * scale * sqrt(ssq). Both are updated in place.
 *
 * Zero components are skipped: they add nothing to the norm, and while
 * `scale` is still zero they would otherwise cause a 0/0 division.
 */
template <typename FloatDType>
static inline void nrm2_complex_helper(const FloatDType& xr, const FloatDType& xi, double& scale, double& ssq) {
  // Same update for the real component, then the imaginary component.
  const double parts[2] = { std::abs(xr), std::abs(xi) };

  for (int p = 0; p < 2; ++p) {
    const double absx = parts[p];
    if (absx == 0.0) continue;

    if (scale < absx) {
      // New largest magnitude: rescale the running sum to it.
      const double temp = scale / absx;
      scale = absx;
      ssq   = 1.0 + ssq * (temp * temp);
    } else {
      // absx != 0 and scale >= absx, so scale is non-zero here.
      const double temp = absx / scale;
      ssq += temp * temp;
    }
  }
}
2224
+
2225
+ template <>
2226
+ float nrm2(const int N, const Complex64* X, const int incX) {
2227
+ double scale = 0, ssq = 1, temp;
2228
+
2229
+ if ((N < 1) || (incX < 1)) return 0.0;
2230
+
2231
+ for (int i = 0; i < N; ++i) {
2232
+ nrm2_complex_helper<float>(X[i*incX].r, X[i*incX].i, scale, temp);
2233
+ }
2234
+
2235
+ return scale * std::sqrt( ssq );
2236
+ }
2237
+
2238
+ template <>
2239
+ double nrm2(const int N, const Complex128* X, const int incX) {
2240
+ double scale = 0, ssq = 1, temp;
2241
+
2242
+ if ((N < 1) || (incX < 1)) return 0.0;
2243
+
2244
+ for (int i = 0; i < N; ++i) {
2245
+ nrm2_complex_helper<double>(X[i*incX].r, X[i*incX].i, scale, temp);
2246
+ }
2247
+
2248
+ return scale * std::sqrt( ssq );
2249
+ }
2250
+ #endif
2251
+
2252
+ template <typename ReturnDType, typename DType>
2253
+ inline void cblas_nrm2(const int N, const void* X, const int incX, void* result) {
2254
+ *reinterpret_cast<ReturnDType*>( result ) = nrm2<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
2255
+ }
2256
+
2257
+ /*
2258
+ * Level 1 BLAS routine which sums the absolute values of a vector's contents. If the vector consists of complex values,
2259
+ * the routine sums the absolute values of the real and imaginary components as well.
2260
+ *
2261
+ * So, based on input types, these are the valid return types:
2262
+ * int -> int
2263
+ * float -> float or double
2264
+ * double -> double
2265
+ * complex64 -> float or double
2266
+ * complex128 -> double
2267
+ * rational -> rational
2268
+ */
2269
/*
 * Generic sum of absolute values of the N entries X[0], X[incX], X[2*incX], ...
 * Returns zero when N or incX is not positive.
 */
template <typename ReturnDType, typename DType>
inline ReturnDType asum(const int N, const DType* X, const int incX) {
  ReturnDType total = 0;

  // Nothing to add for an empty vector or a non-positive stride.
  if ((N <= 0) || (incX <= 0)) return total;

  for (int k = 0; k != N; ++k) {
    total += std::abs(X[k*incX]);
  }

  return total;
}
2279
+
2280
+
2281
+ #ifdef HAVE_CBLAS_H
2282
+ template <>
2283
+ inline float asum(const int N, const float* X, const int incX) {
2284
+ return cblas_sasum(N, X, incX);
2285
+ }
2286
+
2287
+ template <>
2288
+ inline double asum(const int N, const double* X, const int incX) {
2289
+ return cblas_dasum(N, X, incX);
2290
+ }
2291
+
2292
+ template <>
2293
+ inline float asum(const int N, const Complex64* X, const int incX) {
2294
+ return cblas_scasum(N, X, incX);
2295
+ }
2296
+
2297
+ template <>
2298
+ inline double asum(const int N, const Complex128* X, const int incX) {
2299
+ return cblas_dzasum(N, X, incX);
2300
+ }
2301
+ #else
2302
+ template <>
2303
+ inline float asum(const int N, const Complex64* X, const int incX) {
2304
+ float sum = 0;
2305
+ if ((N > 0) && (incX > 0)) {
2306
+ for (int i = 0; i < N; ++i) {
2307
+ sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
2308
+ }
2309
+ }
2310
+ return sum;
2311
+ }
2312
+
2313
+ template <>
2314
+ inline double asum(const int N, const Complex128* X, const int incX) {
2315
+ double sum = 0;
2316
+ if ((N > 0) && (incX > 0)) {
2317
+ for (int i = 0; i < N; ++i) {
2318
+ sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
2319
+ }
2320
+ }
2321
+ return sum;
2322
+ }
2323
+ #endif
2324
+
2325
+
2326
+ template <typename ReturnDType, typename DType>
2327
+ inline void cblas_asum(const int N, const void* X, const int incX, void* sum) {
2328
+ *reinterpret_cast<ReturnDType*>( sum ) = asum<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
2029
2329
  }
2030
2330
 
2031
2331