nmatrix 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -102,6 +102,9 @@ extern "C" {
102
102
  void* nm_yale_storage_ref(STORAGE* s, SLICE* slice);
103
103
  char nm_yale_storage_set(STORAGE* storage, SLICE* slice, void* v);
104
104
 
105
+ //char nm_yale_storage_vector_insert(YALE_STORAGE* s, size_t pos, size_t* js, void* vals, size_t n, bool struct_only, nm::dtype_t dtype, nm::itype_t itype);
106
+ //void nm_yale_storage_increment_ia_after(YALE_STORAGE* s, size_t ija_size, size_t i, size_t n);
107
+
105
108
  size_t nm_yale_storage_get_size(const YALE_STORAGE* storage);
106
109
 
107
110
  ///////////
@@ -127,6 +127,8 @@ extern "C" {
127
127
  #include <clapack.h>
128
128
  #endif
129
129
 
130
+ static VALUE nm_cblas_nrm2(VALUE self, VALUE n, VALUE x, VALUE incx);
131
+ static VALUE nm_cblas_asum(VALUE self, VALUE n, VALUE x, VALUE incx);
130
132
  static VALUE nm_cblas_rot(VALUE self, VALUE n, VALUE x, VALUE incx, VALUE y, VALUE incy, VALUE c, VALUE s);
131
133
  static VALUE nm_cblas_rotg(VALUE self, VALUE ab);
132
134
 
@@ -307,6 +309,8 @@ void nm_math_init_blas() {
307
309
 
308
310
  cNMatrix_BLAS = rb_define_module_under(cNMatrix, "BLAS");
309
311
 
312
+ rb_define_singleton_method(cNMatrix_BLAS, "cblas_nrm2", (METHOD)nm_cblas_nrm2, 3);
313
+ rb_define_singleton_method(cNMatrix_BLAS, "cblas_asum", (METHOD)nm_cblas_asum, 3);
310
314
  rb_define_singleton_method(cNMatrix_BLAS, "cblas_rot", (METHOD)nm_cblas_rot, 7);
311
315
  rb_define_singleton_method(cNMatrix_BLAS, "cblas_rotg", (METHOD)nm_cblas_rotg, 1);
312
316
 
@@ -515,6 +519,114 @@ static VALUE nm_cblas_rot(VALUE self, VALUE n, VALUE x, VALUE incx, VALUE y, VAL
515
519
  }
516
520
 
517
521
 
522
+ /*
523
+ * Call any of the cblas_xnrm2 functions as directly as possible.
524
+ *
525
+ * xNRM2 is a BLAS level 1 routine which calculates the 2-norm of an n-vector x.
526
+ *
527
+ * Arguments:
528
+ * * n :: length of x, must be at least 0
529
+ * * x :: pointer to first entry of input vector
530
+ * * incx :: stride of x, must be POSITIVE (ATLAS says non-zero, but 3.8.4 code only allows positive)
531
+ *
532
+ * You probably don't want to call this function. Instead, why don't you try nrm2, which is more flexible
533
+ * with its arguments?
534
+ *
535
+ * This function does almost no type checking. Seriously, be really careful when you call it! There's no exception
536
+ * handling, so you can easily crash Ruby!
537
+ */
538
+ static VALUE nm_cblas_nrm2(VALUE self, VALUE n, VALUE x, VALUE incx) {
539
+
540
+ static void (*ttable[nm::NUM_DTYPES])(const int N, const void* X, const int incX, void* sum) = {
541
+ /* nm::math::cblas_nrm2<uint8_t,uint8_t>,
542
+ nm::math::cblas_nrm2<int8_t,int8_t>,
543
+ nm::math::cblas_nrm2<int16_t,int16_t>,
544
+ nm::math::cblas_nrm2<int32_t,int32_t>, */
545
+ NULL, NULL, NULL, NULL, NULL, // no help for integers
546
+ nm::math::cblas_nrm2<float32_t,float32_t>,
547
+ nm::math::cblas_nrm2<float64_t,float64_t>,
548
+ nm::math::cblas_nrm2<float32_t,nm::Complex64>,
549
+ nm::math::cblas_nrm2<float64_t,nm::Complex128>,
550
+ nm::math::cblas_nrm2<nm::Rational32,nm::Rational32>,
551
+ nm::math::cblas_nrm2<nm::Rational64,nm::Rational64>,
552
+ nm::math::cblas_nrm2<nm::Rational128,nm::Rational128>,
553
+ nm::math::cblas_nrm2<nm::RubyObject,nm::RubyObject>
554
+ };
555
+
556
+ nm::dtype_t dtype = NM_DTYPE(x);
557
+
558
+ if (!ttable[dtype]) {
559
+ rb_raise(nm_eDataTypeError, "this vector operation undefined for integer vectors");
560
+ return Qnil;
561
+
562
+ } else {
563
+ // Determine the return dtype and allocate it
564
+ nm::dtype_t rdtype = dtype;
565
+ if (dtype == nm::COMPLEX64) rdtype = nm::FLOAT32;
566
+ else if (dtype == nm::COMPLEX128) rdtype = nm::FLOAT64;
567
+
568
+ void *Result = ALLOCA_N(char, DTYPE_SIZES[rdtype]);
569
+
570
+ ttable[dtype](FIX2INT(n), NM_STORAGE_DENSE(x)->elements, FIX2INT(incx), Result);
571
+
572
+ return rubyobj_from_cval(Result, rdtype).rval;
573
+ }
574
+ }
575
+
576
+
577
+
578
+ /*
579
+ * Call any of the cblas_xasum functions as directly as possible.
580
+ *
581
+ * xASUM is a BLAS level 1 routine which calculates the sum of absolute values of the entries
582
+ * of a vector x.
583
+ *
584
+ * Arguments:
585
+ * * n :: length of x, must be at least 0
586
+ * * x :: pointer to first entry of input vector
587
+ * * incx :: stride of x, must be POSITIVE (ATLAS says non-zero, but 3.8.4 code only allows positive)
588
+ *
589
+ * You probably don't want to call this function. Instead, why don't you try asum, which is more flexible
590
+ * with its arguments?
591
+ *
592
+ * This function does almost no type checking. Seriously, be really careful when you call it! There's no exception
593
+ * handling, so you can easily crash Ruby!
594
+ */
595
+ static VALUE nm_cblas_asum(VALUE self, VALUE n, VALUE x, VALUE incx) {
596
+
597
+ static void (*ttable[nm::NUM_DTYPES])(const int N, const void* X, const int incX, void* sum) = {
598
+ nm::math::cblas_asum<uint8_t,uint8_t>,
599
+ nm::math::cblas_asum<int8_t,int8_t>,
600
+ nm::math::cblas_asum<int16_t,int16_t>,
601
+ nm::math::cblas_asum<int32_t,int32_t>,
602
+ nm::math::cblas_asum<int64_t,int64_t>,
603
+ nm::math::cblas_asum<float32_t,float32_t>,
604
+ nm::math::cblas_asum<float64_t,float64_t>,
605
+ nm::math::cblas_asum<float32_t,nm::Complex64>,
606
+ nm::math::cblas_asum<float64_t,nm::Complex128>,
607
+ nm::math::cblas_asum<nm::Rational32,nm::Rational32>,
608
+ nm::math::cblas_asum<nm::Rational64,nm::Rational64>,
609
+ nm::math::cblas_asum<nm::Rational128,nm::Rational128>,
610
+ nm::math::cblas_asum<nm::RubyObject,nm::RubyObject>
611
+ };
612
+
613
+ nm::dtype_t dtype = NM_DTYPE(x);
614
+
615
+ // Determine the return dtype and allocate it
616
+ nm::dtype_t rdtype = dtype;
617
+ if (dtype == nm::COMPLEX64) rdtype = nm::FLOAT32;
618
+ else if (dtype == nm::COMPLEX128) rdtype = nm::FLOAT64;
619
+
620
+ void *Result = ALLOCA_N(char, DTYPE_SIZES[rdtype]);
621
+
622
+ ttable[dtype](FIX2INT(n), NM_STORAGE_DENSE(x)->elements, FIX2INT(incx), Result);
623
+
624
+ return rubyobj_from_cval(Result, rdtype).rval;
625
+ }
626
+
627
+
628
+
629
+
518
630
  /* Call any of the cblas_xgemm functions as directly as possible.
519
631
  *
520
632
  * The cblas_xgemm functions (dgemm, sgemm, cgemm, and zgemm) define the following operation:
@@ -1026,33 +1026,31 @@ inline bool gemv(const enum CBLAS_TRANSPOSE Trans, const int M, const int N, con
1026
1026
 
1027
1027
  // Yale: numeric matrix multiply c=a*b
1028
1028
  template <typename DType, typename IType>
1029
- inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const DType* a, const bool diaga,
1029
+ inline void numbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const DType* a, const bool diaga,
1030
1030
  const IType* ib, const IType* jb, const DType* b, const bool diagb, IType* ic, IType* jc, DType* c, const bool diagc) {
1031
- IType next[m];
1032
- DType sums[m];
1031
+ const unsigned int max_lmn = std::max(std::max(m, n), l);
1032
+ IType next[max_lmn];
1033
+ DType sums[max_lmn];
1033
1034
 
1034
1035
  DType v;
1035
1036
 
1036
1037
  IType head, length, temp, ndnz = 0;
1037
- IType jj_start, jj_end, kk_start, kk_end;
1038
- IType i, j, k, kk, jj;
1039
1038
  IType minmn = std::min(m,n);
1039
+ IType minlm = std::min(l,m);
1040
1040
 
1041
- for (i = 0; i < m; ++i) { // initialize scratch arrays
1042
- next[i] = std::numeric_limits<IType>::max();
1043
- sums[i] = 0;
1041
+ for (IType idx = 0; idx < max_lmn; ++idx) { // initialize scratch arrays
1042
+ next[idx] = std::numeric_limits<IType>::max();
1043
+ sums[idx] = 0;
1044
1044
  }
1045
1045
 
1046
- for (i = 0; i < n; ++i) { // walk down the rows
1046
+ for (IType i = 0; i < n; ++i) { // walk down the rows
1047
1047
  head = std::numeric_limits<IType>::max()-1; // head gets assigned as whichever column of B's row j we last visited
1048
1048
  length = 0;
1049
1049
 
1050
- jj_start = ia[i];
1051
- jj_end = ia[i+1];
1050
+ for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // walk through entries in each row
1051
+ IType j;
1052
1052
 
1053
- for (jj = jj_start; jj <= jj_end; ++jj) { // walk through entries in each row
1054
-
1055
- if (jj == jj_end) { // if we're in the last entry for this row:
1053
+ if (jj == ia[i+1]) { // if we're in the last entry for this row:
1056
1054
  if (!diaga || i >= minmn) continue;
1057
1055
  j = i; // if it's a new Yale matrix, and last entry, get the diagonal position (j) and entry (ajj)
1058
1056
  v = a[i];
@@ -1061,12 +1059,12 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
1061
1059
  v = a[jj];
1062
1060
  }
1063
1061
 
1064
- kk_start = ib[j]; // Find the first entry of row j of matrix B
1065
- kk_end = ib[j+1];
1066
- for (kk = kk_start; kk <= kk_end; ++kk) {
1062
+ for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) {
1063
+
1064
+ IType k;
1067
1065
 
1068
- if (kk == kk_end) { // Get the column id for that entry
1069
- if (!diagb || j >= minmn) continue;
1066
+ if (kk == ib[j+1]) { // Get the column id for that entry
1067
+ if (!diagb || j >= minlm) continue;
1070
1068
  k = j;
1071
1069
  sums[k] += v*b[k];
1072
1070
  } else {
@@ -1079,10 +1077,10 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
1079
1077
  head = k;
1080
1078
  ++length;
1081
1079
  }
1082
- }
1083
- }
1080
+ } // end of kk loop
1081
+ } // end of jj loop
1084
1082
 
1085
- for (jj = 0; jj < length; ++jj) {
1083
+ for (IType jj = 0; jj < length; ++jj) {
1086
1084
  if (sums[head] != 0) {
1087
1085
  if (diagc && head == i) {
1088
1086
  c[head] = sums[head];
@@ -1105,22 +1103,64 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
1105
1103
  } /* numbmm_ */
1106
1104
 
1107
1105
 
1106
+ /*
1107
+ template <typename DType, typename IType>
1108
+ inline void new_yale_matrix_multiply(const unsigned int m, const IType* ija, const DType* a, const IType* ijb, const DType* b, YALE_STORAGE* c_storage) {
1109
+ unsigned int n = c_storage->shape[0],
1110
+ l = c_storage->shape[1];
1111
+
1112
+ // Create a working vector of dimension max(m,l,n) and initial value IType::max():
1113
+ std::vector<IType> mask(std::max(std::max(m,l),n), std::numeric_limits<IType>::max());
1114
+
1115
+ for (IType i = 0; i < n; ++i) { // A.rows.each_index do |i|
1116
+
1117
+ IType j, k;
1118
+ size_t ndnz;
1119
+
1120
+ for (IType jj = ija[i]; jj <= ija[i+1]; ++jj) { // walk through column pointers for row i of A
1121
+ j = (jj == ija[i+1]) ? i : ija[jj]; // Get the current column index (handle diagonals last)
1122
+
1123
+ if (j >= m) {
1124
+ if (j == ija[jj]) rb_raise(rb_eIndexError, "ija array for left-hand matrix contains an out-of-bounds column index %u at position %u", jj, j);
1125
+ else break;
1126
+ }
1127
+
1128
+ for (IType kk = ijb[j]; kk <= ijb[j+1]; ++kk) { // walk through column pointers for row j of B
1129
+ if (j >= m) continue; // first of all, does B *have* a row j?
1130
+ k = (kk == ijb[j+1]) ? j : ijb[kk]; // Get the current column index (handle diagonals last)
1131
+
1132
+ if (k >= l) {
1133
+ if (k == ijb[kk]) rb_raise(rb_eIndexError, "ija array for right-hand matrix contains an out-of-bounds column index %u at position %u", kk, k);
1134
+ else break;
1135
+ }
1136
+
1137
+ if (mask[k] == )
1138
+ }
1139
+
1140
+ }
1141
+ }
1142
+ }
1143
+ */
1108
1144
 
1109
1145
  // Yale: Symbolic matrix multiply c=a*b
1110
1146
  template <typename IType>
1111
- inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const bool diaga,
1147
+ inline size_t symbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const bool diaga,
1112
1148
  const IType* ib, const IType* jb, const bool diagb, IType* ic, const bool diagc) {
1113
- IType mask[m];
1114
- IType j, k, ndnz = n; /* Local variables */
1149
+ unsigned int max_lmn = std::max(std::max(m,n), l);
1150
+ IType mask[max_lmn]; // INDEX in the SMMP paper.
1151
+ IType j, k; /* Local variables */
1152
+ size_t ndnz = n;
1115
1153
 
1154
+ for (IType idx = 0; idx < max_lmn; ++idx)
1155
+ mask[idx] = std::numeric_limits<IType>::max();
1116
1156
 
1117
- for (j = 0; j < m; ++j)
1118
- mask[j] = std::numeric_limits<IType>::max();
1119
-
1120
- if (diagc) ic[0] = n+1;
1121
- else ic[0] = 0;
1157
+ if (ic) { // Only write to ic if it's supplied; otherwise, we're just counting.
1158
+ if (diagc) ic[0] = n+1;
1159
+ else ic[0] = 0;
1160
+ }
1122
1161
 
1123
1162
  IType minmn = std::min(m,n);
1163
+ IType minlm = std::min(l,m);
1124
1164
 
1125
1165
  for (IType i = 0; i < n; ++i) { // MAIN LOOP: through rows
1126
1166
 
@@ -1132,9 +1172,9 @@ inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia,
1132
1172
  j = i;
1133
1173
  } else j = ja[jj];
1134
1174
 
1135
- for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns of row J in matrix B.
1175
+ for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns K of row J in matrix B.
1136
1176
  if (kk == ib[j+1]) {
1137
- if (!diagb || j >= minmn) continue;
1177
+ if (!diagb || j >= minlm) continue;
1138
1178
  k = j;
1139
1179
  } else k = jb[kk];
1140
1180
 
@@ -1145,65 +1185,138 @@ inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia,
1145
1185
  }
1146
1186
  }
1147
1187
 
1148
- if (diagc && !mask[i]) --ndnz;
1188
+ if (diagc && mask[i] == std::numeric_limits<IType>::max()) --ndnz;
1149
1189
 
1150
- ic[i+1] = ndnz;
1190
+ if (ic) ic[i+1] = ndnz;
1151
1191
  }
1152
- } /* symbmm_ */
1153
1192
 
1193
+ return ndnz;
1194
+ } /* symbmm_ */
1154
1195
 
1155
- //TODO: More efficient sorting algorithm than selection sort would be nice, probably.
1156
- // Remember, we're dealing with unique keys, which simplifies things.
1157
- // Doesn't have to be in-place, since we probably just multiplied and that wasn't in-place.
1158
- template <typename DType, typename IType>
1159
- inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
1160
- IType jj, min, min_jj;
1161
- DType temp_val;
1162
1196
 
1163
- for (size_t i = 0; i < n; ++i) {
1164
- // No need to sort if there are 0 or 1 entries in the row
1165
- if (ia[i+1] - ia[i] < 2) continue;
1197
// In-place quicksort (from Wikipedia) -- called by smmp_sort_columns, below. All functions are inclusive of left, right.
//
// Every routine here reorders two parallel arrays together: `array` holds the sort keys (column indices)
// and `vals` holds the value attached to each key, so a key and its value always move as a pair.
namespace smmp_sort {
  // quicksort hands any range with right - left < THRESHOLD to insertion_sort.
  const size_t THRESHOLD = 4;

  // Debugging aid: writes "key:value" pairs for positions left..right to std::cerr on one line.
  template <typename DType, typename IType>
  void print_array(DType* vals, IType* array, IType left, IType right) {
    for (IType i = left; i <= right; ++i) {
      std::cerr << array[i] << ":" << vals[i] << " ";
    }
    std::cerr << std::endl;
  }

  // Partitions positions left..right around the key stored at index `pivot`.
  // On return, every key <= the pivot key sits before the returned index, the pivot pair sits at the
  // returned index, and every larger key sits after it.
  template <typename DType, typename IType>
  IType partition(DType* vals, IType* array, IType left, IType right, IType pivot) {
    const IType pivotJ = array[pivot];

    // Park the pivot pair at the right end while the rest of the range is scanned.
    std::swap(array[pivot], array[right]);
    std::swap(vals[pivot],  vals[right]);

    IType store = left;
    for (IType idx = left; idx < right; ++idx) {
      if (array[idx] <= pivotJ) {
        std::swap(array[idx], array[store]);
        std::swap(vals[idx],  vals[store]);
        ++store;
      }
    }

    // Drop the pivot pair into its final slot.
    std::swap(array[store], array[right]);
    std::swap(vals[store],  vals[right]);

    return store;
  }

  // Returns the median of three values.
  template <typename IType>
  IType median(IType a, IType b, IType c) {
    if (a < b) {
      if (b < c) return b; // a b c
      if (a < c) return c; // a c b
      return a;            // c a b

    } else { // a >= b
      if (a < c) return a; // b a c
      if (b < c) return c; // b c a
      return b;            // c b a
    }
  }

  // Insertion sort over positions left..right; more efficient than quicksort for small N.
  template <typename DType, typename IType>
  void insertion_sort(DType* vals, IType* array, IType left, IType right) {
    for (IType idx = left; idx <= right; ++idx) {
      const IType col_to_insert = array[idx];
      const DType val_to_insert = vals[idx];

      IType hole_pos = idx;
      for (; hole_pos > left && col_to_insert < array[hole_pos-1]; --hole_pos) {
        array[hole_pos] = array[hole_pos - 1]; // shift the larger column index up
        vals[hole_pos]  = vals[hole_pos - 1];  // value goes along with it
      }

      array[hole_pos] = col_to_insert;
      vals[hole_pos]  = val_to_insert;
    }
  }

  // Sorts positions left..right by key. Short ranges are delegated to insertion_sort.
  template <typename DType, typename IType>
  void quicksort(DType* vals, IType* array, IType left, IType right) {
    if (!(left < right)) return; // zero or one element: already sorted

    if (right - left < THRESHOLD) {
      insertion_sort(vals, array, left, right);
      return;
    }

    // left + half the span cannot overflow IType, unlike left + right.
    const IType mid = left + (right - left) / 2;

    // Note: the median is taken over the three *indices*, which for left < right is the midpoint.
    IType pivot = median(left, right, mid);
    pivot = partition(vals, array, left, right, pivot);

    // Recursively sort elements smaller than the pivot. When the pivot landed on `left` there is
    // nothing below it, and pivot-1 would wrap around for an unsigned IType, so skip the call.
    if (pivot > left) quicksort<DType,IType>(vals, array, left, pivot-1);

    // Recursively sort elements at least as big as the pivot. pivot <= right always holds here,
    // so pivot+1 is at most right+1 and the callee's left < right test rejects the empty range.
    quicksort<DType,IType>(vals, array, pivot+1, right);
  }

} // end of namespace smmp_sort
1293
+
1294
+
1295
+ /*
1296
+ * For use following symbmm and numbmm. Sorts the matrix entries in each row according to the column index.
1297
+ * This utilizes quicksort, which is an in-place unstable sort (since there are no duplicate entries, we don't care
1298
+ * about stability).
1299
+ *
1300
+ * TODO: It might be worthwhile to do a test for free memory, and if available, use an unstable sort that isn't in-place.
1301
+ *
1302
+ * TODO: It's actually probably possible to write an even faster sort, since symbmm/numbmm are not producing a random
1303
+ * ordering. If someone is doing a lot of Yale matrix multiplication, it might benefit them to consider even insertion
1304
+ * sort.
1305
+ */
1306
+ template <typename DType, typename IType>
1307
+ inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
1308
+ for (size_t i = 0; i < n; ++i) {
1309
+ if (ia[i+1] - ia[i] < 2) continue; // no need to sort rows containing only one or two elements.
1310
+ else if (ia[i+1] - ia[i] <= smmp_sort::THRESHOLD) {
1311
+ smmp_sort::insertion_sort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for small rows
1312
+ } else {
1313
+ smmp_sort::quicksort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for large rows (and may call insertion_sort as well)
1314
+ }
1315
+ }
1204
1316
  }
1205
1317
 
1206
1318
 
1319
+
1207
1320
  /*
1208
1321
  * Transposes a generic Yale matrix (old or new). Specify new by setting diaga = true.
1209
1322
  *
@@ -2025,7 +2138,194 @@ inline void rot(const int N, Complex128* X, const int incX, Complex128* Y, const
2025
2138
 
2026
2139
  template <typename DType, typename CSDType>
2027
2140
  inline void cblas_rot(const int N, void* X, const int incX, void* Y, const int incY, const void* c, const void* s) {
2028
- rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY, *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
2141
+ rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY,
2142
+ *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
2143
+ }
2144
+
2145
+ /*
2146
+ * Level 1 BLAS routine which returns the 2-norm of an n-vector x.
2147
+ #
2148
+ * Based on input types, these are the valid return types:
2149
+ * int -> int
2150
+ * float -> float or double
2151
+ * double -> double
2152
+ * complex64 -> float or double
2153
+ * complex128 -> double
2154
+ * rational -> rational
2155
+ */
2156
+ template <typename ReturnDType, typename DType>
2157
+ ReturnDType nrm2(const int N, const DType* X, const int incX) {
2158
+ const DType ONE = 1, ZERO = 0;
2159
+ typename LongDType<DType>::type scale = 0, ssq = 1, absxi, temp;
2160
+
2161
+
2162
+ if ((N < 1) || (incX < 1)) return ZERO;
2163
+ else if (N == 1) return std::abs(X[0]);
2164
+
2165
+ for (int i = 0; i < N; ++i) {
2166
+ absxi = std::abs(X[i*incX]);
2167
+ if (scale < absxi) {
2168
+ temp = scale / absxi;
2169
+ scale = absxi;
2170
+ ssq = ONE + ssq * (temp * temp);
2171
+ } else {
2172
+ temp = absxi / scale;
2173
+ ssq += temp * temp;
2174
+ }
2175
+ }
2176
+
2177
+ return scale * std::sqrt( ssq );
2178
+ }
2179
+
2180
+
2181
+ #ifdef HAVE_CBLAS_H
2182
+ template <>
2183
+ inline float nrm2(const int N, const float* X, const int incX) {
2184
+ return cblas_snrm2(N, X, incX);
2185
+ }
2186
+
2187
+ template <>
2188
+ inline double nrm2(const int N, const double* X, const int incX) {
2189
+ return cblas_dnrm2(N, X, incX);
2190
+ }
2191
+
2192
+ template <>
2193
+ inline float nrm2(const int N, const Complex64* X, const int incX) {
2194
+ return cblas_scnrm2(N, X, incX);
2195
+ }
2196
+
2197
+ template <>
2198
+ inline double nrm2(const int N, const Complex128* X, const int incX) {
2199
+ return cblas_dznrm2(N, X, incX);
2200
+ }
2201
+ #else
2202
// Folds one complex entry, given as its real part xr and imaginary part xi, into a scaled
// sum of squares. The running state satisfies norm == scale * sqrt(ssq), where scale is the
// largest absolute component seen so far; callers start from scale = 0, ssq = 1.
//
// A zero component is skipped: it adds nothing to the sum, and while scale is still zero
// dividing by it would turn the whole result into NaN.
template <typename FloatDType>
static inline void nrm2_complex_helper(const FloatDType& xr, const FloatDType& xi, double& scale, double& ssq) {
  const double components[2] = { static_cast<double>(std::abs(xr)), static_cast<double>(std::abs(xi)) };

  for (int c = 0; c < 2; ++c) {
    const double absx = components[c];
    if (absx == 0.0) continue;

    if (scale < absx) {
      // New largest magnitude: rescale the running sum to it.
      const double ratio = scale / absx;
      scale = absx;
      ssq   = 1.0 + ssq * (ratio * ratio);
    } else {
      const double ratio = absx / scale;
      ssq += ratio * ratio;
    }
  }
}
2224
+
2225
+ template <>
2226
+ float nrm2(const int N, const Complex64* X, const int incX) {
2227
+ double scale = 0, ssq = 1, temp;
2228
+
2229
+ if ((N < 1) || (incX < 1)) return 0.0;
2230
+
2231
+ for (int i = 0; i < N; ++i) {
2232
+ nrm2_complex_helper<float>(X[i*incX].r, X[i*incX].i, scale, temp);
2233
+ }
2234
+
2235
+ return scale * std::sqrt( ssq );
2236
+ }
2237
+
2238
+ template <>
2239
+ double nrm2(const int N, const Complex128* X, const int incX) {
2240
+ double scale = 0, ssq = 1, temp;
2241
+
2242
+ if ((N < 1) || (incX < 1)) return 0.0;
2243
+
2244
+ for (int i = 0; i < N; ++i) {
2245
+ nrm2_complex_helper<double>(X[i*incX].r, X[i*incX].i, scale, temp);
2246
+ }
2247
+
2248
+ return scale * std::sqrt( ssq );
2249
+ }
2250
+ #endif
2251
+
2252
+ template <typename ReturnDType, typename DType>
2253
+ inline void cblas_nrm2(const int N, const void* X, const int incX, void* result) {
2254
+ *reinterpret_cast<ReturnDType*>( result ) = nrm2<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
2255
+ }
2256
+
2257
/*
 * Level 1 BLAS routine which adds up the absolute values of N entries of X, taken incX apart.
 * Complex vectors are served by dedicated specializations, which add the absolute values of the real
 * and imaginary components instead.
 *
 * The result is zero unless both N and incX are positive.
 *
 * Based on input types, these are the valid return types:
 *    int        -> int
 *    float      -> float or double
 *    double     -> double
 *    complex64  -> float or double
 *    complex128 -> double
 *    rational   -> rational
 */
template <typename ReturnDType, typename DType>
inline ReturnDType asum(const int N, const DType* X, const int incX) {
  ReturnDType total = 0;

  // Nothing to add for an empty vector or a non-positive stride.
  if (N <= 0 || incX <= 0) return total;

  for (int k = 0; k < N; ++k) {
    total += std::abs(X[k*incX]);
  }

  return total;
}
2279
+
2280
+
2281
+ #ifdef HAVE_CBLAS_H
2282
+ template <>
2283
+ inline float asum(const int N, const float* X, const int incX) {
2284
+ return cblas_sasum(N, X, incX);
2285
+ }
2286
+
2287
+ template <>
2288
+ inline double asum(const int N, const double* X, const int incX) {
2289
+ return cblas_dasum(N, X, incX);
2290
+ }
2291
+
2292
+ template <>
2293
+ inline float asum(const int N, const Complex64* X, const int incX) {
2294
+ return cblas_scasum(N, X, incX);
2295
+ }
2296
+
2297
+ template <>
2298
+ inline double asum(const int N, const Complex128* X, const int incX) {
2299
+ return cblas_dzasum(N, X, incX);
2300
+ }
2301
+ #else
2302
+ template <>
2303
+ inline float asum(const int N, const Complex64* X, const int incX) {
2304
+ float sum = 0;
2305
+ if ((N > 0) && (incX > 0)) {
2306
+ for (int i = 0; i < N; ++i) {
2307
+ sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
2308
+ }
2309
+ }
2310
+ return sum;
2311
+ }
2312
+
2313
+ template <>
2314
+ inline double asum(const int N, const Complex128* X, const int incX) {
2315
+ double sum = 0;
2316
+ if ((N > 0) && (incX > 0)) {
2317
+ for (int i = 0; i < N; ++i) {
2318
+ sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
2319
+ }
2320
+ }
2321
+ return sum;
2322
+ }
2323
+ #endif
2324
+
2325
+
2326
+ template <typename ReturnDType, typename DType>
2327
+ inline void cblas_asum(const int N, const void* X, const int incX, void* sum) {
2328
+ *reinterpret_cast<ReturnDType*>( sum ) = asum<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
2029
2329
  }
2030
2330
 
2031
2331