nmatrix 0.0.4 → 0.0.5
- checksums.yaml +7 -0
- data/History.txt +68 -2
- data/Manifest.txt +1 -0
- data/README.rdoc +8 -7
- data/Rakefile +13 -2
- data/ext/nmatrix/data/complex.h +19 -1
- data/ext/nmatrix/data/data.h +8 -0
- data/ext/nmatrix/data/ruby_object.h +1 -0
- data/ext/nmatrix/extconf.rb +6 -4
- data/ext/nmatrix/nmatrix.cpp +97 -35
- data/ext/nmatrix/nmatrix.h +2 -0
- data/ext/nmatrix/ruby_constants.cpp +11 -1
- data/ext/nmatrix/ruby_constants.h +6 -1
- data/ext/nmatrix/storage/dense.cpp +2 -2
- data/ext/nmatrix/storage/yale.cpp +303 -49
- data/ext/nmatrix/storage/yale.h +3 -0
- data/ext/nmatrix/util/math.cpp +112 -0
- data/ext/nmatrix/util/math.h +372 -72
- data/lib/nmatrix/blas.rb +55 -9
- data/lib/nmatrix/nmatrix.rb +315 -2
- data/lib/nmatrix/nvector.rb +156 -95
- data/lib/nmatrix/version.rb +1 -1
- data/lib/nmatrix/yale_functions.rb +112 -0
- data/spec/blas_spec.rb +11 -0
- data/spec/elementwise_spec.rb +4 -1
- data/spec/io_spec.rb +8 -0
- data/spec/lapack_spec.rb +37 -15
- data/spec/leakcheck.rb +16 -0
- data/spec/math_spec.rb +6 -2
- data/spec/nmatrix_spec.rb +209 -3
- data/spec/nmatrix_yale_spec.rb +55 -0
- data/spec/nvector_spec.rb +33 -14
- data/spec/slice_spec.rb +26 -17
- data/spec/spec_helper.rb +17 -0
- metadata +60 -45
- data/ext/nmatrix/new_extconf.rb +0 -55
data/ext/nmatrix/storage/yale.h
CHANGED
@@ -102,6 +102,9 @@ extern "C" {
   void* nm_yale_storage_ref(STORAGE* s, SLICE* slice);
   char  nm_yale_storage_set(STORAGE* storage, SLICE* slice, void* v);
 
+  //char nm_yale_storage_vector_insert(YALE_STORAGE* s, size_t pos, size_t* js, void* vals, size_t n, bool struct_only, nm::dtype_t dtype, nm::itype_t itype);
+  //void nm_yale_storage_increment_ia_after(YALE_STORAGE* s, size_t ija_size, size_t i, size_t n);
+
   size_t nm_yale_storage_get_size(const YALE_STORAGE* storage);
 
   ///////////
data/ext/nmatrix/util/math.cpp
CHANGED
@@ -127,6 +127,8 @@ extern "C" {
   #include <clapack.h>
 #endif
 
+static VALUE nm_cblas_nrm2(VALUE self, VALUE n, VALUE x, VALUE incx);
+static VALUE nm_cblas_asum(VALUE self, VALUE n, VALUE x, VALUE incx);
 static VALUE nm_cblas_rot(VALUE self, VALUE n, VALUE x, VALUE incx, VALUE y, VALUE incy, VALUE c, VALUE s);
 static VALUE nm_cblas_rotg(VALUE self, VALUE ab);
 
@@ -307,6 +309,8 @@ void nm_math_init_blas() {
 
   cNMatrix_BLAS = rb_define_module_under(cNMatrix, "BLAS");
 
+  rb_define_singleton_method(cNMatrix_BLAS, "cblas_nrm2", (METHOD)nm_cblas_nrm2, 3);
+  rb_define_singleton_method(cNMatrix_BLAS, "cblas_asum", (METHOD)nm_cblas_asum, 3);
   rb_define_singleton_method(cNMatrix_BLAS, "cblas_rot",  (METHOD)nm_cblas_rot,  7);
   rb_define_singleton_method(cNMatrix_BLAS, "cblas_rotg", (METHOD)nm_cblas_rotg, 1);
 
@@ -515,6 +519,114 @@ static VALUE nm_cblas_rot(VALUE self, VALUE n, VALUE x, VALUE incx, VALUE y, VAL
 }
 
 
+/*
+ * Call any of the cblas_xnrm2 functions as directly as possible.
+ *
+ * xNRM2 is a BLAS level 1 routine which calculates the 2-norm of an n-vector x.
+ *
+ * Arguments:
+ *  * n    :: length of x, must be at least 0
+ *  * x    :: pointer to first entry of input vector
+ *  * incx :: stride of x, must be POSITIVE (ATLAS says non-zero, but 3.8.4 code only allows positive)
+ *
+ * You probably don't want to call this function. Instead, why don't you try nrm2, which is more flexible
+ * with its arguments?
+ *
+ * This function does almost no type checking. Seriously, be really careful when you call it! There's no exception
+ * handling, so you can easily crash Ruby!
+ */
+static VALUE nm_cblas_nrm2(VALUE self, VALUE n, VALUE x, VALUE incx) {
+
+  static void (*ttable[nm::NUM_DTYPES])(const int N, const void* X, const int incX, void* sum) = {
+      /* nm::math::cblas_nrm2<uint8_t,uint8_t>,
+         nm::math::cblas_nrm2<int8_t,int8_t>,
+         nm::math::cblas_nrm2<int16_t,int16_t>,
+         nm::math::cblas_nrm2<int32_t,int32_t>, */
+      NULL, NULL, NULL, NULL, NULL, // no help for integers
+      nm::math::cblas_nrm2<float32_t,float32_t>,
+      nm::math::cblas_nrm2<float64_t,float64_t>,
+      nm::math::cblas_nrm2<float32_t,nm::Complex64>,
+      nm::math::cblas_nrm2<float64_t,nm::Complex128>,
+      nm::math::cblas_nrm2<nm::Rational32,nm::Rational32>,
+      nm::math::cblas_nrm2<nm::Rational64,nm::Rational64>,
+      nm::math::cblas_nrm2<nm::Rational128,nm::Rational128>,
+      nm::math::cblas_nrm2<nm::RubyObject,nm::RubyObject>
+  };
+
+  nm::dtype_t dtype = NM_DTYPE(x);
+
+  if (!ttable[dtype]) {
+    rb_raise(nm_eDataTypeError, "this vector operation undefined for integer vectors");
+    return Qnil;
+
+  } else {
+    // Determine the return dtype and allocate it
+    nm::dtype_t rdtype = dtype;
+    if      (dtype == nm::COMPLEX64)  rdtype = nm::FLOAT32;
+    else if (dtype == nm::COMPLEX128) rdtype = nm::FLOAT64;
+
+    void *Result = ALLOCA_N(char, DTYPE_SIZES[rdtype]);
+
+    ttable[dtype](FIX2INT(n), NM_STORAGE_DENSE(x)->elements, FIX2INT(incx), Result);
+
+    return rubyobj_from_cval(Result, rdtype).rval;
+  }
+}
+
+
+/*
+ * Call any of the cblas_xasum functions as directly as possible.
+ *
+ * xASUM is a BLAS level 1 routine which calculates the sum of absolute values of the entries
+ * of a vector x.
+ *
+ * Arguments:
+ *  * n    :: length of x, must be at least 0
+ *  * x    :: pointer to first entry of input vector
+ *  * incx :: stride of x, must be POSITIVE (ATLAS says non-zero, but 3.8.4 code only allows positive)
+ *
+ * You probably don't want to call this function. Instead, why don't you try asum, which is more flexible
+ * with its arguments?
+ *
+ * This function does almost no type checking. Seriously, be really careful when you call it! There's no exception
+ * handling, so you can easily crash Ruby!
+ */
+static VALUE nm_cblas_asum(VALUE self, VALUE n, VALUE x, VALUE incx) {
+
+  static void (*ttable[nm::NUM_DTYPES])(const int N, const void* X, const int incX, void* sum) = {
+      nm::math::cblas_asum<uint8_t,uint8_t>,
+      nm::math::cblas_asum<int8_t,int8_t>,
+      nm::math::cblas_asum<int16_t,int16_t>,
+      nm::math::cblas_asum<int32_t,int32_t>,
+      nm::math::cblas_asum<int64_t,int64_t>,
+      nm::math::cblas_asum<float32_t,float32_t>,
+      nm::math::cblas_asum<float64_t,float64_t>,
+      nm::math::cblas_asum<float32_t,nm::Complex64>,
+      nm::math::cblas_asum<float64_t,nm::Complex128>,
+      nm::math::cblas_asum<nm::Rational32,nm::Rational32>,
+      nm::math::cblas_asum<nm::Rational64,nm::Rational64>,
+      nm::math::cblas_asum<nm::Rational128,nm::Rational128>,
+      nm::math::cblas_asum<nm::RubyObject,nm::RubyObject>
+  };
+
+  nm::dtype_t dtype = NM_DTYPE(x);
+
+  // Determine the return dtype and allocate it
+  nm::dtype_t rdtype = dtype;
+  if      (dtype == nm::COMPLEX64)  rdtype = nm::FLOAT32;
+  else if (dtype == nm::COMPLEX128) rdtype = nm::FLOAT64;
+
+  void *Result = ALLOCA_N(char, DTYPE_SIZES[rdtype]);
+
+  ttable[dtype](FIX2INT(n), NM_STORAGE_DENSE(x)->elements, FIX2INT(incx), Result);
+
+  return rubyobj_from_cval(Result, rdtype).rval;
+}
+
+
+
+
 /* Call any of the cblas_xgemm functions as directly as possible.
  *
  * The cblas_xgemm functions (dgemm, sgemm, cgemm, and zgemm) define the following operation:
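Both new wrappers use the same dispatch pattern: a static table of template instantiations indexed by dtype, with NULL entries marking unsupported types (as in the commented-out integer rows of nm_cblas_nrm2). Below is a minimal standalone sketch of that pattern; the enum, kernel, and function names are illustrative, not NMatrix's actual definitions:

```cpp
#include <cstdio>
#include <stdexcept>

enum dtype_t { BYTE, INT32, FLOAT32, FLOAT64, NUM_DTYPES };

// One template instantiation per supported dtype; void* keeps the table signature uniform.
template <typename DType>
void sum_kernel(const int n, const void* x, void* result) {
  const DType* typed = static_cast<const DType*>(x);
  DType sum = 0;
  for (int i = 0; i < n; ++i) sum += typed[i];
  *static_cast<DType*>(result) = sum;
}

void dispatch_sum(dtype_t dtype, const int n, const void* x, void* result) {
  // NULL marks dtypes the operation doesn't support, mirroring the
  // "no help for integers" rows in nm_cblas_nrm2's table.
  static void (*ttable[NUM_DTYPES])(const int, const void*, void*) = {
    NULL,                 // BYTE unsupported in this sketch
    sum_kernel<int>,
    sum_kernel<float>,
    sum_kernel<double>
  };
  if (!ttable[dtype]) throw std::invalid_argument("operation undefined for this dtype");
  ttable[dtype](n, x, result);
}

int main() {
  double xs[] = {1.0, 2.0, 3.0}, out;
  dispatch_sum(FLOAT64, 3, xs, &out);
  std::printf("%f\n", out);  // 6.000000
}
```

The appeal of the table over a switch is that adding a dtype means adding one row, and every unsupported combination fails loudly at the single dispatch point.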
data/ext/nmatrix/util/math.h
CHANGED
@@ -1026,33 +1026,31 @@ inline bool gemv(const enum CBLAS_TRANSPOSE Trans, const int M, const int N, con
 
 // Yale: numeric matrix multiply c=a*b
 template <typename DType, typename IType>
-inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const DType* a, const bool diaga,
+inline void numbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const DType* a, const bool diaga,
                    const IType* ib, const IType* jb, const DType* b, const bool diagb, IType* ic, IType* jc, DType* c, const bool diagc) {
-
-
+  const unsigned int max_lmn = std::max(std::max(m, n), l);
+  IType next[max_lmn];
+  DType sums[max_lmn];
 
   DType v;
 
   IType head, length, temp, ndnz = 0;
-  IType jj_start, jj_end, kk_start, kk_end;
-  IType i, j, k, kk, jj;
   IType minmn = std::min(m,n);
+  IType minlm = std::min(l,m);
 
-  for (
-  next[
-  sums[
+  for (IType idx = 0; idx < max_lmn; ++idx) { // initialize scratch arrays
+    next[idx] = std::numeric_limits<IType>::max();
+    sums[idx] = 0;
   }
 
-  for (i = 0; i < n; ++i) { // walk down the rows
+  for (IType i = 0; i < n; ++i) { // walk down the rows
     head = std::numeric_limits<IType>::max()-1; // head gets assigned as whichever column of B's row j we last visited
     length = 0;
 
-
-
+    for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // walk through entries in each row
+      IType j;
 
-
-
-      if (jj == jj_end) { // if we're in the last entry for this row:
+      if (jj == ia[i+1]) { // if we're in the last entry for this row:
         if (!diaga || i >= minmn) continue;
         j = i; // if it's a new Yale matrix, and last entry, get the diagonal position (j) and entry (ajj)
         v = a[i];
@@ -1061,12 +1059,12 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
         v = a[jj];
       }
 
-
-
-
+      for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) {
+
+        IType k;
 
-        if (kk ==
-          if (!diagb || j >=
+        if (kk == ib[j+1]) { // Get the column id for that entry
+          if (!diagb || j >= minlm) continue;
          k = j;
          sums[k] += v*b[k];
        } else {
@@ -1079,10 +1077,10 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
          head = k;
          ++length;
        }
-      }
-    }
+      } // end of kk loop
+    } // end of jj loop
 
-    for (jj = 0; jj < length; ++jj) {
+    for (IType jj = 0; jj < length; ++jj) {
      if (sums[head] != 0) {
        if (diagc && head == i) {
          c[head] = sums[head];
|
|
1105
1103
|
} /* numbmm_ */
|
1106
1104
|
|
1107
1105
|
|
1106
|
+
/*
|
1107
|
+
template <typename DType, typename IType>
|
1108
|
+
inline void new_yale_matrix_multiply(const unsigned int m, const IType* ija, const DType* a, const IType* ijb, const DType* b, YALE_STORAGE* c_storage) {
|
1109
|
+
unsigned int n = c_storage->shape[0],
|
1110
|
+
l = c_storage->shape[1];
|
1111
|
+
|
1112
|
+
// Create a working vector of dimension max(m,l,n) and initial value IType::max():
|
1113
|
+
std::vector<IType> mask(std::max(std::max(m,l),n), std::numeric_limits<IType>::max());
|
1114
|
+
|
1115
|
+
for (IType i = 0; i < n; ++i) { // A.rows.each_index do |i|
|
1116
|
+
|
1117
|
+
IType j, k;
|
1118
|
+
size_t ndnz;
|
1119
|
+
|
1120
|
+
for (IType jj = ija[i]; jj <= ija[i+1]; ++jj) { // walk through column pointers for row i of A
|
1121
|
+
j = (jj == ija[i+1]) ? i : ija[jj]; // Get the current column index (handle diagonals last)
|
1122
|
+
|
1123
|
+
if (j >= m) {
|
1124
|
+
if (j == ija[jj]) rb_raise(rb_eIndexError, "ija array for left-hand matrix contains an out-of-bounds column index %u at position %u", jj, j);
|
1125
|
+
else break;
|
1126
|
+
}
|
1127
|
+
|
1128
|
+
for (IType kk = ijb[j]; kk <= ijb[j+1]; ++kk) { // walk through column pointers for row j of B
|
1129
|
+
if (j >= m) continue; // first of all, does B *have* a row j?
|
1130
|
+
k = (kk == ijb[j+1]) ? j : ijb[kk]; // Get the current column index (handle diagonals last)
|
1131
|
+
|
1132
|
+
if (k >= l) {
|
1133
|
+
if (k == ijb[kk]) rb_raise(rb_eIndexError, "ija array for right-hand matrix contains an out-of-bounds column index %u at position %u", kk, k);
|
1134
|
+
else break;
|
1135
|
+
}
|
1136
|
+
|
1137
|
+
if (mask[k] == )
|
1138
|
+
}
|
1139
|
+
|
1140
|
+
}
|
1141
|
+
}
|
1142
|
+
}
|
1143
|
+
*/
|
1108
1144
|
|
1109
1145
|
// Yale: Symbolic matrix multiply c=a*b
|
1110
1146
|
template <typename IType>
|
1111
|
-
inline
|
1147
|
+
inline size_t symbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const bool diaga,
|
1112
1148
|
const IType* ib, const IType* jb, const bool diagb, IType* ic, const bool diagc) {
|
1113
|
-
|
1114
|
-
IType
|
1149
|
+
unsigned int max_lmn = std::max(std::max(m,n), l);
|
1150
|
+
IType mask[max_lmn]; // INDEX in the SMMP paper.
|
1151
|
+
IType j, k; /* Local variables */
|
1152
|
+
size_t ndnz = n;
|
1115
1153
|
|
1154
|
+
for (IType idx = 0; idx < max_lmn; ++idx)
|
1155
|
+
mask[idx] = std::numeric_limits<IType>::max();
|
1116
1156
|
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
else ic[0] = 0;
|
1157
|
+
if (ic) { // Only write to ic if it's supplied; otherwise, we're just counting.
|
1158
|
+
if (diagc) ic[0] = n+1;
|
1159
|
+
else ic[0] = 0;
|
1160
|
+
}
|
1122
1161
|
|
1123
1162
|
IType minmn = std::min(m,n);
|
1163
|
+
IType minlm = std::min(l,m);
|
1124
1164
|
|
1125
1165
|
for (IType i = 0; i < n; ++i) { // MAIN LOOP: through rows
|
1126
1166
|
|
@@ -1132,9 +1172,9 @@ inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia,
         j = i;
       } else j = ja[jj];
 
-      for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns of row J in matrix B.
+      for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns K of row J in matrix B.
         if (kk == ib[j+1]) {
-          if (!diagb || j >=
+          if (!diagb || j >= minlm) continue;
           k = j;
         } else k = jb[kk];
 
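symbmm is the matching SMMP symbolic phase: the same double loop as numbmm, but it only marks in mask which output columns each row touches and counts them, so the caller can size C's storage before the numeric pass; the 0.0.5 change additionally lets it run with ic == NULL as a pure counter and return ndnz. A hedged sketch of that counting idea on plain CSR arrays (no diagonal handling; names are illustrative):

```cpp
#include <cstddef>
#include <limits>
#include <vector>

// Count the nonzeros of C = A*B (CSR) without computing any values.
// mask[k] records the last row that touched column k, so each column is
// counted at most once per output row -- the SMMP "INDEX" trick.
template <typename IType>
std::size_t csr_symbolic_count(IType n,                              // rows of A
                               const std::vector<IType>& ia, const std::vector<IType>& ja,
                               const std::vector<IType>& ib, const std::vector<IType>& jb,
                               IType b_cols) {                       // columns of B
  std::vector<IType> mask(b_cols, std::numeric_limits<IType>::max());
  std::size_t ndnz = 0;
  for (IType i = 0; i < n; ++i) {
    for (IType jj = ia[i]; jj < ia[i+1]; ++jj) {
      IType j = ja[jj];                                  // A[i][j] is stored
      for (IType kk = ib[j]; kk < ib[j+1]; ++kk) {
        IType k = jb[kk];                                // so C[i][k] may be nonzero
        if (mask[k] != i) { mask[k] = i; ++ndnz; }       // first touch of column k in row i
      }
    }
  }
  return ndnz;
}
```

Resetting mask per row is avoided by stamping it with the current row index, the same reason symbmm can reuse one mask array across the whole main loop.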
@@ -1145,65 +1185,138 @@ inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia,
       }
     }
 
-    if (diagc &&
+    if (diagc && mask[i] == std::numeric_limits<IType>::max()) --ndnz;
 
-    ic[i+1] = ndnz;
+    if (ic) ic[i+1] = ndnz;
   }
-} /* symbmm_ */
 
+  return ndnz;
+} /* symbmm_ */
 
-//TODO: More efficient sorting algorithm than selection sort would be nice, probably.
-// Remember, we're dealing with unique keys, which simplifies things.
-// Doesn't have to be in-place, since we probably just multiplied and that wasn't in-place.
-template <typename DType, typename IType>
-inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
-  IType jj, min, min_jj;
-  DType temp_val;
-
-    if (min_jj != jj) {
-      // min already = ja[min_jj], so use this as temp_key
-      temp_val = a[min_jj];
 
+// In-place quicksort (from Wikipedia) -- called by smmp_sort_columns, below. All functions are inclusive of left, right.
+namespace smmp_sort {
+  const size_t THRESHOLD = 4; // switch to insertion sort for 4 elements or fewer
+
+  template <typename DType, typename IType>
+  void print_array(DType* vals, IType* array, IType left, IType right) {
+    for (IType i = left; i <= right; ++i) {
+      std::cerr << array[i] << ":" << vals[i] << " ";
+    }
+    std::cerr << std::endl;
+  }
+
+  template <typename DType, typename IType>
+  IType partition(DType* vals, IType* array, IType left, IType right, IType pivot) {
+    IType pivotJ = array[pivot];
+    DType pivotV = vals[pivot];
+
+    // Swap pivot and right
+    array[pivot] = array[right];
+    vals[pivot]  = vals[right];
+    array[right] = pivotJ;
+    vals[right]  = pivotV;
+
+    IType store = left;
+    for (IType idx = left; idx < right; ++idx) {
+      if (array[idx] <= pivotJ) {
+        // Swap i and store
+        std::swap(array[idx], array[store]);
+        std::swap(vals[idx], vals[store]);
+        ++store;
+      }
+    }
+
+    std::swap(array[store], array[right]);
+    std::swap(vals[store], vals[right]);
+
+    return store;
+  }
+
+  // Recommended to use the median of left, right, and mid for the pivot.
+  template <typename IType>
+  IType median(IType a, IType b, IType c) {
+    if (a < b) {
+      if (b < c) return b; // a b c
+      if (a < c) return c; // a c b
+      return a;            // c a b
+
+    } else { // a > b
+      if (a < c) return a; // b a c
+      if (b < c) return c; // b c a
+      return b;            // c b a
+    }
+  }
+
+
+  // Insertion sort is more efficient than quicksort for small N
+  template <typename DType, typename IType>
+  void insertion_sort(DType* vals, IType* array, IType left, IType right) {
+    for (IType idx = left; idx <= right; ++idx) {
+      IType col_to_insert = array[idx];
+      DType val_to_insert = vals[idx];
+
+      IType hole_pos = idx;
+      for (; hole_pos > left && col_to_insert < array[hole_pos-1]; --hole_pos) {
+        array[hole_pos] = array[hole_pos - 1]; // shift the larger column index up
+        vals[hole_pos]  = vals[hole_pos - 1];  // value goes along with it
+      }
+
+      array[hole_pos] = col_to_insert;
+      vals[hole_pos]  = val_to_insert;
+    }
+  }
+
+
+  template <typename DType, typename IType>
+  void quicksort(DType* vals, IType* array, IType left, IType right) {
+
+    if (left < right) {
+      if (right - left < THRESHOLD) {
+        insertion_sort(vals, array, left, right);
+      } else {
+        // choose any pivot such that left < pivot < right
+        IType pivot = median(left, right, (IType)(((unsigned long)left + (unsigned long)right) / 2));
+        pivot = partition(vals, array, left, right, pivot);
+
+        // recursively sort elements smaller than the pivot
+        quicksort<DType,IType>(vals, array, left, pivot-1);
+
+        // recursively sort elements at least as big as the pivot
+        quicksort<DType,IType>(vals, array, pivot+1, right);
+      }
+    }
+  }
+
+
+}; // end of namespace smmp_sort
+
+
+/*
+ * For use following symbmm and numbmm. Sorts the matrix entries in each row according to the column index.
+ * This utilizes quicksort, which is an in-place unstable sort (since there are no duplicate entries, we don't care
+ * about stability).
+ *
+ * TODO: It might be worthwhile to do a test for free memory, and if available, use an unstable sort that isn't in-place.
+ *
+ * TODO: It's actually probably possible to write an even faster sort, since symbmm/numbmm are not producing a random
+ * ordering. If someone is doing a lot of Yale matrix multiplication, it might benefit them to consider even insertion
+ * sort.
+ */
+template <typename DType, typename IType>
+inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
+  for (size_t i = 0; i < n; ++i) {
+    if (ia[i+1] - ia[i] < 2) continue; // no need to sort rows containing only one or two elements.
+    else if (ia[i+1] - ia[i] <= smmp_sort::THRESHOLD) {
+      smmp_sort::insertion_sort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for small rows
+    } else {
+      smmp_sort::quicksort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for large rows (and may call insertion_sort as well)
+    }
+  }
 }
 
 
+
 /*
  * Transposes a generic Yale matrix (old or new). Specify new by setting diaga = true.
  *
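Note why smmp_sort_columns needs a hand-rolled sort: ja and a must move in lockstep, so partition and insertion_sort above swap both arrays together. For contrast, here is a sketch of the same postcondition using std::sort over an index permutation; it is simpler, but allocates scratch per row, which the in-place version avoids (names here are illustrative):

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Sort each CSR row's (ja, a) pairs by column index via an explicit
// permutation. Same result as smmp_sort_columns, at the cost of
// O(row length) temporary storage per row.
template <typename DType, typename IType>
void sort_columns_with_permutation(std::size_t n, const std::vector<IType>& ia,
                                   std::vector<IType>& ja, std::vector<DType>& a) {
  for (std::size_t i = 0; i < n; ++i) {
    IType lo = ia[i], hi = ia[i+1];
    std::vector<std::size_t> perm(hi - lo);                  // positions lo..hi-1
    std::iota(perm.begin(), perm.end(), (std::size_t)lo);
    std::sort(perm.begin(), perm.end(),
              [&](std::size_t x, std::size_t y) { return ja[x] < ja[y]; });

    std::vector<IType> ja_sorted(hi - lo);                   // apply the permutation
    std::vector<DType> a_sorted(hi - lo);
    for (std::size_t p = 0; p < perm.size(); ++p) {
      ja_sorted[p] = ja[perm[p]];
      a_sorted[p]  = a[perm[p]];
    }
    std::copy(ja_sorted.begin(), ja_sorted.end(), ja.begin() + lo);
    std::copy(a_sorted.begin(),  a_sorted.end(),  a.begin()  + lo);
  }
}
```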
@@ -2025,7 +2138,194 @@ inline void rot(const int N, Complex128* X, const int incX, Complex128* Y, const
 
 template <typename DType, typename CSDType>
 inline void cblas_rot(const int N, void* X, const int incX, void* Y, const int incY, const void* c, const void* s) {
-  rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY,
+  rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY,
+                     *reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
+}
+
+/*
+ * Level 1 BLAS routine which returns the 2-norm of an n-vector x.
+ *
+ * Based on input types, these are the valid return types:
+ *    int -> int
+ *    float -> float or double
+ *    double -> double
+ *    complex64 -> float or double
+ *    complex128 -> double
+ *    rational -> rational
+ */
+template <typename ReturnDType, typename DType>
+ReturnDType nrm2(const int N, const DType* X, const int incX) {
+  const DType ONE = 1, ZERO = 0;
+  typename LongDType<DType>::type scale = 0, ssq = 1, absxi, temp;
+
+
+  if ((N < 1) || (incX < 1))  return ZERO;
+  else if (N == 1)            return std::abs(X[0]);
+
+  for (int i = 0; i < N; ++i) {
+    absxi = std::abs(X[i*incX]);
+    if (scale < absxi) {
+      temp  = scale / absxi;
+      scale = absxi;
+      ssq   = ONE + ssq * (temp * temp);
+    } else {
+      temp = absxi / scale;
+      ssq += temp * temp;
+    }
+  }
+
+  return scale * std::sqrt( ssq );
+}
+
+
+#ifdef HAVE_CBLAS_H
+template <>
+inline float nrm2(const int N, const float* X, const int incX) {
+  return cblas_snrm2(N, X, incX);
+}
+
+template <>
+inline double nrm2(const int N, const double* X, const int incX) {
+  return cblas_dnrm2(N, X, incX);
+}
+
+template <>
+inline float nrm2(const int N, const Complex64* X, const int incX) {
+  return cblas_scnrm2(N, X, incX);
+}
+
+template <>
+inline double nrm2(const int N, const Complex128* X, const int incX) {
+  return cblas_dznrm2(N, X, incX);
+}
+#else
+template <typename FloatDType>
+static inline void nrm2_complex_helper(const FloatDType& xr, const FloatDType& xi, double& scale, double& ssq) {
+  double absx = std::abs(xr);
+  if (scale < absx) {
+    double temp = scale / absx;
+    scale = absx;
+    ssq   = 1.0 + ssq * (temp * temp);
+  } else {
+    double temp = absx / scale;
+    ssq += temp * temp;
+  }
+
+  absx = std::abs(xi);
+  if (scale < absx) {
+    double temp = scale / absx;
+    scale = absx;
+    ssq   = 1.0 + ssq * (temp * temp);
+  } else {
+    double temp = absx / scale;
+    ssq += temp * temp;
+  }
+}
+
+template <>
+float nrm2(const int N, const Complex64* X, const int incX) {
+  double scale = 0, ssq = 1, temp;
+
+  if ((N < 1) || (incX < 1)) return 0.0;
+
+  for (int i = 0; i < N; ++i) {
+    nrm2_complex_helper<float>(X[i*incX].r, X[i*incX].i, scale, temp);
+  }
+
+  return scale * std::sqrt( ssq );
+}
+
+template <>
+double nrm2(const int N, const Complex128* X, const int incX) {
+  double scale = 0, ssq = 1, temp;
+
+  if ((N < 1) || (incX < 1)) return 0.0;
+
+  for (int i = 0; i < N; ++i) {
+    nrm2_complex_helper<double>(X[i*incX].r, X[i*incX].i, scale, temp);
+  }
+
+  return scale * std::sqrt( ssq );
+}
+#endif
+
+template <typename ReturnDType, typename DType>
+inline void cblas_nrm2(const int N, const void* X, const int incX, void* result) {
+  *reinterpret_cast<ReturnDType*>( result ) = nrm2<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
+}
+
+/*
+ * Level 1 BLAS routine which sums the absolute values of a vector's contents. If the vector consists of complex values,
+ * the routine sums the absolute values of the real and imaginary components as well.
+ *
+ * So, based on input types, these are the valid return types:
+ *    int -> int
+ *    float -> float or double
+ *    double -> double
+ *    complex64 -> float or double
+ *    complex128 -> double
+ *    rational -> rational
+ */
+template <typename ReturnDType, typename DType>
+inline ReturnDType asum(const int N, const DType* X, const int incX) {
+  ReturnDType sum = 0;
+  if ((N > 0) && (incX > 0)) {
+    for (int i = 0; i < N; ++i) {
+      sum += std::abs(X[i*incX]);
+    }
+  }
+  return sum;
+}
+
+
+#ifdef HAVE_CBLAS_H
+template <>
+inline float asum(const int N, const float* X, const int incX) {
+  return cblas_sasum(N, X, incX);
+}
+
+template <>
+inline double asum(const int N, const double* X, const int incX) {
+  return cblas_dasum(N, X, incX);
+}
+
+template <>
+inline float asum(const int N, const Complex64* X, const int incX) {
+  return cblas_scasum(N, X, incX);
+}
+
+template <>
+inline double asum(const int N, const Complex128* X, const int incX) {
+  return cblas_dzasum(N, X, incX);
+}
+#else
+template <>
+inline float asum(const int N, const Complex64* X, const int incX) {
+  float sum = 0;
+  if ((N > 0) && (incX > 0)) {
+    for (int i = 0; i < N; ++i) {
+      sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
+    }
+  }
+  return sum;
+}
+
+template <>
+inline double asum(const int N, const Complex128* X, const int incX) {
+  double sum = 0;
+  if ((N > 0) && (incX > 0)) {
+    for (int i = 0; i < N; ++i) {
+      sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
+    }
+  }
+  return sum;
+}
+#endif
+
+
+template <typename ReturnDType, typename DType>
+inline void cblas_asum(const int N, const void* X, const int incX, void* sum) {
+  *reinterpret_cast<ReturnDType*>( sum ) = asum<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
 }
 
 
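The scale/ssq loop in the new nrm2 template follows the classic reference-BLAS approach: instead of summing squares directly (which overflows once |x_i| approaches the square root of the largest representable value), it tracks the running maximum magnitude and a sum of squared ratios, so every intermediate stays small. A short demonstration with doubles; this sketch adds an explicit zero guard, which the template above does not have:

```cpp
#include <cmath>
#include <cstdio>

// Naive 2-norm: overflows when elements are near sqrt(DBL_MAX).
double nrm2_naive(int n, const double* x) {
  double ssq = 0;
  for (int i = 0; i < n; ++i) ssq += x[i] * x[i];
  return std::sqrt(ssq);
}

// Scaled 2-norm, as in the nrm2 template above: keep scale = max |x_i| seen
// so far and ssq = sum of (x_i / scale)^2, so ssq never exceeds n.
double nrm2_scaled(int n, const double* x) {
  double scale = 0, ssq = 1;
  for (int i = 0; i < n; ++i) {
    double absxi = std::abs(x[i]);
    if (absxi == 0) continue;            // guard added in this sketch
    if (scale < absxi) {
      double temp = scale / absxi;
      ssq   = 1.0 + ssq * (temp * temp); // rescale the running sum to the new maximum
      scale = absxi;
    } else {
      double temp = absxi / scale;
      ssq  += temp * temp;
    }
  }
  return scale * std::sqrt(ssq);
}

int main() {
  double big = 1e200;                 // big*big overflows a double
  double x[] = {big, big};
  std::printf("naive:  %g\n", nrm2_naive(2, x));   // inf
  std::printf("scaled: %g\n", nrm2_scaled(2, x));  // ~1.41421e+200
}
```

The same trick appears twice more in the hunk above: nrm2_complex_helper applies it separately to the real and imaginary parts when no system CBLAS is available.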