nmatrix 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/History.txt +68 -2
- data/Manifest.txt +1 -0
- data/README.rdoc +8 -7
- data/Rakefile +13 -2
- data/ext/nmatrix/data/complex.h +19 -1
- data/ext/nmatrix/data/data.h +8 -0
- data/ext/nmatrix/data/ruby_object.h +1 -0
- data/ext/nmatrix/extconf.rb +6 -4
- data/ext/nmatrix/nmatrix.cpp +97 -35
- data/ext/nmatrix/nmatrix.h +2 -0
- data/ext/nmatrix/ruby_constants.cpp +11 -1
- data/ext/nmatrix/ruby_constants.h +6 -1
- data/ext/nmatrix/storage/dense.cpp +2 -2
- data/ext/nmatrix/storage/yale.cpp +303 -49
- data/ext/nmatrix/storage/yale.h +3 -0
- data/ext/nmatrix/util/math.cpp +112 -0
- data/ext/nmatrix/util/math.h +372 -72
- data/lib/nmatrix/blas.rb +55 -9
- data/lib/nmatrix/nmatrix.rb +315 -2
- data/lib/nmatrix/nvector.rb +156 -95
- data/lib/nmatrix/version.rb +1 -1
- data/lib/nmatrix/yale_functions.rb +112 -0
- data/spec/blas_spec.rb +11 -0
- data/spec/elementwise_spec.rb +4 -1
- data/spec/io_spec.rb +8 -0
- data/spec/lapack_spec.rb +37 -15
- data/spec/leakcheck.rb +16 -0
- data/spec/math_spec.rb +6 -2
- data/spec/nmatrix_spec.rb +209 -3
- data/spec/nmatrix_yale_spec.rb +55 -0
- data/spec/nvector_spec.rb +33 -14
- data/spec/slice_spec.rb +26 -17
- data/spec/spec_helper.rb +17 -0
- metadata +60 -45
- data/ext/nmatrix/new_extconf.rb +0 -55
data/ext/nmatrix/storage/yale.h
CHANGED
@@ -102,6 +102,9 @@ extern "C" {
|
|
102
102
|
void* nm_yale_storage_ref(STORAGE* s, SLICE* slice);
|
103
103
|
char nm_yale_storage_set(STORAGE* storage, SLICE* slice, void* v);
|
104
104
|
|
105
|
+
//char nm_yale_storage_vector_insert(YALE_STORAGE* s, size_t pos, size_t* js, void* vals, size_t n, bool struct_only, nm::dtype_t dtype, nm::itype_t itype);
|
106
|
+
//void nm_yale_storage_increment_ia_after(YALE_STORAGE* s, size_t ija_size, size_t i, size_t n);
|
107
|
+
|
105
108
|
size_t nm_yale_storage_get_size(const YALE_STORAGE* storage);
|
106
109
|
|
107
110
|
///////////
|
data/ext/nmatrix/util/math.cpp
CHANGED
@@ -127,6 +127,8 @@ extern "C" {
|
|
127
127
|
#include <clapack.h>
|
128
128
|
#endif
|
129
129
|
|
130
|
+
static VALUE nm_cblas_nrm2(VALUE self, VALUE n, VALUE x, VALUE incx);
|
131
|
+
static VALUE nm_cblas_asum(VALUE self, VALUE n, VALUE x, VALUE incx);
|
130
132
|
static VALUE nm_cblas_rot(VALUE self, VALUE n, VALUE x, VALUE incx, VALUE y, VALUE incy, VALUE c, VALUE s);
|
131
133
|
static VALUE nm_cblas_rotg(VALUE self, VALUE ab);
|
132
134
|
|
@@ -307,6 +309,8 @@ void nm_math_init_blas() {
|
|
307
309
|
|
308
310
|
cNMatrix_BLAS = rb_define_module_under(cNMatrix, "BLAS");
|
309
311
|
|
312
|
+
rb_define_singleton_method(cNMatrix_BLAS, "cblas_nrm2", (METHOD)nm_cblas_nrm2, 3);
|
313
|
+
rb_define_singleton_method(cNMatrix_BLAS, "cblas_asum", (METHOD)nm_cblas_asum, 3);
|
310
314
|
rb_define_singleton_method(cNMatrix_BLAS, "cblas_rot", (METHOD)nm_cblas_rot, 7);
|
311
315
|
rb_define_singleton_method(cNMatrix_BLAS, "cblas_rotg", (METHOD)nm_cblas_rotg, 1);
|
312
316
|
|
@@ -515,6 +519,114 @@ static VALUE nm_cblas_rot(VALUE self, VALUE n, VALUE x, VALUE incx, VALUE y, VAL
|
|
515
519
|
}
|
516
520
|
|
517
521
|
|
522
|
+
/*
|
523
|
+
* Call any of the cblas_xnrm2 functions as directly as possible.
|
524
|
+
*
|
525
|
+
* xNRM2 is a BLAS level 1 routine which calculates the 2-norm of an n-vector x.
|
526
|
+
*
|
527
|
+
* Arguments:
|
528
|
+
* * n :: length of x, must be at least 0
|
529
|
+
* * x :: pointer to first entry of input vector
|
530
|
+
* * incx :: stride of x, must be POSITIVE (ATLAS says non-zero, but 3.8.4 code only allows positive)
|
531
|
+
*
|
532
|
+
* You probably don't want to call this function. Instead, why don't you try nrm2, which is more flexible
|
533
|
+
* with its arguments?
|
534
|
+
*
|
535
|
+
* This function does almost no type checking. Seriously, be really careful when you call it! There's no exception
|
536
|
+
* handling, so you can easily crash Ruby!
|
537
|
+
*/
|
538
|
+
static VALUE nm_cblas_nrm2(VALUE self, VALUE n, VALUE x, VALUE incx) {
|
539
|
+
|
540
|
+
static void (*ttable[nm::NUM_DTYPES])(const int N, const void* X, const int incX, void* sum) = {
|
541
|
+
/* nm::math::cblas_nrm2<uint8_t,uint8_t>,
|
542
|
+
nm::math::cblas_nrm2<int8_t,int8_t>,
|
543
|
+
nm::math::cblas_nrm2<int16_t,int16_t>,
|
544
|
+
nm::math::cblas_nrm2<int32_t,int32_t>, */
|
545
|
+
NULL, NULL, NULL, NULL, NULL, // no help for integers
|
546
|
+
nm::math::cblas_nrm2<float32_t,float32_t>,
|
547
|
+
nm::math::cblas_nrm2<float64_t,float64_t>,
|
548
|
+
nm::math::cblas_nrm2<float32_t,nm::Complex64>,
|
549
|
+
nm::math::cblas_nrm2<float64_t,nm::Complex128>,
|
550
|
+
nm::math::cblas_nrm2<nm::Rational32,nm::Rational32>,
|
551
|
+
nm::math::cblas_nrm2<nm::Rational64,nm::Rational64>,
|
552
|
+
nm::math::cblas_nrm2<nm::Rational128,nm::Rational128>,
|
553
|
+
nm::math::cblas_nrm2<nm::RubyObject,nm::RubyObject>
|
554
|
+
};
|
555
|
+
|
556
|
+
nm::dtype_t dtype = NM_DTYPE(x);
|
557
|
+
|
558
|
+
if (!ttable[dtype]) {
|
559
|
+
rb_raise(nm_eDataTypeError, "this vector operation undefined for integer vectors");
|
560
|
+
return Qnil;
|
561
|
+
|
562
|
+
} else {
|
563
|
+
// Determine the return dtype and allocate it
|
564
|
+
nm::dtype_t rdtype = dtype;
|
565
|
+
if (dtype == nm::COMPLEX64) rdtype = nm::FLOAT32;
|
566
|
+
else if (dtype == nm::COMPLEX128) rdtype = nm::FLOAT64;
|
567
|
+
|
568
|
+
void *Result = ALLOCA_N(char, DTYPE_SIZES[rdtype]);
|
569
|
+
|
570
|
+
ttable[dtype](FIX2INT(n), NM_STORAGE_DENSE(x)->elements, FIX2INT(incx), Result);
|
571
|
+
|
572
|
+
return rubyobj_from_cval(Result, rdtype).rval;
|
573
|
+
}
|
574
|
+
}
|
575
|
+
|
576
|
+
|
577
|
+
|
578
|
+
/*
|
579
|
+
* Call any of the cblas_xasum functions as directly as possible.
|
580
|
+
*
|
581
|
+
* xASUM is a BLAS level 1 routine which calculates the sum of absolute values of the entries
|
582
|
+
* of a vector x.
|
583
|
+
*
|
584
|
+
* Arguments:
|
585
|
+
* * n :: length of x, must be at least 0
|
586
|
+
* * x :: pointer to first entry of input vector
|
587
|
+
* * incx :: stride of x, must be POSITIVE (ATLAS says non-zero, but 3.8.4 code only allows positive)
|
588
|
+
*
|
589
|
+
* You probably don't want to call this function. Instead, why don't you try asum, which is more flexible
|
590
|
+
* with its arguments?
|
591
|
+
*
|
592
|
+
* This function does almost no type checking. Seriously, be really careful when you call it! There's no exception
|
593
|
+
* handling, so you can easily crash Ruby!
|
594
|
+
*/
|
595
|
+
static VALUE nm_cblas_asum(VALUE self, VALUE n, VALUE x, VALUE incx) {
|
596
|
+
|
597
|
+
static void (*ttable[nm::NUM_DTYPES])(const int N, const void* X, const int incX, void* sum) = {
|
598
|
+
nm::math::cblas_asum<uint8_t,uint8_t>,
|
599
|
+
nm::math::cblas_asum<int8_t,int8_t>,
|
600
|
+
nm::math::cblas_asum<int16_t,int16_t>,
|
601
|
+
nm::math::cblas_asum<int32_t,int32_t>,
|
602
|
+
nm::math::cblas_asum<int64_t,int64_t>,
|
603
|
+
nm::math::cblas_asum<float32_t,float32_t>,
|
604
|
+
nm::math::cblas_asum<float64_t,float64_t>,
|
605
|
+
nm::math::cblas_asum<float32_t,nm::Complex64>,
|
606
|
+
nm::math::cblas_asum<float64_t,nm::Complex128>,
|
607
|
+
nm::math::cblas_asum<nm::Rational32,nm::Rational32>,
|
608
|
+
nm::math::cblas_asum<nm::Rational64,nm::Rational64>,
|
609
|
+
nm::math::cblas_asum<nm::Rational128,nm::Rational128>,
|
610
|
+
nm::math::cblas_asum<nm::RubyObject,nm::RubyObject>
|
611
|
+
};
|
612
|
+
|
613
|
+
nm::dtype_t dtype = NM_DTYPE(x);
|
614
|
+
|
615
|
+
// Determine the return dtype and allocate it
|
616
|
+
nm::dtype_t rdtype = dtype;
|
617
|
+
if (dtype == nm::COMPLEX64) rdtype = nm::FLOAT32;
|
618
|
+
else if (dtype == nm::COMPLEX128) rdtype = nm::FLOAT64;
|
619
|
+
|
620
|
+
void *Result = ALLOCA_N(char, DTYPE_SIZES[rdtype]);
|
621
|
+
|
622
|
+
ttable[dtype](FIX2INT(n), NM_STORAGE_DENSE(x)->elements, FIX2INT(incx), Result);
|
623
|
+
|
624
|
+
return rubyobj_from_cval(Result, rdtype).rval;
|
625
|
+
}
|
626
|
+
|
627
|
+
|
628
|
+
|
629
|
+
|
518
630
|
/* Call any of the cblas_xgemm functions as directly as possible.
|
519
631
|
*
|
520
632
|
* The cblas_xgemm functions (dgemm, sgemm, cgemm, and zgemm) define the following operation:
|
data/ext/nmatrix/util/math.h
CHANGED
@@ -1026,33 +1026,31 @@ inline bool gemv(const enum CBLAS_TRANSPOSE Trans, const int M, const int N, con
|
|
1026
1026
|
|
1027
1027
|
// Yale: numeric matrix multiply c=a*b
|
1028
1028
|
template <typename DType, typename IType>
|
1029
|
-
inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const DType* a, const bool diaga,
|
1029
|
+
inline void numbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const DType* a, const bool diaga,
|
1030
1030
|
const IType* ib, const IType* jb, const DType* b, const bool diagb, IType* ic, IType* jc, DType* c, const bool diagc) {
|
1031
|
-
|
1032
|
-
|
1031
|
+
const unsigned int max_lmn = std::max(std::max(m, n), l);
|
1032
|
+
IType next[max_lmn];
|
1033
|
+
DType sums[max_lmn];
|
1033
1034
|
|
1034
1035
|
DType v;
|
1035
1036
|
|
1036
1037
|
IType head, length, temp, ndnz = 0;
|
1037
|
-
IType jj_start, jj_end, kk_start, kk_end;
|
1038
|
-
IType i, j, k, kk, jj;
|
1039
1038
|
IType minmn = std::min(m,n);
|
1039
|
+
IType minlm = std::min(l,m);
|
1040
1040
|
|
1041
|
-
for (
|
1042
|
-
next[
|
1043
|
-
sums[
|
1041
|
+
for (IType idx = 0; idx < max_lmn; ++idx) { // initialize scratch arrays
|
1042
|
+
next[idx] = std::numeric_limits<IType>::max();
|
1043
|
+
sums[idx] = 0;
|
1044
1044
|
}
|
1045
1045
|
|
1046
|
-
for (i = 0; i < n; ++i) { // walk down the rows
|
1046
|
+
for (IType i = 0; i < n; ++i) { // walk down the rows
|
1047
1047
|
head = std::numeric_limits<IType>::max()-1; // head gets assigned as whichever column of B's row j we last visited
|
1048
1048
|
length = 0;
|
1049
1049
|
|
1050
|
-
|
1051
|
-
|
1050
|
+
for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // walk through entries in each row
|
1051
|
+
IType j;
|
1052
1052
|
|
1053
|
-
|
1054
|
-
|
1055
|
-
if (jj == jj_end) { // if we're in the last entry for this row:
|
1053
|
+
if (jj == ia[i+1]) { // if we're in the last entry for this row:
|
1056
1054
|
if (!diaga || i >= minmn) continue;
|
1057
1055
|
j = i; // if it's a new Yale matrix, and last entry, get the diagonal position (j) and entry (ajj)
|
1058
1056
|
v = a[i];
|
@@ -1061,12 +1059,12 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
|
|
1061
1059
|
v = a[jj];
|
1062
1060
|
}
|
1063
1061
|
|
1064
|
-
|
1065
|
-
|
1066
|
-
|
1062
|
+
for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) {
|
1063
|
+
|
1064
|
+
IType k;
|
1067
1065
|
|
1068
|
-
if (kk ==
|
1069
|
-
if (!diagb || j >=
|
1066
|
+
if (kk == ib[j+1]) { // Get the column id for that entry
|
1067
|
+
if (!diagb || j >= minlm) continue;
|
1070
1068
|
k = j;
|
1071
1069
|
sums[k] += v*b[k];
|
1072
1070
|
} else {
|
@@ -1079,10 +1077,10 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
|
|
1079
1077
|
head = k;
|
1080
1078
|
++length;
|
1081
1079
|
}
|
1082
|
-
}
|
1083
|
-
}
|
1080
|
+
} // end of kk loop
|
1081
|
+
} // end of jj loop
|
1084
1082
|
|
1085
|
-
for (jj = 0; jj < length; ++jj) {
|
1083
|
+
for (IType jj = 0; jj < length; ++jj) {
|
1086
1084
|
if (sums[head] != 0) {
|
1087
1085
|
if (diagc && head == i) {
|
1088
1086
|
c[head] = sums[head];
|
@@ -1105,22 +1103,64 @@ inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia,
|
|
1105
1103
|
} /* numbmm_ */
|
1106
1104
|
|
1107
1105
|
|
1106
|
+
/*
|
1107
|
+
template <typename DType, typename IType>
|
1108
|
+
inline void new_yale_matrix_multiply(const unsigned int m, const IType* ija, const DType* a, const IType* ijb, const DType* b, YALE_STORAGE* c_storage) {
|
1109
|
+
unsigned int n = c_storage->shape[0],
|
1110
|
+
l = c_storage->shape[1];
|
1111
|
+
|
1112
|
+
// Create a working vector of dimension max(m,l,n) and initial value IType::max():
|
1113
|
+
std::vector<IType> mask(std::max(std::max(m,l),n), std::numeric_limits<IType>::max());
|
1114
|
+
|
1115
|
+
for (IType i = 0; i < n; ++i) { // A.rows.each_index do |i|
|
1116
|
+
|
1117
|
+
IType j, k;
|
1118
|
+
size_t ndnz;
|
1119
|
+
|
1120
|
+
for (IType jj = ija[i]; jj <= ija[i+1]; ++jj) { // walk through column pointers for row i of A
|
1121
|
+
j = (jj == ija[i+1]) ? i : ija[jj]; // Get the current column index (handle diagonals last)
|
1122
|
+
|
1123
|
+
if (j >= m) {
|
1124
|
+
if (j == ija[jj]) rb_raise(rb_eIndexError, "ija array for left-hand matrix contains an out-of-bounds column index %u at position %u", jj, j);
|
1125
|
+
else break;
|
1126
|
+
}
|
1127
|
+
|
1128
|
+
for (IType kk = ijb[j]; kk <= ijb[j+1]; ++kk) { // walk through column pointers for row j of B
|
1129
|
+
if (j >= m) continue; // first of all, does B *have* a row j?
|
1130
|
+
k = (kk == ijb[j+1]) ? j : ijb[kk]; // Get the current column index (handle diagonals last)
|
1131
|
+
|
1132
|
+
if (k >= l) {
|
1133
|
+
if (k == ijb[kk]) rb_raise(rb_eIndexError, "ija array for right-hand matrix contains an out-of-bounds column index %u at position %u", kk, k);
|
1134
|
+
else break;
|
1135
|
+
}
|
1136
|
+
|
1137
|
+
if (mask[k] == )
|
1138
|
+
}
|
1139
|
+
|
1140
|
+
}
|
1141
|
+
}
|
1142
|
+
}
|
1143
|
+
*/
|
1108
1144
|
|
1109
1145
|
// Yale: Symbolic matrix multiply c=a*b
|
1110
1146
|
template <typename IType>
|
1111
|
-
inline
|
1147
|
+
inline size_t symbmm(const unsigned int n, const unsigned int m, const unsigned int l, const IType* ia, const IType* ja, const bool diaga,
|
1112
1148
|
const IType* ib, const IType* jb, const bool diagb, IType* ic, const bool diagc) {
|
1113
|
-
|
1114
|
-
IType
|
1149
|
+
unsigned int max_lmn = std::max(std::max(m,n), l);
|
1150
|
+
IType mask[max_lmn]; // INDEX in the SMMP paper.
|
1151
|
+
IType j, k; /* Local variables */
|
1152
|
+
size_t ndnz = n;
|
1115
1153
|
|
1154
|
+
for (IType idx = 0; idx < max_lmn; ++idx)
|
1155
|
+
mask[idx] = std::numeric_limits<IType>::max();
|
1116
1156
|
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
else ic[0] = 0;
|
1157
|
+
if (ic) { // Only write to ic if it's supplied; otherwise, we're just counting.
|
1158
|
+
if (diagc) ic[0] = n+1;
|
1159
|
+
else ic[0] = 0;
|
1160
|
+
}
|
1122
1161
|
|
1123
1162
|
IType minmn = std::min(m,n);
|
1163
|
+
IType minlm = std::min(l,m);
|
1124
1164
|
|
1125
1165
|
for (IType i = 0; i < n; ++i) { // MAIN LOOP: through rows
|
1126
1166
|
|
@@ -1132,9 +1172,9 @@ inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia,
|
|
1132
1172
|
j = i;
|
1133
1173
|
} else j = ja[jj];
|
1134
1174
|
|
1135
|
-
for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns of row J in matrix B.
|
1175
|
+
for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns K of row J in matrix B.
|
1136
1176
|
if (kk == ib[j+1]) {
|
1137
|
-
if (!diagb || j >=
|
1177
|
+
if (!diagb || j >= minlm) continue;
|
1138
1178
|
k = j;
|
1139
1179
|
} else k = jb[kk];
|
1140
1180
|
|
@@ -1145,65 +1185,138 @@ inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia,
|
|
1145
1185
|
}
|
1146
1186
|
}
|
1147
1187
|
|
1148
|
-
if (diagc &&
|
1188
|
+
if (diagc && mask[i] == std::numeric_limits<IType>::max()) --ndnz;
|
1149
1189
|
|
1150
|
-
ic[i+1] = ndnz;
|
1190
|
+
if (ic) ic[i+1] = ndnz;
|
1151
1191
|
}
|
1152
|
-
} /* symbmm_ */
|
1153
1192
|
|
1193
|
+
return ndnz;
|
1194
|
+
} /* symbmm_ */
|
1154
1195
|
|
1155
|
-
//TODO: More efficient sorting algorithm than selection sort would be nice, probably.
|
1156
|
-
// Remember, we're dealing with unique keys, which simplifies things.
|
1157
|
-
// Doesn't have to be in-place, since we probably just multiplied and that wasn't in-place.
|
1158
|
-
template <typename DType, typename IType>
|
1159
|
-
inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
|
1160
|
-
IType jj, min, min_jj;
|
1161
|
-
DType temp_val;
|
1162
1196
|
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1197
|
+
// In-place quicksort (from Wikipedia) -- called by smmp_sort_columns, below. All functions are inclusive of left, right.
|
1198
|
+
namespace smmp_sort {
  // Sub-ranges with right - left below this value are finished by insertion sort.
  const size_t THRESHOLD = 4;  // switch to insertion sort for 4 elements or fewer

  /*
   * Debugging aid: writes "column:value" pairs for positions left..right
   * (inclusive) to std::cerr, followed by a newline.
   */
  template <typename DType, typename IType>
  void print_array(DType* vals, IType* array, IType left, IType right) {
    for (IType i = left; i <= right; ++i) {
      std::cerr << array[i] << ":" << vals[i] << " ";
    }
    std::cerr << std::endl;
  }

  /*
   * Partitions positions left..right (inclusive) of the parallel arrays
   * array (sort keys) and vals (payload) around the key stored at position
   * pivot. Keys less than or equal to the pivot key end up before it.
   *
   * Returns the final position of the pivot entry.
   */
  template <typename DType, typename IType>
  IType partition(DType* vals, IType* array, IType left, IType right, IType pivot) {
    const IType pivotJ = array[pivot];

    // Park the pivot entry at the right end while the rest is partitioned.
    std::swap(array[pivot], array[right]);
    std::swap(vals[pivot],  vals[right]);

    IType store = left;
    for (IType idx = left; idx < right; ++idx) {
      if (array[idx] <= pivotJ) {
        std::swap(array[idx], array[store]);
        std::swap(vals[idx],  vals[store]);
        ++store;
      }
    }

    // Move the pivot entry into its final slot.
    std::swap(array[store], array[right]);
    std::swap(vals[store],  vals[right]);

    return store;
  }

  /*
   * Returns the median of the three arguments.
   */
  template <typename IType>
  IType median(IType a, IType b, IType c) {
    if (a < b) {
      if (b < c) return b; // a b c
      if (a < c) return c; // a c b
      return a;            // c a b

    } else { // a >= b
      if (a < c) return a; // b a c
      if (b < c) return c; // b c a
      return b;            // c b a
    }
  }

  /*
   * Insertion sort of positions left..right (inclusive) by key in array,
   * moving the matching entries of vals along with their keys.
   */
  template <typename DType, typename IType>
  void insertion_sort(DType* vals, IType* array, IType left, IType right) {
    for (IType idx = left; idx <= right; ++idx) {
      const IType col_to_insert = array[idx];
      const DType val_to_insert = vals[idx];

      IType hole_pos = idx;
      for (; hole_pos > left && col_to_insert < array[hole_pos-1]; --hole_pos) {
        array[hole_pos] = array[hole_pos - 1]; // shift the larger column index up
        vals[hole_pos]  = vals[hole_pos - 1];  // value goes along with it
      }

      array[hole_pos] = col_to_insert;
      vals[hole_pos]  = val_to_insert;
    }
  }

  /*
   * In-place quicksort of positions left..right (inclusive) by key in array,
   * keeping vals paired with the keys. Short ranges are delegated to
   * insertion_sort.
   *
   * The recursive calls are guarded so that pivot-1 / pivot+1 are never
   * evaluated when the pivot sits at the edge of the range: with an unsigned
   * IType and left == 0, pivot-1 would otherwise wrap around to the largest
   * representable index and the recursion would walk far outside the arrays.
   */
  template <typename DType, typename IType>
  void quicksort(DType* vals, IType* array, IType left, IType right) {
    if (!(left < right)) return;

    if (right - left < THRESHOLD) {
      insertion_sort(vals, array, left, right);
      return;
    }

    // Midpoint computed as an offset from left so the sum cannot overflow.
    // Note that median() is applied to the three *positions*, not to the keys
    // stored there.
    const IType mid = static_cast<IType>(left + (right - left) / 2);
    IType pivot     = median(left, right, mid);
    pivot           = partition(vals, array, left, right, pivot);

    // recursively sort elements smaller than the pivot
    if (pivot > left)  quicksort<DType,IType>(vals, array, left, static_cast<IType>(pivot - 1));

    // recursively sort elements at least as big as the pivot
    if (pivot < right) quicksort<DType,IType>(vals, array, static_cast<IType>(pivot + 1), right);
  }

} // end of namespace smmp_sort
|
1293
|
+
|
1294
|
+
|
1295
|
+
/*
|
1296
|
+
* For use following symbmm and numbmm. Sorts the matrix entries in each row according to the column index.
|
1297
|
+
* This utilizes quicksort, which is an in-place unstable sort (since there are no duplicate entries, we don't care
|
1298
|
+
* about stability).
|
1299
|
+
*
|
1300
|
+
* TODO: It might be worthwhile to do a test for free memory, and if available, use an unstable sort that isn't in-place.
|
1301
|
+
*
|
1302
|
+
* TODO: It's actually probably possible to write an even faster sort, since symbmm/numbmm are not producing a random
|
1303
|
+
* ordering. If someone is doing a lot of Yale matrix multiplication, it might benefit them to consider even insertion
|
1304
|
+
* sort.
|
1305
|
+
*/
|
1306
|
+
template <typename DType, typename IType>
|
1307
|
+
inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
|
1308
|
+
for (size_t i = 0; i < n; ++i) {
|
1309
|
+
if (ia[i+1] - ia[i] < 2) continue; // rows with fewer than two stored entries are already in order
|
1310
|
+
else if (ia[i+1] - ia[i] <= smmp_sort::THRESHOLD) {
|
1311
|
+
smmp_sort::insertion_sort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for small rows
|
1312
|
+
} else {
|
1313
|
+
smmp_sort::quicksort<DType, IType>(a, ja, ia[i], ia[i+1]-1); // faster for large rows (and may call insertion_sort as well)
|
1314
|
+
}
|
1315
|
+
}
|
1204
1316
|
}
|
1205
1317
|
|
1206
1318
|
|
1319
|
+
|
1207
1320
|
/*
|
1208
1321
|
* Transposes a generic Yale matrix (old or new). Specify new by setting diaga = true.
|
1209
1322
|
*
|
@@ -2025,7 +2138,194 @@ inline void rot(const int N, Complex128* X, const int incX, Complex128* Y, const
|
|
2025
2138
|
|
2026
2139
|
template <typename DType, typename CSDType>
|
2027
2140
|
inline void cblas_rot(const int N, void* X, const int incX, void* Y, const int incY, const void* c, const void* s) {
|
2028
|
-
rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY,
|
2141
|
+
rot<DType,CSDType>(N, reinterpret_cast<DType*>(X), incX, reinterpret_cast<DType*>(Y), incY,
|
2142
|
+
*reinterpret_cast<const CSDType*>(c), *reinterpret_cast<const CSDType*>(s));
|
2143
|
+
}
|
2144
|
+
|
2145
|
+
/*
|
2146
|
+
* Level 1 BLAS routine which returns the 2-norm of an n-vector x.
|
2147
|
+
*
|
2148
|
+
* Based on input types, these are the valid return types:
|
2149
|
+
* int -> int
|
2150
|
+
* float -> float or double
|
2151
|
+
* double -> double
|
2152
|
+
* complex64 -> float or double
|
2153
|
+
* complex128 -> double
|
2154
|
+
* rational -> rational
|
2155
|
+
*/
|
2156
|
+
template <typename ReturnDType, typename DType>
|
2157
|
+
ReturnDType nrm2(const int N, const DType* X, const int incX) {
|
2158
|
+
const DType ONE = 1, ZERO = 0;
|
2159
|
+
typename LongDType<DType>::type scale = 0, ssq = 1, absxi, temp;
|
2160
|
+
|
2161
|
+
|
2162
|
+
if ((N < 1) || (incX < 1)) return ZERO;
|
2163
|
+
else if (N == 1) return std::abs(X[0]);
|
2164
|
+
|
2165
|
+
for (int i = 0; i < N; ++i) {
|
2166
|
+
absxi = std::abs(X[i*incX]);
|
2167
|
+
if (scale < absxi) {
|
2168
|
+
temp = scale / absxi;
|
2169
|
+
scale = absxi;
|
2170
|
+
ssq = ONE + ssq * (temp * temp);
|
2171
|
+
} else {
|
2172
|
+
temp = absxi / scale;
|
2173
|
+
ssq += temp * temp;
|
2174
|
+
}
|
2175
|
+
}
|
2176
|
+
|
2177
|
+
return scale * std::sqrt( ssq );
|
2178
|
+
}
|
2179
|
+
|
2180
|
+
|
2181
|
+
#ifdef HAVE_CBLAS_H
|
2182
|
+
template <>
|
2183
|
+
inline float nrm2(const int N, const float* X, const int incX) {
|
2184
|
+
return cblas_snrm2(N, X, incX);
|
2185
|
+
}
|
2186
|
+
|
2187
|
+
template <>
|
2188
|
+
inline double nrm2(const int N, const double* X, const int incX) {
|
2189
|
+
return cblas_dnrm2(N, X, incX);
|
2190
|
+
}
|
2191
|
+
|
2192
|
+
template <>
|
2193
|
+
inline float nrm2(const int N, const Complex64* X, const int incX) {
|
2194
|
+
return cblas_scnrm2(N, X, incX);
|
2195
|
+
}
|
2196
|
+
|
2197
|
+
template <>
|
2198
|
+
inline double nrm2(const int N, const Complex128* X, const int incX) {
|
2199
|
+
return cblas_dznrm2(N, X, incX);
|
2200
|
+
}
|
2201
|
+
#else
|
2202
|
+
/*
 * Folds one complex entry, given as its real part xr and imaginary part xi,
 * into the running (scale, ssq) pair of a scaled sum of squares. The caller
 * obtains the norm as scale * sqrt(ssq).
 *
 * Arguments:
 * * xr, xi :: real and imaginary component of the entry
 * * scale  :: in/out, largest absolute component seen so far (start at 0)
 * * ssq    :: in/out, sum of squares relative to scale (start at 1)
 *
 * A component equal to zero is skipped: it contributes nothing, and while
 * scale is still zero it would otherwise evaluate 0/0 and turn ssq into NaN.
 */
template <typename FloatDType>
static inline void nrm2_complex_helper(const FloatDType& xr, const FloatDType& xi, double& scale, double& ssq) {
  double parts[2];
  parts[0] = std::abs(xr);
  parts[1] = std::abs(xi);

  for (int p = 0; p < 2; ++p) {
    const double absx = parts[p];

    if (absx == 0.0) continue; // NaN compares unequal to 0, so it still propagates below

    if (scale < absx) {
      // New largest magnitude: rescale the accumulated sum to it.
      const double temp = scale / absx;
      scale = absx;
      ssq   = 1.0 + ssq * (temp * temp);
    } else {
      const double temp = absx / scale;
      ssq += temp * temp;
    }
  }
}
|
2224
|
+
|
2225
|
+
template <>
|
2226
|
+
float nrm2(const int N, const Complex64* X, const int incX) {
|
2227
|
+
double scale = 0, ssq = 1, temp;
|
2228
|
+
|
2229
|
+
if ((N < 1) || (incX < 1)) return 0.0;
|
2230
|
+
|
2231
|
+
for (int i = 0; i < N; ++i) {
|
2232
|
+
nrm2_complex_helper<float>(X[i*incX].r, X[i*incX].i, scale, temp);
|
2233
|
+
}
|
2234
|
+
|
2235
|
+
return scale * std::sqrt( ssq );
|
2236
|
+
}
|
2237
|
+
|
2238
|
+
template <>
|
2239
|
+
double nrm2(const int N, const Complex128* X, const int incX) {
|
2240
|
+
double scale = 0, ssq = 1, temp;
|
2241
|
+
|
2242
|
+
if ((N < 1) || (incX < 1)) return 0.0;
|
2243
|
+
|
2244
|
+
for (int i = 0; i < N; ++i) {
|
2245
|
+
nrm2_complex_helper<double>(X[i*incX].r, X[i*incX].i, scale, temp);
|
2246
|
+
}
|
2247
|
+
|
2248
|
+
return scale * std::sqrt( ssq );
|
2249
|
+
}
|
2250
|
+
#endif
|
2251
|
+
|
2252
|
+
template <typename ReturnDType, typename DType>
|
2253
|
+
inline void cblas_nrm2(const int N, const void* X, const int incX, void* result) {
|
2254
|
+
*reinterpret_cast<ReturnDType*>( result ) = nrm2<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
|
2255
|
+
}
|
2256
|
+
|
2257
|
+
/*
|
2258
|
+
* Level 1 BLAS routine which sums the absolute values of a vector's contents. If the vector consists of complex values,
|
2259
|
+
* the routine sums the absolute values of the real and imaginary components as well.
|
2260
|
+
*
|
2261
|
+
* So, based on input types, these are the valid return types:
|
2262
|
+
* int -> int
|
2263
|
+
* float -> float or double
|
2264
|
+
* double -> double
|
2265
|
+
* complex64 -> float or double
|
2266
|
+
* complex128 -> double
|
2267
|
+
* rational -> rational
|
2268
|
+
*/
|
2269
|
+
template <typename ReturnDType, typename DType>
|
2270
|
+
inline ReturnDType asum(const int N, const DType* X, const int incX) {
|
2271
|
+
ReturnDType sum = 0;
|
2272
|
+
if ((N > 0) && (incX > 0)) {
|
2273
|
+
for (int i = 0; i < N; ++i) {
|
2274
|
+
sum += std::abs(X[i*incX]);
|
2275
|
+
}
|
2276
|
+
}
|
2277
|
+
return sum;
|
2278
|
+
}
|
2279
|
+
|
2280
|
+
|
2281
|
+
#ifdef HAVE_CBLAS_H
|
2282
|
+
template <>
|
2283
|
+
inline float asum(const int N, const float* X, const int incX) {
|
2284
|
+
return cblas_sasum(N, X, incX);
|
2285
|
+
}
|
2286
|
+
|
2287
|
+
template <>
|
2288
|
+
inline double asum(const int N, const double* X, const int incX) {
|
2289
|
+
return cblas_dasum(N, X, incX);
|
2290
|
+
}
|
2291
|
+
|
2292
|
+
template <>
|
2293
|
+
inline float asum(const int N, const Complex64* X, const int incX) {
|
2294
|
+
return cblas_scasum(N, X, incX);
|
2295
|
+
}
|
2296
|
+
|
2297
|
+
template <>
|
2298
|
+
inline double asum(const int N, const Complex128* X, const int incX) {
|
2299
|
+
return cblas_dzasum(N, X, incX);
|
2300
|
+
}
|
2301
|
+
#else
|
2302
|
+
template <>
|
2303
|
+
inline float asum(const int N, const Complex64* X, const int incX) {
|
2304
|
+
float sum = 0;
|
2305
|
+
if ((N > 0) && (incX > 0)) {
|
2306
|
+
for (int i = 0; i < N; ++i) {
|
2307
|
+
sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
|
2308
|
+
}
|
2309
|
+
}
|
2310
|
+
return sum;
|
2311
|
+
}
|
2312
|
+
|
2313
|
+
template <>
|
2314
|
+
inline double asum(const int N, const Complex128* X, const int incX) {
|
2315
|
+
double sum = 0;
|
2316
|
+
if ((N > 0) && (incX > 0)) {
|
2317
|
+
for (int i = 0; i < N; ++i) {
|
2318
|
+
sum += std::abs(X[i*incX].r) + std::abs(X[i*incX].i);
|
2319
|
+
}
|
2320
|
+
}
|
2321
|
+
return sum;
|
2322
|
+
}
|
2323
|
+
#endif
|
2324
|
+
|
2325
|
+
|
2326
|
+
template <typename ReturnDType, typename DType>
|
2327
|
+
inline void cblas_asum(const int N, const void* X, const int incX, void* sum) {
|
2328
|
+
*reinterpret_cast<ReturnDType*>( sum ) = asum<ReturnDType, DType>( N, reinterpret_cast<const DType*>(X), incX );
|
2029
2329
|
}
|
2030
2330
|
|
2031
2331
|
|