nmatrix 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. data/.gitignore +27 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -5
  4. data/Guardfile +6 -0
  5. data/History.txt +33 -0
  6. data/Manifest.txt +41 -38
  7. data/README.rdoc +88 -11
  8. data/Rakefile +35 -53
  9. data/ext/nmatrix/data/complex.h +372 -0
  10. data/ext/nmatrix/data/data.cpp +275 -0
  11. data/ext/nmatrix/data/data.h +707 -0
  12. data/ext/nmatrix/data/rational.h +421 -0
  13. data/ext/nmatrix/data/ruby_object.h +446 -0
  14. data/ext/nmatrix/extconf.rb +101 -51
  15. data/ext/nmatrix/new_extconf.rb +56 -0
  16. data/ext/nmatrix/nmatrix.cpp +1609 -0
  17. data/ext/nmatrix/nmatrix.h +265 -849
  18. data/ext/nmatrix/ruby_constants.cpp +134 -0
  19. data/ext/nmatrix/ruby_constants.h +103 -0
  20. data/ext/nmatrix/storage/common.cpp +70 -0
  21. data/ext/nmatrix/storage/common.h +170 -0
  22. data/ext/nmatrix/storage/dense.cpp +665 -0
  23. data/ext/nmatrix/storage/dense.h +116 -0
  24. data/ext/nmatrix/storage/list.cpp +1088 -0
  25. data/ext/nmatrix/storage/list.h +129 -0
  26. data/ext/nmatrix/storage/storage.cpp +658 -0
  27. data/ext/nmatrix/storage/storage.h +99 -0
  28. data/ext/nmatrix/storage/yale.cpp +1601 -0
  29. data/ext/nmatrix/storage/yale.h +208 -0
  30. data/ext/nmatrix/ttable_helper.rb +126 -0
  31. data/ext/nmatrix/{yale/smmp1_header.template.c → types.h} +36 -9
  32. data/ext/nmatrix/util/io.cpp +295 -0
  33. data/ext/nmatrix/util/io.h +117 -0
  34. data/ext/nmatrix/util/lapack.h +1175 -0
  35. data/ext/nmatrix/util/math.cpp +557 -0
  36. data/ext/nmatrix/util/math.h +1363 -0
  37. data/ext/nmatrix/util/sl_list.cpp +475 -0
  38. data/ext/nmatrix/util/sl_list.h +255 -0
  39. data/ext/nmatrix/util/util.h +78 -0
  40. data/lib/nmatrix/blas.rb +70 -0
  41. data/lib/nmatrix/io/mat5_reader.rb +567 -0
  42. data/lib/nmatrix/io/mat_reader.rb +162 -0
  43. data/lib/{string.rb → nmatrix/monkeys.rb} +49 -2
  44. data/lib/nmatrix/nmatrix.rb +199 -0
  45. data/lib/nmatrix/nvector.rb +103 -0
  46. data/lib/nmatrix/version.rb +27 -0
  47. data/lib/nmatrix.rb +22 -230
  48. data/nmatrix.gemspec +59 -0
  49. data/scripts/mac-brew-gcc.sh +47 -0
  50. data/spec/4x4_sparse.mat +0 -0
  51. data/spec/4x5_dense.mat +0 -0
  52. data/spec/blas_spec.rb +47 -0
  53. data/spec/elementwise_spec.rb +164 -0
  54. data/spec/io_spec.rb +60 -0
  55. data/spec/lapack_spec.rb +52 -0
  56. data/spec/math_spec.rb +96 -0
  57. data/spec/nmatrix_spec.rb +93 -89
  58. data/spec/nmatrix_yale_spec.rb +52 -36
  59. data/spec/nvector_spec.rb +1 -1
  60. data/spec/slice_spec.rb +257 -0
  61. data/spec/spec_helper.rb +51 -0
  62. data/spec/utm5940.mtx +83844 -0
  63. metadata +113 -71
  64. data/.autotest +0 -23
  65. data/.gemtest +0 -0
  66. data/ext/nmatrix/cblas.c +0 -150
  67. data/ext/nmatrix/dense/blas_header.template.c +0 -52
  68. data/ext/nmatrix/dense/elementwise.template.c +0 -107
  69. data/ext/nmatrix/dense/gemm.template.c +0 -159
  70. data/ext/nmatrix/dense/gemv.template.c +0 -130
  71. data/ext/nmatrix/dense/rationalmath.template.c +0 -68
  72. data/ext/nmatrix/dense.c +0 -307
  73. data/ext/nmatrix/depend +0 -18
  74. data/ext/nmatrix/generator/syntax_tree.rb +0 -481
  75. data/ext/nmatrix/generator.rb +0 -594
  76. data/ext/nmatrix/list.c +0 -774
  77. data/ext/nmatrix/nmatrix.c +0 -1977
  78. data/ext/nmatrix/rational.c +0 -98
  79. data/ext/nmatrix/yale/complexmath.template.c +0 -71
  80. data/ext/nmatrix/yale/elementwise.template.c +0 -46
  81. data/ext/nmatrix/yale/elementwise_op.template.c +0 -73
  82. data/ext/nmatrix/yale/numbmm.template.c +0 -94
  83. data/ext/nmatrix/yale/smmp1.template.c +0 -21
  84. data/ext/nmatrix/yale/smmp2.template.c +0 -43
  85. data/ext/nmatrix/yale/smmp2_header.template.c +0 -46
  86. data/ext/nmatrix/yale/sort_columns.template.c +0 -56
  87. data/ext/nmatrix/yale/symbmm.template.c +0 -54
  88. data/ext/nmatrix/yale/transp.template.c +0 -68
  89. data/ext/nmatrix/yale.c +0 -726
  90. data/lib/array.rb +0 -67
  91. data/spec/syntax_tree_spec.rb +0 -46
@@ -0,0 +1,1363 @@
1
+ /////////////////////////////////////////////////////////////////////
2
+ // = NMatrix
3
+ //
4
+ // A linear algebra library for scientific computation in Ruby.
5
+ // NMatrix is part of SciRuby.
6
+ //
7
+ // NMatrix was originally inspired by and derived from NArray, by
8
+ // Masahiro Tanaka: http://narray.rubyforge.org
9
+ //
10
+ // == Copyright Information
11
+ //
12
+ // SciRuby is Copyright (c) 2010 - 2012, Ruby Science Foundation
13
+ // NMatrix is Copyright (c) 2012, Ruby Science Foundation
14
+ //
15
+ // Please see LICENSE.txt for additional copyright notices.
16
+ //
17
+ // == Contributing
18
+ //
19
+ // By contributing source code to SciRuby, you agree to be bound by
20
+ // our Contributor Agreement:
21
+ //
22
+ // * https://github.com/SciRuby/sciruby/wiki/Contributor-Agreement
23
+ //
24
+ // == math.h
25
+ //
26
+ // Header file for math functions, interfacing with BLAS, etc.
27
+ //
28
+ // For instructions on adding CBLAS and CLAPACK functions, see the
29
+ // beginning of math.cpp.
30
+ //
31
+ // Some of these functions are from ATLAS. Here is the license for
32
+ // ATLAS:
33
+ //
34
+ /*
35
+ * Automatically Tuned Linear Algebra Software v3.8.4
36
+ * (C) Copyright 1999 R. Clint Whaley
37
+ *
38
+ * Redistribution and use in source and binary forms, with or without
39
+ * modification, are permitted provided that the following conditions
40
+ * are met:
41
+ * 1. Redistributions of source code must retain the above copyright
42
+ * notice, this list of conditions and the following disclaimer.
43
+ * 2. Redistributions in binary form must reproduce the above copyright
44
+ * notice, this list of conditions, and the following disclaimer in the
45
+ * documentation and/or other materials provided with the distribution.
46
+ * 3. The name of the ATLAS group or the names of its contributers may
47
+ * not be used to endorse or promote products derived from this
48
+ * software without specific written permission.
49
+ *
50
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
51
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
52
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
53
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
54
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
55
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
56
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
57
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
58
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
59
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
60
+ * POSSIBILITY OF SUCH DAMAGE.
61
+ *
62
+ */
63
+
64
+ #ifndef MATH_H
65
+ #define MATH_H
66
+
67
+ /*
68
+ * Standard Includes
69
+ */
70
+
71
+ extern "C" { // These need to be in an extern "C" block or you'll get all kinds of undefined symbol errors.
72
+ #include <cblas.h>
73
+ //#include <clapack.h>
74
+ }
75
+
76
+ #include <algorithm> // std::min, std::max
77
+ #include <limits> // std::numeric_limits
78
+
79
+ /*
80
+ * Project Includes
81
+ */
82
+ #include "data/data.h"
83
+ #include "lapack.h"
84
+
85
+ /*
86
+ * Macros
87
+ */
88
+
89
+ /*
90
+ * Data
91
+ */
92
+
93
+
94
+ extern "C" {
95
+ /*
96
+ * C accessors.
97
+ */
98
+ void nm_math_det_exact(const int M, const void* elements, const int lda, dtype_t dtype, void* result);
99
+ void nm_math_transpose_generic(const size_t M, const size_t N, const void* A, const int lda, void* B, const int ldb, size_t element_size);
100
+ void nm_math_init_blas(void);
101
+ }
102
+
103
+
104
+ namespace nm {
105
+ namespace math {
106
+
107
+ /*
108
+ * Types
109
+ */
110
+
111
+
112
+ // These allow an increase in precision for intermediate values of gemm and gemv.
113
+ // See also: http://stackoverflow.com/questions/11873694/how-does-one-increase-precision-in-c-templates-in-a-template-typename-dependen
114
+ template <typename DType> struct LongDType;
115
+ template <> struct LongDType<uint8_t> { typedef int16_t type; };
116
+ template <> struct LongDType<int8_t> { typedef int16_t type; };
117
+ template <> struct LongDType<int16_t> { typedef int32_t type; };
118
+ template <> struct LongDType<int32_t> { typedef int64_t type; };
119
+ template <> struct LongDType<int64_t> { typedef int64_t type; };
120
+ template <> struct LongDType<float> { typedef double type; };
121
+ template <> struct LongDType<double> { typedef double type; };
122
+ template <> struct LongDType<Complex64> { typedef Complex128 type; };
123
+ template <> struct LongDType<Complex128> { typedef Complex128 type; };
124
+ template <> struct LongDType<Rational32> { typedef Rational128 type; };
125
+ template <> struct LongDType<Rational64> { typedef Rational128 type; };
126
+ template <> struct LongDType<Rational128> { typedef Rational128 type; };
127
+ template <> struct LongDType<RubyObject> { typedef RubyObject type; };
128
+
129
+ /*
130
+ * Functions
131
+ */
132
+
133
/*
 * Multiplicative inverse of a value. The generic version delegates to the
 * type's own inverse() method (complex, rational, ruby object); the primitive
 * floating-point types are overridden with plain division.
 */
template <typename DType>
inline DType numeric_inverse(const DType& n) {
  return n.inverse();
}
template <> inline float  numeric_inverse<float>(const float& n)   { return 1 / n; }
template <> inline double numeric_inverse<double>(const double& n) { return 1 / n; }
140
+
141
+ /*
142
+ * This version of trsm doesn't do any error checks and only works on column-major matrices.
143
+ *
144
+ * For row major, call trsm<DType> instead. That will handle necessary changes-of-variables
145
+ * and parameter checks.
146
+ */
147
+ template <typename DType>
148
+ inline void trsm_nothrow(const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
149
+ const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
150
+ const int m, const int n, const DType alpha, const DType* a,
151
+ const int lda, DType* b, const int ldb)
152
+ {
153
+ if (m == 0 || n == 0) return; /* Quick return if possible. */
154
+
155
+ if (alpha == 0) { // Handle alpha == 0
156
+ for (int j = 0; j < n; ++j) {
157
+ for (int i = 0; i < m; ++i) {
158
+ b[i + j * ldb] = 0;
159
+ }
160
+ }
161
+ return;
162
+ }
163
+
164
+ if (side == CblasLeft) {
165
+ if (trans_a == CblasNoTrans) {
166
+
167
+ /* Form B := alpha*inv( A )*B. */
168
+ if (uplo == CblasUpper) {
169
+ for (int j = 0; j < n; ++j) {
170
+ if (alpha != 1) {
171
+ for (int i = 0; i < m; ++i) {
172
+ b[i + j * ldb] = alpha * b[i + j * ldb];
173
+ }
174
+ }
175
+ for (int k = m-1; k >= 0; --k) {
176
+ if (b[k + j * ldb] != 0) {
177
+ if (diag == CblasNonUnit) {
178
+ b[k + j * ldb] /= a[k + k * lda];
179
+ }
180
+
181
+ for (int i = 0; i < k-1; ++i) {
182
+ b[i + j * ldb] -= b[k + j * ldb] * a[i + k * lda];
183
+ }
184
+ }
185
+ }
186
+ }
187
+ } else {
188
+ for (int j = 0; j < n; ++j) {
189
+ if (alpha != 1) {
190
+ for (int i = 0; i < m; ++i) {
191
+ b[i + j * ldb] = alpha * b[i + j * ldb];
192
+ }
193
+ }
194
+ for (int k = 0; k < m; ++k) {
195
+ if (b[k + j * ldb] != 0.) {
196
+ if (diag == CblasNonUnit) {
197
+ b[k + j * ldb] /= a[k + k * lda];
198
+ }
199
+ for (int i = k+1; i < m; ++i) {
200
+ b[i + j * ldb] -= b[k + j * ldb] * a[i + k * lda];
201
+ }
202
+ }
203
+ }
204
+ }
205
+ }
206
+ } else { // CblasTrans
207
+
208
+ /* Form B := alpha*inv( A**T )*B. */
209
+ if (uplo == CblasUpper) {
210
+ for (int j = 0; j < n; ++j) {
211
+ for (int i = 0; i < m; ++i) {
212
+ DType temp = alpha * b[i + j * ldb];
213
+ for (int k = 0; k < i-1; ++k) {
214
+ temp -= a[k + i * lda] * b[k + j * ldb];
215
+ }
216
+ if (diag == CblasNonUnit) {
217
+ temp /= a[i + i * lda];
218
+ }
219
+ b[i + j * ldb] = temp;
220
+ }
221
+ }
222
+ } else {
223
+ for (int j = 0; j < n; ++j) {
224
+ for (int i = m-1; i >= 0; --i) {
225
+ DType temp= alpha * b[i + j * ldb];
226
+ for (int k = i+1; k < m; ++k) {
227
+ temp -= a[k + i * lda] * b[k + j * ldb];
228
+ }
229
+ if (diag == CblasNonUnit) {
230
+ temp /= a[i + i * lda];
231
+ }
232
+ b[i + j * ldb] = temp;
233
+ }
234
+ }
235
+ }
236
+ }
237
+ } else { // right side
238
+
239
+ if (trans_a == CblasNoTrans) {
240
+
241
+ /* Form B := alpha*B*inv( A ). */
242
+
243
+ if (uplo == CblasUpper) {
244
+ for (int j = 0; j < n; ++j) {
245
+ if (alpha != 1) {
246
+ for (int i = 0; i < m; ++i) {
247
+ b[i + j * ldb] = alpha * b[i + j * ldb];
248
+ }
249
+ }
250
+ for (int k = 0; k < j-1; ++k) {
251
+ if (a[k + j * lda] != 0) {
252
+ for (int i = 0; i < m; ++i) {
253
+ b[i + j * ldb] -= a[k + j * lda] * b[i + k * ldb];
254
+ }
255
+ }
256
+ }
257
+ if (diag == CblasNonUnit) {
258
+ DType temp = 1 / a[j + j * lda];
259
+ for (int i = 0; i < m; ++i) {
260
+ b[i + j * ldb] = temp * b[i + j * ldb];
261
+ }
262
+ }
263
+ }
264
+ } else {
265
+ for (int j = n-1; j >= 0; --j) {
266
+ if (alpha != 1) {
267
+ for (int i = 0; i < m; ++i) {
268
+ b[i + j * ldb] = alpha * b[i + j * ldb];
269
+ }
270
+ }
271
+
272
+ for (int k = j+1; k < n; ++k) {
273
+ if (a[k + j * lda] != 0.) {
274
+ for (int i = 0; i < m; ++i) {
275
+ b[i + j * ldb] -= a[k + j * lda] * b[i + k * ldb];
276
+ }
277
+ }
278
+ }
279
+ if (diag == CblasNonUnit) {
280
+ DType temp = 1 / a[j + j * lda];
281
+
282
+ for (int i = 0; i < m; ++i) {
283
+ b[i + j * ldb] = temp * b[i + j * ldb];
284
+ }
285
+ }
286
+ }
287
+ }
288
+ } else { // CblasTrans
289
+
290
+ /* Form B := alpha*B*inv( A**T ). */
291
+
292
+ if (uplo == CblasUpper) {
293
+ for (int k = n-1; k >= 0; --k) {
294
+ if (diag == CblasNonUnit) {
295
+ DType temp= 1 / a[k + k * lda];
296
+ for (int i = 0; i < m; ++i) {
297
+ b[i + k * ldb] = temp * b[i + k * ldb];
298
+ }
299
+ }
300
+ for (int j = 0; j < k-1; ++j) {
301
+ if (a[j + k * lda] != 0.) {
302
+ DType temp= a[j + k * lda];
303
+ for (int i = 0; i < m; ++i) {
304
+ b[i + j * ldb] -= temp * b[i + k * ldb];
305
+ }
306
+ }
307
+ }
308
+ if (alpha != 1) {
309
+ for (int i = 0; i < m; ++i) {
310
+ b[i + k * ldb] = alpha * b[i + k * ldb];
311
+ }
312
+ }
313
+ }
314
+ } else {
315
+ for (int k = 0; k < n; ++k) {
316
+ if (diag == CblasNonUnit) {
317
+ DType temp = 1 / a[k + k * lda];
318
+ for (int i = 0; i < m; ++i) {
319
+ b[i + k * ldb] = temp * b[i + k * ldb];
320
+ }
321
+ }
322
+ for (int j = k+1; j < n; ++j) {
323
+ if (a[j + k * lda] != 0.) {
324
+ DType temp = a[j + k * lda];
325
+ for (int i = 0; i < m; ++i) {
326
+ b[i + j * ldb] -= temp * b[i + k * ldb];
327
+ }
328
+ }
329
+ }
330
+ if (alpha != 1) {
331
+ for (int i = 0; i < m; ++i) {
332
+ b[i + k * ldb] = alpha * b[i + k * ldb];
333
+ }
334
+ }
335
+ }
336
+ }
337
+ }
338
+ }
339
+ }
340
+
341
+
342
+ /*
343
+ * BLAS' DTRSM function, generalized.
344
+ */
345
+ template <typename DType, typename = typename std::enable_if<!std::is_integral<DType>::value>::type>
346
+ inline void trsm(const enum CBLAS_ORDER order,
347
+ const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
348
+ const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
349
+ const int m, const int n, const DType alpha, const DType* a,
350
+ const int lda, DType* b, const int ldb)
351
+ {
352
+ int num_rows_a = n;
353
+ if (side == CblasLeft) num_rows_a = m;
354
+
355
+ if (lda < std::max(1,num_rows_a)) {
356
+ fprintf(stderr, "TRSM: num_rows_a = %d; got lda=%d\n", num_rows_a, lda);
357
+ rb_raise(rb_eArgError, "TRSM: Expected lda >= max(1, num_rows_a)");
358
+ }
359
+
360
+ // Test the input parameters.
361
+ if (order == CblasRowMajor) {
362
+ if (ldb < std::max(1,n)) {
363
+ fprintf(stderr, "TRSM: M=%d; got ldb=%d\n", m, ldb);
364
+ rb_raise(rb_eArgError, "TRSM: Expected ldb >= max(1,N)");
365
+ }
366
+
367
+ // For row major, need to switch side and uplo
368
+ enum CBLAS_SIDE side_ = side == CblasLeft ? CblasRight : CblasLeft;
369
+ enum CBLAS_UPLO uplo_ = uplo == CblasUpper ? CblasLower : CblasUpper;
370
+
371
+ trsm_nothrow<DType>(side_, uplo_, trans_a, diag, n, m, alpha, a, lda, b, ldb);
372
+
373
+ } else { // CblasColMajor
374
+
375
+ if (ldb < std::max(1,m)) {
376
+ fprintf(stderr, "TRSM: M=%d; got ldb=%d\n", m, ldb);
377
+ rb_raise(rb_eArgError, "TRSM: Expected ldb >= max(1,M)");
378
+ }
379
+
380
+ trsm_nothrow<DType>(side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb);
381
+
382
+ }
383
+
384
+ }
385
+
386
+
387
+ template <>
388
+ inline void trsm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
389
+ const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
390
+ const int m, const int n, const float alpha, const float* a,
391
+ const int lda, float* b, const int ldb)
392
+ {
393
+ cblas_strsm(CblasRowMajor, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb);
394
+ }
395
+
396
+ template <>
397
+ inline void trsm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
398
+ const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
399
+ const int m, const int n, const double alpha, const double* a,
400
+ const int lda, double* b, const int ldb)
401
+ {
402
+ cblas_dtrsm(CblasRowMajor, side, uplo, trans_a, diag, m, n, alpha, a, lda, b, ldb);
403
+ }
404
+
405
+
406
+ template <>
407
+ inline void trsm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
408
+ const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
409
+ const int m, const int n, const Complex64 alpha, const Complex64* a,
410
+ const int lda, Complex64* b, const int ldb)
411
+ {
412
+ cblas_ctrsm(CblasRowMajor, side, uplo, trans_a, diag, m, n, (const void*)(&alpha), (const void*)(a), lda, (void*)(b), ldb);
413
+ }
414
+
415
+ template <>
416
+ inline void trsm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side, const enum CBLAS_UPLO uplo,
417
+ const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_DIAG diag,
418
+ const int m, const int n, const Complex128 alpha, const Complex128* a,
419
+ const int lda, Complex128* b, const int ldb)
420
+ {
421
+ cblas_ztrsm(CblasRowMajor, side, uplo, trans_a, diag, m, n, (const void*)(&alpha), (const void*)(a), lda, (void*)(b), ldb);
422
+ }
423
+
424
+
425
/*
 * ATLAS function which performs row interchanges on a general rectangular matrix.
 * Modeled after the LAPACK LASWP function.
 *
 * Applies pivots piv[K1..K2-1] (read with stride inci; a negative inci walks the
 * pivot list backwards) to the N-column, column-major matrix A with leading
 * dimension lda: row i is exchanged with row piv[i]. Columns are processed in
 * panels of 32 so both rows of a swap stay warm in cache.
 *
 * This version is templated for use by template <> getrf().
 *
 * FIX: dropped the `register` storage class on the panel counters — it is
 * deprecated in C++11 and ill-formed in C++17 — and removed an unused local.
 */
template <typename DType>
inline void laswp(const int N, DType* A, const int lda, const int K1, const int K2, const int *piv, const int inci) {

  int nb = N >> 5;                 // number of full 32-column panels

  const int mr   = N - (nb << 5);  // leftover columns after the full panels
  const int incA = lda << 5;       // pointer step from one 32-column panel to the next

  if (K2 < K1) return;

  // i1/i2 are the first and last pivot indices; walk forward or backward
  // depending on the sign of inci.
  int i1, i2;
  if (inci < 0) {
    piv -= (K2-1) * inci;
    i1 = K2 - 1;
    i2 = K1;
  } else {
    piv += K1 * inci;
    i1 = K1;
    i2 = K2-1;
  }

  if (nb) {
    do { // one iteration per 32-column panel
      const int* ipiv = piv;
      int i = i1;
      int KeepOn;

      do {
        int ip = *ipiv; ipiv += inci;

        if (ip != i) {
          DType *a0 = &(A[i]),
                *a1 = &(A[ip]);

          // Swap rows i and ip across this panel's 32 columns.
          for (int h = 32; h; h--) {
            DType r = *a0;
            *a0 = *a1;
            *a1 = r;

            a0 += lda;
            a1 += lda;
          }
        }
        if (inci > 0) KeepOn = (++i <= i2);
        else          KeepOn = (--i >= i2);

      } while (KeepOn);
      A += incA;
    } while (--nb);
  }

  if (mr) { // leftover columns (N mod 32)
    const int* ipiv = piv;
    int i = i1;
    int KeepOn;

    do {
      int ip = *ipiv; ipiv += inci;
      if (ip != i) {
        DType *a0 = &(A[i]),
              *a1 = &(A[ip]);

        // Swap rows i and ip across the remaining mr columns.
        for (int h = mr; h; h--) {
          DType r = *a0;
          *a0 = *a1;
          *a1 = r;

          a0 += lda;
          a1 += lda;
        }
      }

      if (inci > 0) KeepOn = (++i <= i2);
      else          KeepOn = (--i >= i2);

    } while (KeepOn);
  }
}
511
+
512
+
513
+ /*
514
+ * GEneral Matrix Multiplication: based on dgemm.f from Netlib.
515
+ *
516
+ * This is an extremely inefficient algorithm. Recommend using ATLAS' version instead.
517
+ *
518
+ * Template parameters: LT -- long version of type T. Type T is the matrix dtype.
519
+ *
520
+ * This version throws no errors. Use gemm<DType> instead for error checking.
521
+ */
522
+ template <typename DType>
523
+ inline void gemm_nothrow(const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
524
+ const DType* alpha, const DType* A, const int lda, const DType* B, const int ldb, const DType* beta, DType* C, const int ldc)
525
+ {
526
+
527
+ typename LongDType<DType>::type temp;
528
+
529
+ // Quick return if possible
530
+ if (!M or !N or ((*alpha == 0 or !K) and *beta == 1)) return;
531
+
532
+ // For alpha = 0
533
+ if (*alpha == 0) {
534
+ if (*beta == 0) {
535
+ for (int j = 0; j < N; ++j)
536
+ for (int i = 0; i < M; ++i) {
537
+ C[i+j*ldc] = 0;
538
+ }
539
+ } else {
540
+ for (int j = 0; j < N; ++j)
541
+ for (int i = 0; i < M; ++i) {
542
+ C[i+j*ldc] *= *beta;
543
+ }
544
+ }
545
+ return;
546
+ }
547
+
548
+ // Start the operations
549
+ if (TransB == CblasNoTrans) {
550
+ if (TransA == CblasNoTrans) {
551
+ // C = alpha*A*B+beta*C
552
+ for (int j = 0; j < N; ++j) {
553
+ if (*beta == 0) {
554
+ for (int i = 0; i < M; ++i) {
555
+ C[i+j*ldc] = 0;
556
+ }
557
+ } else if (*beta != 1) {
558
+ for (int i = 0; i < M; ++i) {
559
+ C[i+j*ldc] *= *beta;
560
+ }
561
+ }
562
+
563
+ for (int l = 0; l < K; ++l) {
564
+ if (B[l+j*ldb] != 0) {
565
+ temp = *alpha * B[l+j*ldb];
566
+ for (int i = 0; i < M; ++i) {
567
+ C[i+j*ldc] += A[i+l*lda] * temp;
568
+ }
569
+ }
570
+ }
571
+ }
572
+
573
+ } else {
574
+
575
+ // C = alpha*A**DType*B + beta*C
576
+ for (int j = 0; j < N; ++j) {
577
+ for (int i = 0; i < M; ++i) {
578
+ temp = 0;
579
+ for (int l = 0; l < K; ++l) {
580
+ temp += A[l+i*lda] * B[l+j*ldb];
581
+ }
582
+
583
+ if (*beta == 0) {
584
+ C[i+j*ldc] = *alpha*temp;
585
+ } else {
586
+ C[i+j*ldc] = *alpha*temp + *beta*C[i+j*ldc];
587
+ }
588
+ }
589
+ }
590
+
591
+ }
592
+
593
+ } else if (TransA == CblasNoTrans) {
594
+
595
+ // C = alpha*A*B**T + beta*C
596
+ for (int j = 0; j < N; ++j) {
597
+ if (*beta == 0) {
598
+ for (int i = 0; i < M; ++i) {
599
+ C[i+j*ldc] = 0;
600
+ }
601
+ } else if (*beta != 1) {
602
+ for (int i = 0; i < M; ++i) {
603
+ C[i+j*ldc] *= *beta;
604
+ }
605
+ }
606
+
607
+ for (int l = 0; l < K; ++l) {
608
+ if (B[j+l*ldb] != 0) {
609
+ temp = *alpha * B[j+l*ldb];
610
+ for (int i = 0; i < M; ++i) {
611
+ C[i+j*ldc] += A[i+l*lda] * temp;
612
+ }
613
+ }
614
+ }
615
+
616
+ }
617
+
618
+ } else {
619
+
620
+ // C = alpha*A**DType*B**T + beta*C
621
+ for (int j = 0; j < N; ++j) {
622
+ for (int i = 0; i < M; ++i) {
623
+ temp = 0;
624
+ for (int l = 0; l < K; ++l) {
625
+ temp += A[l+i*lda] * B[j+l*ldb];
626
+ }
627
+
628
+ if (*beta == 0) {
629
+ C[i+j*ldc] = *alpha*temp;
630
+ } else {
631
+ C[i+j*ldc] = *alpha*temp + *beta*C[i+j*ldc];
632
+ }
633
+ }
634
+ }
635
+
636
+ }
637
+
638
+ return;
639
+ }
640
+
641
+
642
+
643
+ template <typename DType>
644
+ inline void gemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
645
+ const DType* alpha, const DType* A, const int lda, const DType* B, const int ldb, const DType* beta, DType* C, const int ldc)
646
+ {
647
+ if (Order == CblasRowMajor) {
648
+ if (TransA == CblasNoTrans) {
649
+ if (lda < std::max(K,1)) {
650
+ rb_raise(rb_eArgError, "lda must be >= MAX(K,1): lda=%d K=%d", lda, K);
651
+ }
652
+ } else {
653
+ if (lda < std::max(M,1)) { // && TransA == CblasTrans
654
+ rb_raise(rb_eArgError, "lda must be >= MAX(M,1): lda=%d M=%d", lda, M);
655
+ }
656
+ }
657
+
658
+ if (TransB == CblasNoTrans) {
659
+ if (ldb < std::max(N,1)) {
660
+ rb_raise(rb_eArgError, "ldb must be >= MAX(N,1): ldb=%d N=%d", ldb, N);
661
+ }
662
+ } else {
663
+ if (ldb < std::max(K,1)) {
664
+ rb_raise(rb_eArgError, "ldb must be >= MAX(K,1): ldb=%d K=%d", ldb, K);
665
+ }
666
+ }
667
+
668
+ if (ldc < std::max(N,1)) {
669
+ rb_raise(rb_eArgError, "ldc must be >= MAX(N,1): ldc=%d N=%d", ldc, N);
670
+ }
671
+ } else { // CblasColMajor
672
+ if (TransA == CblasNoTrans) {
673
+ if (lda < std::max(M,1)) {
674
+ rb_raise(rb_eArgError, "lda must be >= MAX(M,1): lda=%d M=%d", lda, M);
675
+ }
676
+ } else {
677
+ if (lda < std::max(K,1)) { // && TransA == CblasTrans
678
+ rb_raise(rb_eArgError, "lda must be >= MAX(K,1): lda=%d K=%d", lda, K);
679
+ }
680
+ }
681
+
682
+ if (TransB == CblasNoTrans) {
683
+ if (ldb < std::max(K,1)) {
684
+ rb_raise(rb_eArgError, "ldb must be >= MAX(K,1): ldb=%d N=%d", ldb, K);
685
+ }
686
+ } else {
687
+ if (ldb < std::max(N,1)) { // NOTE: This error message is actually wrong in the ATLAS source currently. Or are we wrong?
688
+ rb_raise(rb_eArgError, "ldb must be >= MAX(N,1): ldb=%d N=%d", ldb, N);
689
+ }
690
+ }
691
+
692
+ if (ldc < std::max(M,1)) {
693
+ rb_raise(rb_eArgError, "ldc must be >= MAX(M,1): ldc=%d N=%d", ldc, M);
694
+ }
695
+ }
696
+
697
+ /*
698
+ * Call SYRK when that's what the user is actually asking for; just handle beta=0, because beta=X requires
699
+ * we copy C and then subtract to preserve asymmetry.
700
+ */
701
+
702
+ if (A == B && M == N && TransA != TransB && lda == ldb && beta == 0) {
703
+ rb_raise(rb_eNotImpError, "syrk and syreflect not implemented");
704
+ /*syrk<DType>(CblasUpper, (Order == CblasColMajor) ? TransA : TransB, N, K, alpha, A, lda, beta, C, ldc);
705
+ syreflect(CblasUpper, N, C, ldc);
706
+ */
707
+ }
708
+
709
+ if (Order == CblasRowMajor) gemm_nothrow<DType>(TransB, TransA, N, M, K, alpha, B, ldb, A, lda, beta, C, ldc);
710
+ else gemm_nothrow<DType>(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
711
+
712
+ }
713
+
714
+
715
+ template <>
716
+ inline void gemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
717
+ const float* alpha, const float* A, const int lda, const float* B, const int ldb, const float* beta, float* C, const int ldc) {
718
+ cblas_sgemm(Order, TransA, TransB, M, N, K, *alpha, A, lda, B, ldb, *beta, C, ldc);
719
+ }
720
+
721
+ template <>
722
+ inline void gemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
723
+ const double* alpha, const double* A, const int lda, const double* B, const int ldb, const double* beta, double* C, const int ldc) {
724
+ cblas_dgemm(Order, TransA, TransB, M, N, K, *alpha, A, lda, B, ldb, *beta, C, ldc);
725
+ }
726
+
727
+ template <>
728
+ inline void gemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
729
+ const Complex64* alpha, const Complex64* A, const int lda, const Complex64* B, const int ldb, const Complex64* beta, Complex64* C, const int ldc) {
730
+ cblas_cgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
731
+ }
732
+
733
+ template <>
734
+ inline void gemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
735
+ const Complex128* alpha, const Complex128* A, const int lda, const Complex128* B, const int ldb, const Complex128* beta, Complex128* C, const int ldc) {
736
+ cblas_zgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
737
+ }
738
+
739
+
740
+ /*
741
+ * GEneral Matrix-Vector multiplication: based on dgemv.f from Netlib.
742
+ *
743
+ * This is an extremely inefficient algorithm. Recommend using ATLAS' version instead.
744
+ *
745
+ * Template parameters: LT -- long version of type T. Type T is the matrix dtype.
746
+ */
747
+ template <typename DType>
748
+ inline bool gemv(const enum CBLAS_TRANSPOSE Trans, const int M, const int N, const DType* alpha, const DType* A, const int lda,
749
+ const DType* X, const int incX, const DType* beta, DType* Y, const int incY) {
750
+ int lenX, lenY, i, j;
751
+ int kx, ky, iy, jx, jy, ix;
752
+
753
+ typename LongDType<DType>::type temp;
754
+
755
+ // Test the input parameters
756
+ if (Trans < 111 || Trans > 113) {
757
+ rb_raise(rb_eArgError, "GEMV: TransA must be CblasNoTrans, CblasTrans, or CblasConjTrans");
758
+ return false;
759
+ } else if (lda < std::max(1, N)) {
760
+ fprintf(stderr, "GEMV: N = %d; got lda=%d", N, lda);
761
+ rb_raise(rb_eArgError, "GEMV: Expected lda >= max(1, N)");
762
+ return false;
763
+ } else if (incX == 0) {
764
+ rb_raise(rb_eArgError, "GEMV: Expected incX != 0\n");
765
+ return false;
766
+ } else if (incY == 0) {
767
+ rb_raise(rb_eArgError, "GEMV: Expected incY != 0\n");
768
+ return false;
769
+ }
770
+
771
+ // Quick return if possible
772
+ if (!M or !N or (*alpha == 0 and *beta == 1)) return true;
773
+
774
+ if (Trans == CblasNoTrans) {
775
+ lenX = N;
776
+ lenY = M;
777
+ } else {
778
+ lenX = M;
779
+ lenY = N;
780
+ }
781
+
782
+ if (incX > 0) kx = 0;
783
+ else kx = (lenX - 1) * -incX;
784
+
785
+ if (incY > 0) ky = 0;
786
+ else ky = (lenY - 1) * -incY;
787
+
788
+ // Start the operations. In this version, the elements of A are accessed sequentially with one pass through A.
789
+ if (*beta != 1) {
790
+ if (incY == 1) {
791
+ if (*beta == 0) {
792
+ for (i = 0; i < lenY; ++i) {
793
+ Y[i] = 0;
794
+ }
795
+ } else {
796
+ for (i = 0; i < lenY; ++i) {
797
+ Y[i] *= *beta;
798
+ }
799
+ }
800
+ } else {
801
+ iy = ky;
802
+ if (*beta == 0) {
803
+ for (i = 0; i < lenY; ++i) {
804
+ Y[iy] = 0;
805
+ iy += incY;
806
+ }
807
+ } else {
808
+ for (i = 0; i < lenY; ++i) {
809
+ Y[iy] *= *beta;
810
+ iy += incY;
811
+ }
812
+ }
813
+ }
814
+ }
815
+
816
+ if (*alpha == 0) return false;
817
+
818
+ if (Trans == CblasNoTrans) {
819
+
820
+ // Form y := alpha*A*x + y.
821
+ jx = kx;
822
+ if (incY == 1) {
823
+ for (j = 0; j < N; ++j) {
824
+ if (X[jx] != 0) {
825
+ temp = *alpha * X[jx];
826
+ for (i = 0; i < M; ++i) {
827
+ Y[i] += A[j+i*lda] * temp;
828
+ }
829
+ }
830
+ jx += incX;
831
+ }
832
+ } else {
833
+ for (j = 0; j < N; ++j) {
834
+ if (X[jx] != 0) {
835
+ temp = *alpha * X[jx];
836
+ iy = ky;
837
+ for (i = 0; i < M; ++i) {
838
+ Y[iy] += A[j+i*lda] * temp;
839
+ iy += incY;
840
+ }
841
+ }
842
+ jx += incX;
843
+ }
844
+ }
845
+
846
+ } else { // TODO: Check that indices are correct! They're switched for C.
847
+
848
+ // Form y := alpha*A**DType*x + y.
849
+ jy = ky;
850
+
851
+ if (incX == 1) {
852
+ for (j = 0; j < N; ++j) {
853
+ temp = 0;
854
+ for (i = 0; i < M; ++i) {
855
+ temp += A[j+i*lda]*X[j];
856
+ }
857
+ Y[jy] += *alpha * temp;
858
+ jy += incY;
859
+ }
860
+ } else {
861
+ for (j = 0; j < N; ++j) {
862
+ temp = 0;
863
+ ix = kx;
864
+ for (i = 0; i < M; ++i) {
865
+ temp += A[j+i*lda] * X[ix];
866
+ ix += incX;
867
+ }
868
+
869
+ Y[jy] += *alpha * temp;
870
+ jy += incY;
871
+ }
872
+ }
873
+ }
874
+
875
+ return true;
876
+ } // end of GEMV
877
+
878
+ template <>
879
+ inline bool gemv(const enum CBLAS_TRANSPOSE Trans, const int M, const int N, const float* alpha, const float* A, const int lda,
880
+ const float* X, const int incX, const float* beta, float* Y, const int incY) {
881
+ cblas_sgemv(CblasRowMajor, Trans, M, N, *alpha, A, lda, X, incX, *beta, Y, incY);
882
+ return true;
883
+ }
884
+
885
/*
 * GEMV specialization for double-precision floats: delegates directly to
 * cblas_dgemv in row-major order. alpha and beta arrive by pointer (to match
 * the generic template's signature) and are dereferenced here. Always returns
 * true, since the CBLAS call reports no errors.
 */
template <>
inline bool gemv(const enum CBLAS_TRANSPOSE Trans, const int M, const int N, const double* alpha, const double* A, const int lda,
                 const double* X, const int incX, const double* beta, double* Y, const int incY) {
  cblas_dgemv(CblasRowMajor, Trans, M, N, *alpha, A, lda, X, incX, *beta, Y, incY);
  return true;
}
891
+
892
/*
 * GEMV specialization for single-precision complex: delegates directly to
 * cblas_cgemv in row-major order. Complex CBLAS routines take alpha/beta by
 * (void) pointer, so the Complex64 pointers are forwarded as-is rather than
 * dereferenced. Always returns true, since the CBLAS call reports no errors.
 */
template <>
inline bool gemv(const enum CBLAS_TRANSPOSE Trans, const int M, const int N, const Complex64* alpha, const Complex64* A, const int lda,
                 const Complex64* X, const int incX, const Complex64* beta, Complex64* Y, const int incY) {
  cblas_cgemv(CblasRowMajor, Trans, M, N, alpha, A, lda, X, incX, beta, Y, incY);
  return true;
}
898
+
899
/*
 * GEMV specialization for double-precision complex: delegates directly to
 * cblas_zgemv in row-major order. Complex CBLAS routines take alpha/beta by
 * (void) pointer, so the Complex128 pointers are forwarded as-is rather than
 * dereferenced. Always returns true, since the CBLAS call reports no errors.
 */
template <>
inline bool gemv(const enum CBLAS_TRANSPOSE Trans, const int M, const int N, const Complex128* alpha, const Complex128* A, const int lda,
                 const Complex128* X, const int incX, const Complex128* beta, Complex128* Y, const int incY) {
  cblas_zgemv(CblasRowMajor, Trans, M, N, alpha, A, lda, X, incX, beta, Y, incY);
  return true;
}
905
+
906
+
907
// Yale: numeric matrix multiply c=a*b
//
// Numeric phase of the SMMP-style sparse multiply: fills jc and c for
// C = A*B, assuming the symbolic phase (symbmm) has already computed row
// sizes. Matrices are in Yale/CSR-like storage (row pointers i*, column
// indices j*, values in matching order). When a diag* flag is true, that
// matrix is "new" Yale format: the diagonal is stored separately at the
// front of the values array and the deliberate `<=` loop bounds below make
// one extra pass per row (jj == jj_end / kk == kk_end) to fold it in.
//
// n: rows of A and C. m: columns of B and C.
// NOTE(review): next[] and sums[] are variable-length arrays of size m on
// the stack — confirm callers keep m modest, or this risks stack overflow.
// NOTE(review): off-diagonal columns are appended in linked-list pop order,
// i.e. generally unsorted — presumably smmp_sort_columns is run afterwards.
template <typename DType, typename IType>
inline void numbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const DType* a, const bool diaga,
                   const IType* ib, const IType* jb, const DType* b, const bool diagb, IType* ic, IType* jc, DType* c, const bool diagc) {
  IType next[m];  // singly-linked list of touched columns; max() means "not in list"
  DType sums[m];  // running accumulator per output column

  DType v;        // current value drawn from A (off-diagonal or diagonal)

  IType head, length, temp, ndnz = 0;  // ndnz: off-diagonal nonzeros written to C so far
  IType jj_start, jj_end, kk_start, kk_end;
  IType i, j, k, kk, jj;
  IType minmn = std::min(m,n);         // a stored diagonal only exists up to min(m,n)

  for (i = 0; i < m; ++i) { // initialize scratch arrays
    next[i] = std::numeric_limits<IType>::max();
    sums[i] = 0;
  }

  for (i = 0; i < n; ++i) { // walk down the rows
    head = std::numeric_limits<IType>::max()-1; // head gets assigned as whichever column of B's row j we last visited
    length = 0;

    jj_start = ia[i];
    jj_end = ia[i+1];

    // Note the <=: the extra iteration (jj == jj_end) handles the separately
    // stored diagonal entry of a new-Yale A.
    for (jj = jj_start; jj <= jj_end; ++jj) { // walk through entries in each row

      if (jj == jj_end) { // if we're in the last entry for this row:
        if (!diaga || i >= minmn) continue;
        j = i; // if it's a new Yale matrix, and last entry, get the diagonal position (j) and entry (ajj)
        v = a[i];
      } else {
        j = ja[jj]; // if it's not the last entry for this row, get the column (j) and entry (ajj)
        v = a[jj];
      }

      kk_start = ib[j];   // Find the first entry of row j of matrix B
      kk_end = ib[j+1];
      for (kk = kk_start; kk <= kk_end; ++kk) { // <= again: extra trip for B's stored diagonal

        if (kk == kk_end) { // Get the column id for that entry
          if (!diagb || j >= minmn) continue;
          k = j;
          sums[k] += v*b[k];
        } else {
          k = jb[kk];
          sums[k] += v*b[kk];
        }

        // First touch of column k for this output row: push it onto the list.
        if (next[k] == std::numeric_limits<IType>::max()) {
          next[k] = head;
          head = k;
          ++length;
        }
      }
    }

    // Drain the linked list: emit each accumulated column into C and reset
    // the scratch slots for the next row.
    for (jj = 0; jj < length; ++jj) {
      if (sums[head] != 0) { // drop explicit zeros
        if (diagc && head == i) {
          c[head] = sums[head];    // diagonal goes into C's dense diagonal slot
        } else {
          jc[n+1+ndnz] = head;     // off-diagonal: append past the n+1-entry header
          c[n+1+ndnz] = sums[head];
          ++ndnz;
        }
      }

      temp = head;
      head = next[head];

      next[temp] = std::numeric_limits<IType>::max();
      sums[temp] = 0;
    }

    ic[i+1] = n+1+ndnz; // row pointer for C, offset by the new-Yale header (n+1)
  }
} /* numbmm_ */
986
+
987
+
988
+
989
// Yale: Symbolic matrix multiply c=a*b
//
// Symbolic phase of the SMMP-style sparse multiply: computes only the row
// pointers ic of C = A*B (the numeric phase, numbmm, fills jc and c).
// Storage conventions — CSR-style arrays, diag* flags for new-Yale matrices
// with a separately stored diagonal, and the deliberate `<=` loop bounds
// adding one pass for that diagonal — match numbmm exactly.
//
// n: rows of A and C. m: columns of B and C.
// NOTE(review): mask[] is a variable-length array of size m on the stack;
// same stack-usage caveat as numbmm.
template <typename IType>
inline void symbmm(const unsigned int n, const unsigned int m, const IType* ia, const IType* ja, const bool diaga,
                   const IType* ib, const IType* jb, const bool diagb, IType* ic, const bool diagc) {
  IType mask[m];  // mask[k] == i  <=>  column k already counted for row i
  IType j, k, ndnz = n; /* Local variables */ // ndnz starts at n: new-Yale C reserves one slot per diagonal


  for (j = 0; j < m; ++j)
    mask[j] = std::numeric_limits<IType>::max();

  if (diagc) ic[0] = n+1;  // new-Yale C: data begins after the (n+1)-entry header
  else ic[0] = 0;

  IType minmn = std::min(m,n);

  for (IType i = 0; i < n; ++i) { // MAIN LOOP: through rows

    for (IType jj = ia[i]; jj <= ia[i+1]; ++jj) { // merge row lists, walking through columns in each row

      // j <- column index given by JA[jj], or handle diagonal.
      if (jj == ia[i+1]) { // Don't really do it the last time -- just handle diagonals in a new yale matrix.
        if (!diaga || i >= minmn) continue;
        j = i;
      } else j = ja[jj];

      for (IType kk = ib[j]; kk <= ib[j+1]; ++kk) { // Now walk through columns of row J in matrix B.
        if (kk == ib[j+1]) {
          if (!diagb || j >= minmn) continue;
          k = j;
        } else k = jb[kk];

        if (mask[k] != i) { // first appearance of column k in row i
          mask[k] = i;
          ++ndnz;
        }
      }
    }

    // NOTE(review): this reads as "row i produced no diagonal entry, so give
    // back its reserved slot", but `!mask[i]` is only true when mask[i] == 0
    // (i.e. column i was last marked by row 0). `mask[i] != i` looks like the
    // intended test — confirm against the SMMP reference implementation
    // before changing.
    if (diagc && !mask[i]) --ndnz;

    ic[i+1] = ndnz;
  }
} /* symbmm_ */
1033
+
1034
+
1035
//TODO: More efficient sorting algorithm than selection sort would be nice, probably.
// Remember, we're dealing with unique keys, which simplifies things.
// Doesn't have to be in-place, since we probably just multiplied and that wasn't in-place.
//
// Sorts the column indices (and their parallel values) within each row of a
// CSR/Yale matrix into ascending order. numbmm emits each row's columns in
// linked-list pop order, so rows are generally unsorted until this runs.
//
// n:  number of rows.
// ia: row pointers (read-only).
// ja: column indices, permuted in place.
// a:  values, permuted in place alongside ja.
//
// BUGFIX: the original inner loop repeated the min-swap for EVERY position
// jj != min_jj in the unsorted suffix, which duplicated the minimum key and
// corrupted the row (e.g. keys [3,1,2] became [1,2,1]). Selection sort needs
// exactly one swap per outer step, performed here after the scan.
template <typename DType, typename IType>
inline void smmp_sort_columns(const size_t n, const IType* ia, IType* ja, DType* a) {
  IType jj, min, min_jj;  // min: smallest remaining column key; min_jj: its position
  DType temp_val;

  for (size_t i = 0; i < n; ++i) {
    // No need to sort if there are 0 or 1 entries in the row
    if (ia[i+1] - ia[i] < 2) continue;

    for (IType jj_start = ia[i]; jj_start < ia[i+1]; ++jj_start) {

      // If the previous min is just current-1, this key/value pair is already in sorted order.
      // This follows from the unique condition on our column keys.
      // (`min` is only read when jj_start > ia[i], by which point the
      // previous iteration has assigned it.)
      if (jj_start > ia[i] && min+1 == ja[jj_start]) {
        min = ja[jj_start];
        continue;
      }

      // find the minimum key (column index) between jj_start and ia[i+1]
      min = ja[jj_start];
      min_jj = jj_start;
      for (jj = jj_start+1; jj < ia[i+1]; ++jj) {
        if (ja[jj] < min) {
          min_jj = jj;
          min = ja[jj];
        }
      }

      // if min is already first, skip this iteration
      if (min_jj == jj_start) continue;

      // Swap the minimum key/value pair into the first unsorted position.
      // min already == ja[min_jj], so it serves as the temporary key.
      temp_val = a[min_jj];

      ja[min_jj] = ja[jj_start];
      a[min_jj] = a[jj_start];

      ja[jj_start] = min;
      a[jj_start] = temp_val;
    }
  }
}
1085
+
1086
+
1087
/*
 * Transposes a generic Yale matrix (old or new). Specify new by setting diaga = true.
 *
 * Based on transp from SMMP (same as symbmm and numbmm).
 *
 * This is not named in the same way as most yale_storage functions because it does not act on a YALE_STORAGE
 * object.
 *
 * n, m:      rows and columns of the input (so the output is m x n).
 * ia/ja/a:   input row pointers / column indices / values (untyped; cast below).
 * ib/jb/b:   output arrays, fully overwritten.
 * move:      when true, values are carried along; when false only the
 *            structure (ib/jb) is produced.
 *
 * NOTE(review): when move is true, only the first m+1 slots of b are zeroed
 * up front; the off-diagonal value slots are overwritten unconditionally in
 * the jb-construction pass, so this appears intentional — confirm.
 */
template <typename DType, typename IType>
void transpose_yale(const size_t n, const size_t m, const void* ia_, const void* ja_, const void* a_,
                    const bool diaga, void* ib_, void* jb_, void* b_, const bool move)
{
  // Recover typed views of the untyped array arguments.
  const IType *ia = reinterpret_cast<const IType*>(ia_),
              *ja = reinterpret_cast<const IType*>(ja_);
  const DType *a = reinterpret_cast<const DType*>(a_);

  IType *ib = reinterpret_cast<IType*>(ib_),
        *jb = reinterpret_cast<IType*>(jb_);
  DType *b = reinterpret_cast<DType*>(b_);



  size_t index;

  // Clear B
  for (size_t i = 0; i < m+1; ++i) ib[i] = 0;

  if (move)
    for (size_t i = 0; i < m+1; ++i) b[i] = 0;

  if (diaga) ib[0] = m + 1; // new-Yale output: data starts after the (m+1)-entry header
  else ib[0] = 0;

  /* count indices for each column */

  // ib[k+1] accumulates the number of entries destined for output row k.
  for (size_t i = 0; i < n; ++i) {
    for (size_t j = ia[i]; j < ia[i+1]; ++j) {
      ++(ib[ja[j]+1]);
    }
  }

  // Prefix-sum the counts into provisional row pointers.
  for (size_t i = 0; i < m; ++i) {
    ib[i+1] = ib[i] + ib[i+1];
  }

  /* now make jb */

  // Scatter pass: ib[index] is used as a write cursor for output row `index`,
  // advancing as each entry lands. This intentionally shifts every ib entry
  // one position "too far"; the fixup loop below undoes that.
  for (size_t i = 0; i < n; ++i) {

    for (size_t j = ia[i]; j < ia[i+1]; ++j) {
      index = ja[j];
      jb[ib[index]] = i;

      if (move)
        b[ib[index]] = a[j];

      ++(ib[index]);
    }
  }

  /* now fixup ib */

  // Shift the cursors right by one so ib[i] again points at the START of
  // output row i. (Loop runs high-to-low to avoid clobbering.)
  for (size_t i = m; i >= 1; --i) {
    ib[i] = ib[i-1];
  }


  if (diaga) {
    if (move) {
      size_t j = std::min(n,m);

      // The diagonal of a new-Yale matrix is invariant under transposition:
      // copy it straight across.
      for (size_t i = 0; i < j; ++i) {
        b[i] = a[i];
      }
    }
    ib[0] = m + 1; // restore the header offset the shift loop overwrote

  } else {
    ib[0] = 0;
  }
}
1168
+
1169
+
1170
/*
 * Templated version of row-order and column-order getrf, derived from ATL_getrfR.c (from ATLAS 3.8.0).
 *
 * 1. Row-major factorization of form
 *   A = L * U * P
 * where P is a column-permutation matrix, L is lower triangular (lower
 * trapazoidal if M > N), and U is upper triangular with unit diagonals (upper
 * trapazoidal if M < N). This is the recursive Level 3 BLAS version.
 *
 * 2. Column-major factorization of form
 *   A = P * L * U
 * where P is a row-permutation matrix, L is lower triangular with unit diagonal
 * elements (lower trapazoidal if M > N), and U is upper triangular (upper
 * trapazoidal if M < N). This is the recursive Level 3 BLAS version.
 *
 * Template argument determines whether 1 or 2 is utilized.
 *
 * Returns 0 on success. A nonzero return reports a zero pivot: the base case
 * returns 1, and each recursion level adds its panel offset (N_ul) so the
 * value identifies the global pivot position. ipiv receives the pivot
 * indices; A is overwritten with the L and U factors in place.
 */
template <bool RowMajor, typename DType>
inline int getrf_nothrow(const int M, const int N, DType* A, const int lda, int* ipiv) {
  const int MN = std::min(M, N);
  int ierr = 0;

  // Symbols used by ATLAS:
  //    Row      Col      Us
  //    Nup      Nleft    N_ul
  //    Ndown    Nright   N_dr
  // We're going to use N_ul, N_dr

  DType neg_one = -1, one = 1;

  if (MN > 1) {
    // Recursive case: split the panel in half, factor the first half, apply
    // the pivots/TRSM/GEMM updates to the second half, then factor it too.
    int N_ul = MN >> 1;

    // FIXME: Figure out how ATLAS #defines NB
  #ifdef NB
    if (N_ul > NB) N_ul = ATL_MulByNB(ATL_DivByNB(N_ul));
  #endif

    int N_dr = M - N_ul;

    // Factor the leading panel (top rows for row-major, left columns for col-major).
    int i = RowMajor ? getrf_nothrow<true,DType>(N_ul, N, A, lda, ipiv) : getrf_nothrow<false,DType>(M, N_ul, A, lda, ipiv);

    if (i) if (!ierr) ierr = i; // remember the FIRST zero-pivot position only

    DType *Ar, *Ac, *An;
    if (RowMajor) {
      // Ar: rows below the panel; Ac: columns right of the panel; An: the
      // trailing submatrix. (Note: the comma after the first assignment is a
      // comma operator — the statement assigns both Ar and Ac.)
      Ar = &(A[N_ul * lda]),
      Ac = &(A[N_ul]);
      An = &(Ar[N_ul]);

      // Apply the panel's pivots to the remaining rows.
      nm::math::laswp<DType>(N_dr, Ar, lda, 0, N_ul, ipiv, 1);

      // Triangular solve + rank-N_ul update of the trailing submatrix.
      nm::math::trsm<DType>(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, N_dr, N_ul, one, A, lda, Ar, lda);
      nm::math::gemm<DType>(CblasRowMajor, CblasNoTrans, CblasNoTrans, N_dr, N-N_ul, N_ul, &neg_one, Ar, lda, Ac, lda, &one, An, lda);

      i = getrf_nothrow<true,DType>(N_dr, N-N_ul, An, lda, ipiv+N_ul);
    } else {
      // Column-major mirror of the above.
      Ar = NULL;
      Ac = &(A[N_ul * lda]);
      An = &(Ac[N_ul]);

      nm::math::laswp<DType>(N_dr, Ac, lda, 0, N_ul, ipiv, 1);

      nm::math::trsm<DType>(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, N_ul, N_dr, one, A, lda, Ac, lda);
      nm::math::gemm<DType>(CblasColMajor, CblasNoTrans, CblasNoTrans, M-N_ul, N_dr, N_ul, &neg_one, An, lda, Ac, lda, &one, An, lda);

      i = getrf_nothrow<false,DType>(M-N_ul, N_dr, An, lda, ipiv+N_ul);
    }

    if (i) if (!ierr) ierr = N_ul + i; // offset trailing-block pivot failure to global position

    // The trailing block's pivots are relative to its own first row; make
    // them global.
    for (i = N_ul; i != MN; i++) {
      ipiv[i] += N_ul;
    }

    nm::math::laswp<DType>(N_ul, A, lda, N_ul, MN, ipiv, 1); /* apply pivots */

  } else if (MN == 1) { // there's another case for the colmajor version, but i don't know that it's that critical. Calls ATLAS LU2, who knows what that does.

    // Base case: a single row/column. Pick the largest-magnitude element as
    // pivot, scale the rest by its reciprocal, and swap it to the front.
    // NOTE(review): assumes idamax returns a 0-based index — confirm against
    // the local lapack wrapper's convention.
    int i = *ipiv = nm::math::lapack::idamax<DType>(N, A, 1); // cblas_iamax(N, A, 1);

    DType tmp = A[i];
    if (tmp != 0) {

      nm::math::lapack::scal<DType>((RowMajor ? N : M), nm::math::numeric_inverse(tmp), A, 1);
      A[i] = *A;
      *A = tmp;

    } else ierr = 1; // exact zero pivot: singular

  }
  return(ierr);
}
1263
+
1264
+
1265
+ /*
1266
+ * From ATLAS 3.8.0:
1267
+ *
1268
+ * Computes one of two LU factorizations based on the setting of the Order
1269
+ * parameter, as follows:
1270
+ * ----------------------------------------------------------------------------
1271
+ * Order == CblasColMajor
1272
+ * Column-major factorization of form
1273
+ * A = P * L * U
1274
+ * where P is a row-permutation matrix, L is lower triangular with unit
1275
+ * diagonal elements (lower trapazoidal if M > N), and U is upper triangular
1276
+ * (upper trapazoidal if M < N).
1277
+ *
1278
+ * ----------------------------------------------------------------------------
1279
+ * Order == CblasRowMajor
1280
+ * Row-major factorization of form
1281
+ * A = P * L * U
1282
+ * where P is a column-permutation matrix, L is lower triangular (lower
1283
+ * trapazoidal if M > N), and U is upper triangular with unit diagonals (upper
1284
+ * trapazoidal if M < N).
1285
+ *
1286
+ * ============================================================================
1287
+ * Let IERR be the return value of the function:
1288
+ * If IERR == 0, successful exit.
1289
+ * If (IERR < 0) the -IERR argument had an illegal value
1290
+ * If (IERR > 0 && Order == CblasColMajor)
1291
+ * U(i-1,i-1) is exactly zero. The factorization has been completed,
1292
+ * but the factor U is exactly singular, and division by zero will
1293
+ * occur if it is used to solve a system of equations.
1294
+ * If (IERR > 0 && Order == CblasRowMajor)
1295
+ * L(i-1,i-1) is exactly zero. The factorization has been completed,
1296
+ * but the factor L is exactly singular, and division by zero will
1297
+ * occur if it is used to solve a system of equations.
1298
+ */
1299
+ template <typename DType>
1300
+ inline int getrf(const enum CBLAS_ORDER Order, const int M, const int N, DType* A, int lda, int* ipiv) {
1301
+ if (Order == CblasRowMajor) {
1302
+ if (lda < std::max(1,N)) {
1303
+ rb_raise(rb_eArgError, "GETRF: lda must be >= MAX(N,1): lda=%d N=%d", lda, N);
1304
+ return -6;
1305
+ }
1306
+
1307
+ return getrf_nothrow<true,DType>(M, N, A, lda, ipiv);
1308
+ } else {
1309
+ if (lda < std::max(1,M)) {
1310
+ rb_raise(rb_eArgError, "GETRF: lda must be >= MAX(M,1): lda=%d M=%d", lda, M);
1311
+ return -6;
1312
+ }
1313
+
1314
+ return getrf_nothrow<false,DType>(M, N, A, lda, ipiv);
1315
+ //rb_raise(rb_eNotImpError, "column major getrf not implemented");
1316
+ }
1317
+ }
1318
+
1319
+
1320
+
1321
+
1322
/*
 * Macro for declaring LAPACK specializations of the getrf function.
 *
 * type is the DType; call is the specific function to call; cast_as is what the DType* should be
 * cast to in order to pass it to LAPACK.
 *
 * The generated specialization forwards directly to the clapack routine and
 * raises a Ruby ArgumentError on any nonzero info code. rb_raise does not
 * return, so the `return info` following it exists only to satisfy the
 * compiler.
 *
 * NOTE(review): clapack getrf routines also report info > 0 for a singular
 * factor, not only for bad arguments — the "problem with argument" message
 * conflates the two; confirm before re-enabling the specializations below.
 * (Comments cannot be placed inside the macro body: a // comment would
 * swallow the line-continuation backslash.)
 */
#define LAPACK_GETRF(type, call, cast_as) \
template <> \
inline int getrf(const enum CBLAS_ORDER Order, const int M, const int N, type * A, const int lda, int* ipiv) { \
  int info = call(Order, M, N, reinterpret_cast<cast_as *>(A), lda, ipiv); \
  if (!info) return info; \
  else { \
    rb_raise(rb_eArgError, "getrf: problem with argument %d\n", info); \
    return info; \
  } \
}
1338
+
1339
+ /* Specialize for ATLAS types */
1340
+ /*LAPACK_GETRF(float, clapack_sgetrf, float)
1341
+ LAPACK_GETRF(double, clapack_dgetrf, double)
1342
+ LAPACK_GETRF(Complex64, clapack_cgetrf, void)
1343
+ LAPACK_GETRF(Complex128, clapack_zgetrf, void)
1344
+ */
1345
+
1346
+
1347
/*
 * Function signature conversion for calling LAPACK's getrf functions as directly as possible.
 *
 * For documentation: http://www.netlib.org/lapack/double/dgetrf.f
 *
 * This function should normally go in math.cpp, but we need it to be available to nmatrix.cpp.
 *
 * Thin adapter: casts the untyped data pointer to DType* and forwards to the
 * templated getrf above (which validates lda and dispatches on storage
 * order), returning its info code unchanged.
 */
template <typename DType>
inline int clapack_getrf(const enum CBLAS_ORDER order, const int m, const int n, void* a, const int lda, int* ipiv) {
  return getrf<DType>(order, m, n, reinterpret_cast<DType*>(a), lda, ipiv);
}
1358
+
1359
+
1360
+ }} // end namespace nm::math
1361
+
1362
+
1363
+ #endif // MATH_H