alglib4 0.0.0

Files changed (46)
  1. checksums.yaml +7 -0
  2. data/README.md +47 -0
  3. data/ext/alglib/alglib.cpp +537 -0
  4. data/ext/alglib/alglib_array_converters.cpp +86 -0
  5. data/ext/alglib/alglib_array_converters.h +15 -0
  6. data/ext/alglib/alglib_utils.cpp +10 -0
  7. data/ext/alglib/alglib_utils.h +6 -0
  8. data/ext/alglib/alglibinternal.cpp +21749 -0
  9. data/ext/alglib/alglibinternal.h +2168 -0
  10. data/ext/alglib/alglibmisc.cpp +9106 -0
  11. data/ext/alglib/alglibmisc.h +2114 -0
  12. data/ext/alglib/ap.cpp +20094 -0
  13. data/ext/alglib/ap.h +7244 -0
  14. data/ext/alglib/dataanalysis.cpp +52588 -0
  15. data/ext/alglib/dataanalysis.h +10601 -0
  16. data/ext/alglib/diffequations.cpp +1342 -0
  17. data/ext/alglib/diffequations.h +282 -0
  18. data/ext/alglib/extconf.rb +5 -0
  19. data/ext/alglib/fasttransforms.cpp +4696 -0
  20. data/ext/alglib/fasttransforms.h +1018 -0
  21. data/ext/alglib/integration.cpp +4249 -0
  22. data/ext/alglib/integration.h +869 -0
  23. data/ext/alglib/interpolation.cpp +74502 -0
  24. data/ext/alglib/interpolation.h +12264 -0
  25. data/ext/alglib/kernels_avx2.cpp +2171 -0
  26. data/ext/alglib/kernels_avx2.h +201 -0
  27. data/ext/alglib/kernels_fma.cpp +1065 -0
  28. data/ext/alglib/kernels_fma.h +137 -0
  29. data/ext/alglib/kernels_sse2.cpp +735 -0
  30. data/ext/alglib/kernels_sse2.h +100 -0
  31. data/ext/alglib/linalg.cpp +65182 -0
  32. data/ext/alglib/linalg.h +9927 -0
  33. data/ext/alglib/optimization.cpp +135331 -0
  34. data/ext/alglib/optimization.h +19235 -0
  35. data/ext/alglib/solvers.cpp +20488 -0
  36. data/ext/alglib/solvers.h +4781 -0
  37. data/ext/alglib/specialfunctions.cpp +10672 -0
  38. data/ext/alglib/specialfunctions.h +2305 -0
  39. data/ext/alglib/statistics.cpp +19791 -0
  40. data/ext/alglib/statistics.h +1359 -0
  41. data/ext/alglib/stdafx.h +2 -0
  42. data/gpl2.txt +339 -0
  43. data/gpl3.txt +674 -0
  44. data/lib/alglib/version.rb +3 -0
  45. data/lib/alglib.rb +4 -0
  46. metadata +101 -0
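The single hunk below adds data/ext/alglib/kernels_fma.cpp (1,065 new lines), the FMA-accelerated compute kernels bundled with the gem; the standalone numbers interleaved with the `+` lines are the new file's line numbers as shown by the diff viewer.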
@@ -0,0 +1,1065 @@
1
+ /*************************************************************************
2
+ ALGLIB 4.04.0 (source code generated 2024-12-21)
3
+ Copyright (c) Sergey Bochkanov (ALGLIB project).
4
+
5
+ >>> SOURCE LICENSE >>>
6
+ This program is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation (www.fsf.org); either version 2 of the
9
+ License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ A copy of the GNU General Public License is available at
17
+ http://www.fsf.org/licensing/licenses
18
+ >>> END OF LICENSE >>>
19
+ *************************************************************************/
20
+ #ifdef _MSC_VER
21
+ #define _CRT_SECURE_NO_WARNINGS
22
+ #endif
23
+ #include "stdafx.h"
24
+
25
+ //
26
+ // Must be defined before we include kernel header
27
+ //
28
+ #define _ALGLIB_IMPL_DEFINES
29
+ #define _ALGLIB_INTEGRITY_CHECKS_ONCE
30
+
31
+ #include "kernels_fma.h"
32
+
33
+ // disable some irrelevant warnings
34
+ #if (AE_COMPILER==AE_MSVC) && !defined(AE_ALL_WARNINGS)
35
+ #pragma warning(disable:4100)
36
+ #pragma warning(disable:4127)
37
+ #pragma warning(disable:4611)
38
+ #pragma warning(disable:4702)
39
+ #pragma warning(disable:4996)
40
+ #endif
41
+
42
+ namespace alglib_impl
43
+ {
44
+
45
+
46
+
47
+ #if !defined(ALGLIB_NO_FAST_KERNELS) && defined(_ALGLIB_HAS_FMA_INTRINSICS)
48
+ double rdotv_fma(const ae_int_t n,
49
+ /* Real */ const double* __restrict x,
50
+ /* Real */ const double* __restrict y,
51
+ const ae_state* __restrict _state)
52
+ {
53
+ ae_int_t i;
54
+
55
+ const ae_int_t avx2len = n>>2;
56
+ const ae_int_t fmaLen = (avx2len >> 2) << 2;
57
+ const __m256d* __restrict pX = (const __m256d*)(x);
58
+ const __m256d* __restrict pY = (const __m256d*)(y);
59
+ __m256d ans;
60
+ if(fmaLen >= 4) {
61
+ __m256d fmaUnroll[4];
62
+ fmaUnroll[0] = _mm256_mul_pd(pX[0], pY[0]);
63
+ fmaUnroll[1] = _mm256_mul_pd(pX[1], pY[1]);
64
+ fmaUnroll[2] = _mm256_mul_pd(pX[2], pY[2]);
65
+ fmaUnroll[3] = _mm256_mul_pd(pX[3], pY[3]);
66
+ for(i=4; i<fmaLen; i+=4) {
67
+ fmaUnroll[0] = _mm256_fmadd_pd(pX[i], pY[i], fmaUnroll[0]);
68
+ fmaUnroll[1] = _mm256_fmadd_pd(pX[i+1], pY[i+1], fmaUnroll[1]);
69
+ fmaUnroll[2] = _mm256_fmadd_pd(pX[i+2], pY[i+2], fmaUnroll[2]);
70
+ fmaUnroll[3] = _mm256_fmadd_pd(pX[i+3], pY[i+3], fmaUnroll[3]);
71
+ }
72
+ switch(avx2len-fmaLen) {
73
+ case 3:
74
+ fmaUnroll[2] = _mm256_fmadd_pd(pX[i+2], pY[i+2], fmaUnroll[2]);
75
+ case 2:
76
+ fmaUnroll[1] = _mm256_fmadd_pd(pX[i+1], pY[i+1], fmaUnroll[1]);
77
+ case 1:
78
+ fmaUnroll[0] = _mm256_fmadd_pd(pX[i], pY[i], fmaUnroll[0]);
79
+ }
80
+ ans = _mm256_add_pd(
81
+ _mm256_add_pd(fmaUnroll[0], fmaUnroll[1]),
82
+ _mm256_add_pd(fmaUnroll[2], fmaUnroll[3]));
83
+ }
84
+ else {
85
+ ans = _mm256_setzero_pd();
86
+ switch(avx2len) {
87
+ case 3:
88
+ ans = _mm256_mul_pd(pX[2], pY[2]);
89
+ case 2:
90
+ ans = _mm256_fmadd_pd(pX[1], pY[1], ans);
91
+ case 1:
92
+ ans = _mm256_fmadd_pd(pX[0], pY[0], ans);
93
+ }
94
+ }
95
+ const __m128d s = _mm_add_pd(_mm256_extractf128_pd(ans, 0), _mm256_extractf128_pd(ans, 1));
96
+ const double *pComps = (const double*)&s;
97
+ double dot = pComps[0] + pComps[1];
98
+ const ae_int_t tail = avx2len<<2;
99
+ switch(n-tail) {
100
+ case 1: {
101
+ dot += x[tail]*y[tail];
102
+ break;
103
+ }
104
+ case 2: {
105
+ dot += x[tail+0]*y[tail+0];
106
+ dot += x[tail+1]*y[tail+1];
107
+ break;
108
+ }
109
+ case 3: {
110
+ dot += x[tail+0]*y[tail+0];
111
+ dot += x[tail+1]*y[tail+1];
112
+ dot += x[tail+2]*y[tail+2];
113
+ break;
114
+ }
115
+ }
116
+ return dot;
117
+ }
118
+
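For reference, a scalar sketch of what rdotv_fma above computes (rdotv2_fma below is the same operation with y == x). This is an illustrative addition, not code from the gem; plain C types stand in for ae_int_t, and the SIMD kernel may differ from it only by floating-point reassociation.

#include <stddef.h>

/* Scalar reference: dot = sum_{i<n} x[i]*y[i]. The kernel above processes
   4 doubles per __m256d, unrolls 4 vectors per iteration, folds the
   leftover 1-3 vectors via the switch, and finishes the last n & 3
   elements with plain scalar adds. */
static double rdotv_scalar(size_t n, const double *x, const double *y)
{
    double dot = 0.0;
    for (size_t i = 0; i < n; i++)
        dot += x[i] * y[i];
    return dot;
}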
119
+ double rdotv2_fma(const ae_int_t n,
120
+ /* Real */ const double* __restrict x,
121
+ const ae_state* __restrict _state)
122
+ {
123
+ ae_int_t i;
124
+
125
+ const ae_int_t avx2len = n>>2;
126
+ const ae_int_t fmaLen = (avx2len >> 2) << 2;
127
+ const __m256d* __restrict pX = (const __m256d*)(x);
128
+ __m256d ans;
129
+ if(fmaLen >= 4) {
130
+ //TODO: this can be further unrolled to 8 because AVX(2) provides 16 registers
131
+ __m256d fmaUnroll[4];
132
+ fmaUnroll[0] = _mm256_mul_pd(pX[0], pX[0]);
133
+ fmaUnroll[1] = _mm256_mul_pd(pX[1], pX[1]);
134
+ fmaUnroll[2] = _mm256_mul_pd(pX[2], pX[2]);
135
+ fmaUnroll[3] = _mm256_mul_pd(pX[3], pX[3]);
136
+ for(i=4; i<fmaLen; i+=4) {
137
+ fmaUnroll[0] = _mm256_fmadd_pd(pX[i], pX[i], fmaUnroll[0]);
138
+ fmaUnroll[1] = _mm256_fmadd_pd(pX[i+1], pX[i+1], fmaUnroll[1]);
139
+ fmaUnroll[2] = _mm256_fmadd_pd(pX[i+2], pX[i+2], fmaUnroll[2]);
140
+ fmaUnroll[3] = _mm256_fmadd_pd(pX[i+3], pX[i+3], fmaUnroll[3]);
141
+ }
142
+ switch(avx2len-fmaLen) {
143
+ case 3:
144
+ fmaUnroll[2] = _mm256_fmadd_pd(pX[i+2], pX[i+2], fmaUnroll[2]);
145
+ case 2:
146
+ fmaUnroll[1] = _mm256_fmadd_pd(pX[i+1], pX[i+1], fmaUnroll[1]);
147
+ case 1:
148
+ fmaUnroll[0] = _mm256_fmadd_pd(pX[i], pX[i], fmaUnroll[0]);
149
+ }
150
+ ans = _mm256_add_pd(
151
+ _mm256_add_pd(fmaUnroll[0], fmaUnroll[1]),
152
+ _mm256_add_pd(fmaUnroll[2], fmaUnroll[3]));
153
+ }
154
+ else {
155
+ ans = _mm256_setzero_pd();
156
+ switch(avx2len) {
157
+ case 3:
158
+ ans = _mm256_mul_pd(pX[2], pX[2]);
159
+ case 2:
160
+ ans = _mm256_fmadd_pd(pX[1], pX[1], ans);
161
+ case 1:
162
+ ans = _mm256_fmadd_pd(pX[0], pX[0], ans);
163
+ }
164
+ }
165
+ const __m128d s = _mm_add_pd(_mm256_extractf128_pd(ans, 0), _mm256_extractf128_pd(ans, 1));
166
+ const double *pComps = (const double*)&s;
167
+ double dot = pComps[0] + pComps[1];
168
+ const ae_int_t tail = avx2len<<2;
169
+ switch(n-tail) {
170
+ case 1: {
171
+ dot += x[tail]*x[tail];
172
+ break;
173
+ }
174
+ case 2: {
175
+ dot += x[tail+0]*x[tail+0];
176
+ dot += x[tail+1]*x[tail+1];
177
+ break;
178
+ }
179
+ case 3: {
180
+ dot += x[tail+0]*x[tail+0];
181
+ dot += x[tail+1]*x[tail+1];
182
+ dot += x[tail+2]*x[tail+2];
183
+ break;
184
+ }
185
+ }
186
+ return dot;
187
+ }
188
+
189
+ void raddv_fma(const ae_int_t n,
190
+ const double alpha,
191
+ /* Real */ const double* __restrict y,
192
+ /* Real */ double* __restrict x,
193
+ const ae_state* __restrict _state)
194
+ {
195
+ ae_int_t i;
196
+
197
+ const ae_int_t avx2len = n>>2;
198
+ const __m256d* __restrict pSrc = (const __m256d*)(y);
199
+ __m256d* __restrict pDest = (__m256d*)(x);
200
+ const __m256d avx2alpha = _mm256_set1_pd(alpha);
201
+ for(i=0; i<avx2len; i++) {
202
+ pDest[i] = _mm256_fmadd_pd(avx2alpha, pSrc[i], pDest[i]);
203
+ }
204
+ const ae_int_t tail = avx2len<<2;
205
+ switch(n-tail) {
206
+ case 1:
207
+ *(double*)(pDest+i) += alpha*(*(const double*)(pSrc+i));
208
+ break;
209
+ case 2:
210
+ *(__m128d*)(pDest+i) = _mm_fmadd_pd(_mm256_extractf128_pd(avx2alpha, 0), *(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
211
+ break;
212
+ case 3:
213
+ *(__m128d*)(pDest+i) = _mm_fmadd_pd(_mm256_extractf128_pd(avx2alpha, 0), *(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
214
+ x[tail+2] += alpha*y[tail+2];
215
+ break;
216
+ }
217
+ }
218
+
219
+ void raddvx_fma3avx_xaligned(const ae_int_t n, const double alpha, const double* __restrict y, double* __restrict x, ae_state *_state)
220
+ {
221
+ ae_int_t i;
222
+ const ae_int_t vecLen = (n>>2)<<2;
223
+ const __m256d avx2alpha = _mm256_set1_pd(alpha);
224
+ __m256d* __restrict pDest = (__m256d*)x;
225
+ for(i=0; i<vecLen; i+=4)
226
+ {
227
+ const ae_int_t iDest = i>>2;
228
+ pDest[iDest] = _mm256_fmadd_pd(avx2alpha, _mm256_loadu_pd(y+i), pDest[iDest]);
229
+ }
230
+ switch(n-vecLen) {
231
+ case 1:
232
+ x[i] += alpha*y[i];
233
+ break;
234
+ case 2: {
235
+ const ae_int_t iDest = i>>2;
236
+ *(__m128d*)(pDest+iDest) = _mm_fmadd_pd(_mm256_extractf128_pd(avx2alpha, 0),
237
+ _mm_loadu_pd(y+i), *(const __m128d*)(pDest+iDest));
238
+ break;
239
+ }
240
+ case 3:
241
+ {
242
+ const ae_int_t iDest = i>>2;
243
+ *(__m128d*)(pDest+iDest) = _mm_fmadd_pd(_mm256_extractf128_pd(avx2alpha, 0), _mm_loadu_pd(y+i), *(const __m128d*)(pDest+iDest));
244
+ x[i+2] += alpha*y[i+2];
245
+ break;
246
+ }
247
+ }
248
+ }
249
+
250
+ void raddvx_fma(const ae_int_t n, const double alpha, const double* __restrict y, double* __restrict x, ae_state *_state)
251
+ {
252
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
253
+ if( n<=4 )
254
+ {
255
+ ae_int_t i;
256
+ for(i=0; i<=n-1; i++)
257
+ x[i] += alpha*y[i];
258
+ return;
259
+ }
260
+ switch(unal) {
261
+ case 0:
262
+ raddvx_fma3avx_xaligned(n, alpha, y, x, _state);
263
+ return;
264
+ case 8:
265
+ x[2] += alpha*y[2];
266
+ case 16:
267
+ x[1] += alpha*y[1];
268
+ case 24: {
269
+ x[0] += alpha*y[0];
270
+ const ptrdiff_t nDone = 4-(unal>>3);
271
+ raddvx_fma3avx_xaligned(n-nDone, alpha, y+nDone, x+nDone, _state);
272
+ return;
273
+ }
274
+ }
275
+ }
276
+
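A note on the alignment peel in raddvx_fma above (illustrative sketch, not part of the gem; it assumes x is at least 8-byte aligned, as ALGLIB's double storage is): since unal = (address of x) & 31 can only be 0, 8, 16 or 24, the fall-through switch updates exactly 4 - unal/8 leading elements in scalar code, after which x + nDone is 32-byte aligned and raddvx_fma3avx_xaligned handles the rest.

#include <stdint.h>
#include <stddef.h>

/* Peel length used above: number of leading doubles to process in scalar
   code before x reaches a 32-byte boundary (assumes 8-byte-aligned x). */
static size_t peel_count(const double *x)
{
    uintptr_t unal = (uintptr_t)x & 31;              /* 0, 8, 16 or 24 */
    return unal == 0 ? 0 : 4 - (size_t)(unal >> 3);  /* 0, 3, 2 or 1   */
}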
277
+ void rmuladdv_fma(const ae_int_t n,
278
+ const double* __restrict y,
279
+ const double* __restrict z,
280
+ double* __restrict x,
281
+ const ae_state* _state)
282
+ {
283
+ ae_int_t i;
284
+ const ae_int_t avx2len = n>>2;
285
+ const __m256d* __restrict pSrc0 = (const __m256d*)y;
286
+ const __m256d* __restrict pSrc1 = (const __m256d*)z;
287
+ __m256d* __restrict pDest = (__m256d*)x;
288
+ for(i=0; i<avx2len; i++)
289
+ pDest[i] = _mm256_fmadd_pd(pSrc0[i], pSrc1[i], pDest[i]);
290
+ const ae_int_t tail = avx2len<<2;
291
+ for(i=tail; i<n; i++)
292
+ x[i] += y[i]*z[i];
293
+ }
294
+
295
+ void rnegmuladdv_fma(const ae_int_t n,
296
+ const double* __restrict y,
297
+ const double* __restrict z,
298
+ double* __restrict x,
299
+ const ae_state* _state)
300
+ {
301
+ ae_int_t i;
302
+ const ae_int_t avx2len = n>>2;
303
+ const __m256d* __restrict pSrc0 = (const __m256d*)y;
304
+ const __m256d* __restrict pSrc1 = (const __m256d*)z;
305
+ __m256d* __restrict pDest = (__m256d*)x;
306
+ for(i=0; i<avx2len; i++)
307
+ pDest[i] = _mm256_fnmadd_pd(pSrc0[i], pSrc1[i], pDest[i]);
308
+ const ae_int_t tail = avx2len<<2;
309
+ for(i=tail; i<n; i++)
310
+ x[i] -= y[i]*z[i];
311
+ }
312
+
313
+ void rcopymuladdv_fma(const ae_int_t n,
314
+ const double* __restrict y,
315
+ const double* __restrict z,
316
+ const double* __restrict x,
317
+ double* __restrict r,
318
+ const ae_state* _state)
319
+ {
320
+ ae_int_t i;
321
+ const ae_int_t avx2len = n>>2;
322
+ const __m256d* __restrict pSrc0 = (const __m256d*)y;
323
+ const __m256d* __restrict pSrc1 = (const __m256d*)z;
324
+ const __m256d* __restrict pSrc2 = (const __m256d*)x;
325
+ __m256d* __restrict pDest = (__m256d*)r;
326
+ for(i=0; i<avx2len; i++)
327
+ pDest[i] = _mm256_fmadd_pd(pSrc0[i], pSrc1[i], pSrc2[i]);
328
+ const ae_int_t tail = avx2len<<2;
329
+ for(i=tail; i<n; i++)
330
+ r[i] = x[i]+y[i]*z[i];
331
+ }
332
+
333
+ void rcopynegmuladdv_fma(const ae_int_t n,
334
+ const double* __restrict y,
335
+ const double* __restrict z,
336
+ const double* __restrict x,
337
+ double* __restrict r,
338
+ const ae_state* _state)
339
+ {
340
+ ae_int_t i;
341
+ const ae_int_t avx2len = n>>2;
342
+ const __m256d* __restrict pSrc0 = (const __m256d*)y;
343
+ const __m256d* __restrict pSrc1 = (const __m256d*)z;
344
+ const __m256d* __restrict pSrc2 = (const __m256d*)x;
345
+ __m256d* __restrict pDest = (__m256d*)r;
346
+ for(i=0; i<avx2len; i++)
347
+ pDest[i] = _mm256_fnmadd_pd(pSrc0[i], pSrc1[i], pSrc2[i]);
348
+ const ae_int_t tail = avx2len<<2;
349
+ for(i=tail; i<n; i++)
350
+ r[i] = x[i]-y[i]*z[i];
351
+ }
352
+
353
+ void rgemv_straight_fma(const ae_int_t m, const ae_int_t n,
354
+ const double alpha, const ae_matrix* __restrict a,
355
+ const double* __restrict x, double* __restrict y, ae_state* _state)
356
+ {
357
+ ae_int_t i;
358
+ ae_int_t j;
359
+ __m256d sum = _mm256_setzero_pd();
360
+ const __m256d* __restrict pX = (const __m256d*)x;
361
+ const ae_int_t nVec = n >> 2;
362
+ const ae_int_t nUnroll = nVec >> 3;
363
+ for(i=0; i<m; i++) {
364
+ const __m256d* __restrict pRow = (const __m256d*)a->ptr.pp_double[i];
365
+ if(nUnroll >= 1) {
366
+ __m256d u0 = _mm256_mul_pd(pRow[0], pX[0]);
367
+ __m256d u1 = _mm256_mul_pd(pRow[1], pX[1]);
368
+ __m256d u2 = _mm256_mul_pd(pRow[2], pX[2]);
369
+ __m256d u3 = _mm256_mul_pd(pRow[3], pX[3]);
370
+ __m256d u4 = _mm256_mul_pd(pRow[4], pX[4]);
371
+ __m256d u5 = _mm256_mul_pd(pRow[5], pX[5]);
372
+ __m256d u6 = _mm256_mul_pd(pRow[6], pX[6]);
373
+ __m256d u7 = _mm256_mul_pd(pRow[7], pX[7]);
374
+ for(j=1; j<nUnroll; j++) {
375
+ const ae_int_t at = j<<3;
376
+ u0 = _mm256_fmadd_pd(pRow[at], pX[at], u0);
377
+ u1 = _mm256_fmadd_pd(pRow[at+1], pX[at+1], u1);
378
+ u2 = _mm256_fmadd_pd(pRow[at+2], pX[at+2], u2);
379
+ u3 = _mm256_fmadd_pd(pRow[at+3], pX[at+3], u3);
380
+ u4 = _mm256_fmadd_pd(pRow[at+4], pX[at+4], u4);
381
+ u5 = _mm256_fmadd_pd(pRow[at+5], pX[at+5], u5);
382
+ u6 = _mm256_fmadd_pd(pRow[at+6], pX[at+6], u6);
383
+ u7 = _mm256_fmadd_pd(pRow[at+7], pX[at+7], u7);
384
+ }
385
+ const ae_int_t at = j<<3;
386
+ switch(nVec-at) {
387
+ case 7:
388
+ u6 = _mm256_fmadd_pd(pX[at+6], pRow[at+6], u6);
389
+ case 6:
390
+ u5 = _mm256_fmadd_pd(pX[at+5], pRow[at+5], u5);
391
+ case 5:
392
+ u4 = _mm256_fmadd_pd(pX[at+4], pRow[at+4], u4);
393
+ case 4:
394
+ u3 = _mm256_fmadd_pd(pX[at+3], pRow[at+3], u3);
395
+ case 3:
396
+ u2 = _mm256_fmadd_pd(pX[at+2], pRow[at+2], u2);
397
+ case 2:
398
+ u1 = _mm256_fmadd_pd(pX[at+1], pRow[at+1], u1);
399
+ case 1:
400
+ u0 = _mm256_fmadd_pd(pX[at+0], pRow[at+0], u0);
401
+ }
402
+ sum = _mm256_add_pd(
403
+ _mm256_add_pd(_mm256_add_pd(u0, u1), _mm256_add_pd(u2, u3)),
404
+ _mm256_add_pd(_mm256_add_pd(u4, u5), _mm256_add_pd(u6, u7)));
405
+ }
406
+ else {
407
+ switch(nVec) {
408
+ case 0:
409
+ sum = _mm256_setzero_pd();
410
+ break;
411
+ case 1:
412
+ sum = _mm256_mul_pd(pX[0], pRow[0]);
413
+ break;
414
+ case 2:
415
+ sum = _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1]));
416
+ break;
417
+ case 3:
418
+ sum = _mm256_fmadd_pd(pX[2], pRow[2],
419
+ _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1])));
420
+ break;
421
+ case 4:
422
+ sum = _mm256_add_pd(
423
+ _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1])),
424
+ _mm256_fmadd_pd(pX[2], pRow[2], _mm256_mul_pd(pX[3], pRow[3])));
425
+ break;
426
+ case 5:
427
+ sum = _mm256_fmadd_pd(pX[4], pRow[4],
428
+ _mm256_add_pd(
429
+ _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1])),
430
+ _mm256_fmadd_pd(pX[2], pRow[2], _mm256_mul_pd(pX[3], pRow[3]))));
431
+ break;
432
+ case 6:
433
+ sum = _mm256_add_pd(
434
+ _mm256_add_pd(
435
+ _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1])),
436
+ _mm256_fmadd_pd(pX[2], pRow[2], _mm256_mul_pd(pX[3], pRow[3]))),
437
+ _mm256_fmadd_pd(pX[4], pRow[4], _mm256_mul_pd(pX[5], pRow[5])));
438
+ break;
439
+ case 7:
440
+ sum = _mm256_add_pd(
441
+ _mm256_add_pd(
442
+ _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1])),
443
+ _mm256_fmadd_pd(pX[2], pRow[2], _mm256_mul_pd(pX[3], pRow[3]))),
444
+ _mm256_fmadd_pd(pX[6], pRow[6],
445
+ _mm256_fmadd_pd(pX[4], pRow[4], _mm256_mul_pd(pX[5], pRow[5]))));
446
+ break;
447
+ }
448
+ }
449
+ const __m128d t = _mm_add_pd(_mm256_extractf128_pd(sum, 0), _mm256_extractf128_pd(sum, 1));
450
+ const double* pComps = (const double*)&t;
451
+ double ans = pComps[0] + pComps[1];
452
+ const ae_int_t tail = nVec<<2;
453
+ for(j=tail; j<n; j++) {
454
+ ans += a->ptr.pp_double[i][j] * x[j];
455
+ }
456
+ y[i] += alpha*ans;
457
+ }
458
+ }
459
+
460
+ void rgemv_transposed_fma(const ae_int_t m, const ae_int_t n,
461
+ const double alpha, const ae_matrix* __restrict a,
462
+ const double* __restrict x, double* __restrict y, ae_state* _state)
463
+ {
464
+ ae_int_t i;
465
+ ae_int_t j;
466
+ __m256d* __restrict pY = (__m256d*)y;
467
+ const ae_int_t nVec = m >> 2;
468
+
469
+ for(i=0; i<=n-1; i++)
470
+ {
471
+ const __m256d* __restrict pRow = (const __m256d*)a->ptr.pp_double[i];
472
+ const double v = alpha*x[i];
473
+ const __m256d vV = _mm256_set1_pd(v);
474
+ for(j=0; j<nVec; j++)
475
+ {
476
+ pY[j] = _mm256_fmadd_pd(vV, pRow[j], pY[j]);
477
+ }
478
+ const ae_int_t tail = nVec<<2;
479
+ for(j=tail; j<m; j++) {
480
+ y[j] += v*a->ptr.pp_double[i][j];
481
+ }
482
+ }
483
+ }
484
+
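For reference, the scalar semantics of the two matrix-vector kernels above (illustrative sketch only; plain C types, with a standing for the rows reached through a->ptr.pp_double in the kernels):

#include <stddef.h>

/* rgemv_straight_fma:   y[i] += alpha * sum_{j<n} a[i][j]*x[j]   for i < m
   rgemv_transposed_fma: y[j] += alpha * x[i] * a[i][j]           for i < n, j < m */
static void rgemv_straight_scalar(size_t m, size_t n, double alpha,
                                  double *const *a, const double *x, double *y)
{
    for (size_t i = 0; i < m; i++) {
        double v = 0.0;
        for (size_t j = 0; j < n; j++)
            v += a[i][j] * x[j];
        y[i] += alpha * v;
    }
}

static void rgemv_transposed_scalar(size_t m, size_t n, double alpha,
                                    double *const *a, const double *x, double *y)
{
    for (size_t i = 0; i < n; i++) {
        const double v = alpha * x[i];
        for (size_t j = 0; j < m; j++)
            y[j] += v * a[i][j];
    }
}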
485
+ void rgemvx_straight_fma_xaligned(const ae_int_t m, const ae_int_t n,
486
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
487
+ const ae_int_t ja, const double* __restrict x,
488
+ double* __restrict y, ae_state* _state)
489
+ {
490
+ ae_int_t i;
491
+ ae_int_t j;
492
+ const __m256d* __restrict pX = (const __m256d*)x;
493
+ const ae_int_t nVec = n >> 2;
494
+ const ae_int_t nUnroll = nVec >> 3;
495
+ __m256d sum = _mm256_setzero_pd();
496
+ for(i=0; i<m; i++) {
497
+ const __m256d* __restrict pRow = (const __m256d*)(a->ptr.pp_double[i+ia]+ja);
498
+ if(nUnroll >= 1) {
499
+ __m256d u0 = _mm256_mul_pd(ULOAD256PD(pRow[0]), pX[0]);
500
+ __m256d u1 = _mm256_mul_pd(ULOAD256PD(pRow[1]), pX[1]);
501
+ __m256d u2 = _mm256_mul_pd(ULOAD256PD(pRow[2]), pX[2]);
502
+ __m256d u3 = _mm256_mul_pd(ULOAD256PD(pRow[3]), pX[3]);
503
+ __m256d u4 = _mm256_mul_pd(ULOAD256PD(pRow[4]), pX[4]);
504
+ __m256d u5 = _mm256_mul_pd(ULOAD256PD(pRow[5]), pX[5]);
505
+ __m256d u6 = _mm256_mul_pd(ULOAD256PD(pRow[6]), pX[6]);
506
+ __m256d u7 = _mm256_mul_pd(ULOAD256PD(pRow[7]), pX[7]);
507
+ for(j=1; j<nUnroll; j++) {
508
+ const ae_int_t at = j<<3;
509
+ u0 = _mm256_fmadd_pd(ULOAD256PD(pRow[at]), pX[at], u0);
510
+ u1 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+1]), pX[at+1], u1);
511
+ u2 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+2]), pX[at+2], u2);
512
+ u3 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+3]), pX[at+3], u3);
513
+ u4 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+4]), pX[at+4], u4);
514
+ u5 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+5]), pX[at+5], u5);
515
+ u6 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+6]), pX[at+6], u6);
516
+ u7 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+7]), pX[at+7], u7);
517
+ }
518
+ const ae_int_t at = j<<3;
519
+ switch(nVec-at) {
520
+ case 7:
521
+ u6 = _mm256_fmadd_pd(pX[at+6], ULOAD256PD(pRow[at+6]), u6);
522
+ case 6:
523
+ u5 = _mm256_fmadd_pd(pX[at+5], ULOAD256PD(pRow[at+5]), u5);
524
+ case 5:
525
+ u4 = _mm256_fmadd_pd(pX[at+4], ULOAD256PD(pRow[at+4]), u4);
526
+ case 4:
527
+ u3 = _mm256_fmadd_pd(pX[at+3], ULOAD256PD(pRow[at+3]), u3);
528
+ case 3:
529
+ u2 = _mm256_fmadd_pd(pX[at+2], ULOAD256PD(pRow[at+2]), u2);
530
+ case 2:
531
+ u1 = _mm256_fmadd_pd(pX[at+1], ULOAD256PD(pRow[at+1]), u1);
532
+ case 1:
533
+ u0 = _mm256_fmadd_pd(pX[at+0], ULOAD256PD(pRow[at+0]), u0);
534
+ }
535
+ sum = _mm256_add_pd(
536
+ _mm256_add_pd(_mm256_add_pd(u0, u1), _mm256_add_pd(u2, u3)),
537
+ _mm256_add_pd(_mm256_add_pd(u4, u5), _mm256_add_pd(u6, u7)));
538
+ }
539
+ else {
540
+ switch(nVec) {
541
+ case 0:
542
+ sum = _mm256_setzero_pd();
543
+ break;
544
+ case 1:
545
+ sum = _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0]));
546
+ break;
547
+ case 2:
548
+ sum = _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]),
549
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1])));
550
+ break;
551
+ case 3:
552
+ sum = _mm256_fmadd_pd(pX[2], ULOAD256PD(pRow[2]),
553
+ _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]), _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))));
554
+ break;
555
+ case 4:
556
+ sum = _mm256_add_pd(
557
+ _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]), _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
558
+ _mm256_fmadd_pd(pX[2], ULOAD256PD(pRow[2]), _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3]))));
559
+ break;
560
+ case 5:
561
+ sum = _mm256_fmadd_pd(pX[4], ULOAD256PD(pRow[4]),
562
+ _mm256_add_pd(
563
+ _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]), _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
564
+ _mm256_fmadd_pd(pX[2], ULOAD256PD(pRow[2]), _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))));
565
+ break;
566
+ case 6:
567
+ sum = _mm256_add_pd(
568
+ _mm256_add_pd(
569
+ _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]), _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
570
+ _mm256_fmadd_pd(pX[2], ULOAD256PD(pRow[2]), _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
571
+ _mm256_fmadd_pd(pX[4], ULOAD256PD(pRow[4]), _mm256_mul_pd(pX[5], ULOAD256PD(pRow[5]))));
572
+ break;
573
+ case 7:
574
+ sum = _mm256_add_pd(
575
+ _mm256_add_pd(
576
+ _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]), _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
577
+ _mm256_fmadd_pd(pX[2], ULOAD256PD(pRow[2]), _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
578
+ _mm256_fmadd_pd(pX[6], ULOAD256PD(pRow[6]),
579
+ _mm256_fmadd_pd(pX[4], ULOAD256PD(pRow[4]), _mm256_mul_pd(pX[5], ULOAD256PD(pRow[5])))));
580
+ break;
581
+ }
582
+ }
583
+ const __m128d t = _mm_add_pd(_mm256_extractf128_pd(sum, 0), _mm256_extractf128_pd(sum, 1));
584
+ const double* pComps = (const double*)&t;
585
+ double ans = pComps[0] + pComps[1];
586
+ const ae_int_t tail = nVec<<2;
587
+ for(j=tail; j<n; j++) {
588
+ ans += a->ptr.pp_double[i+ia][j+ja] * x[j];
589
+ }
590
+ y[i] += alpha*ans;
591
+ }
592
+ }
593
+
594
+ void rgemvx_straight_fma(const ae_int_t m, const ae_int_t n,
595
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
596
+ const ae_int_t ja, const double* __restrict x,
597
+ double* __restrict y, ae_state* _state)
598
+ {
599
+ ae_int_t i;
600
+ ae_int_t j;
601
+ if( n<=3 ) {
602
+ for(i=0; i<m; i++) {
603
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
604
+ double v = 0.0;
605
+ for(j=0; j<n; j++) {
606
+ v += p_a[j] * x[j];
607
+ }
608
+ y[i] += alpha*v;
609
+ }
610
+ return;
611
+ }
612
+
613
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
614
+ if(unal == 0)
615
+ {
616
+ rgemvx_straight_fma_xaligned(m, n, alpha, a, ia, ja, x, y, _state);
617
+ return;
618
+ }
619
+ const ptrdiff_t shift = 4-(unal>>3);
620
+ for(i=0; i<m; i++) {
621
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
622
+ double v = 0.0;
623
+ for(j=0; j<shift; j++) {
624
+ v += p_a[j] * x[j];
625
+ }
626
+ y[i] += alpha*v;
627
+ }
628
+ rgemvx_straight_fma_xaligned(m, n-shift, alpha, a, ia, ja+shift, x+shift, y, _state);
629
+ }
630
+
631
+ void rgemvx_transposed_fma_yaligned(const ae_int_t m, const ae_int_t n,
632
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
633
+ const ae_int_t ja, const double* __restrict x, double* __restrict y,
634
+ ae_state* _state)
635
+ {
636
+ ae_int_t i;
637
+ ae_int_t j;
638
+ __m256d* __restrict pY = (__m256d*)y;
639
+ const ae_int_t nVec = m >> 2;
640
+
641
+ for(i=0; i<=n-1; i++)
642
+ {
643
+ const __m256d* __restrict pRow = (const __m256d*)(a->ptr.pp_double[i+ia]+ja);
644
+ const double v = alpha*x[i];
645
+ const __m256d vV = _mm256_set1_pd(v);
646
+ for(j=0; j<nVec; j++)
647
+ {
648
+ pY[j] = _mm256_fmadd_pd(vV, ULOAD256PD(pRow[j]), pY[j]);
649
+ }
650
+ const ae_int_t tail = nVec<<2;
651
+ for(j=tail; j<m; j++) {
652
+ y[j] += v*a->ptr.pp_double[i+ia][j+ja];
653
+ }
654
+ }
655
+ }
656
+
657
+ void rgemvx_transposed_fma(const ae_int_t m, const ae_int_t n,
658
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
659
+ const ae_int_t ja, const double* __restrict x, double* __restrict y,
660
+ ae_state* _state)
661
+ {
662
+ ae_int_t i;
663
+ ae_int_t j;
664
+ if( m<=3 ) {
665
+ for(i=0; i<n; i++) {
666
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
667
+ const double v = alpha*x[i];
668
+ for(j=0; j<m; j++) {
669
+ y[j] += v*p_a[j];
670
+ }
671
+ }
672
+ return;
673
+ }
674
+
675
+ const ptrdiff_t unal = ((ptrdiff_t)y) & 31;
676
+ if(unal == 0)
677
+ {
678
+ rgemvx_transposed_fma_yaligned(m, n, alpha, a, ia, ja, x, y, _state);
679
+ return;
680
+ }
681
+ const ptrdiff_t shift = 4-(unal>>3);
682
+ for(i=0; i<n; i++) {
683
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
684
+ const double v = alpha*x[i];
685
+ for(j=0; j<shift; j++) {
686
+ y[j] += v*p_a[j];
687
+ }
688
+ }
689
+ rgemvx_transposed_fma_yaligned(m-shift, n, alpha, a, ia, ja+shift, x, y+shift, _state);
690
+ }
691
+
692
+ /*************************************************************************
693
+ Computes product A*transpose(B) of two MICRO_SIZE*ROUND_LENGTH rowwise
694
+ 'horizontal' matrices, stored with stride=block_size, and writes it to the
695
+ row-wise matrix C.
696
+
697
+ ROUND_LENGTH is expected to be properly SIMD-rounded length, as returned
698
+ by ablasf_packblkh_avx2().
699
+
700
+ Present version of the function supports only MICRO_SIZE=2, the behavior
701
+ is undefined for other micro sizes.
702
+
703
+ Requires AVX2, does NOT check its presence.
704
+
705
+ -- ALGLIB routine --
706
+ 19.07.2021
707
+ Bochkanov Sergey
708
+ *************************************************************************/
709
+ void ablasf_dotblkh_fma(
710
+ const double *src_a,
711
+ const double *src_b,
712
+ ae_int_t round_length,
713
+ ae_int_t block_size,
714
+ ae_int_t micro_size,
715
+ double *dst,
716
+ ae_int_t dst_stride)
717
+ {
718
+ ae_int_t z;
719
+ __m256d r00 = _mm256_setzero_pd(), r01 = _mm256_setzero_pd(), r10 = _mm256_setzero_pd(), r11 = _mm256_setzero_pd();
720
+ if( round_length&0x7 )
721
+ {
722
+ /*
723
+ * round_length is multiple of 4, but not multiple of 8
724
+ */
725
+ for(z=0; z<round_length; z+=4, src_a+=4, src_b+=4)
726
+ {
727
+ __m256d a0 = _mm256_load_pd(src_a);
728
+ __m256d a1 = _mm256_load_pd(src_a+block_size);
729
+ __m256d b0 = _mm256_load_pd(src_b);
730
+ __m256d b1 = _mm256_load_pd(src_b+block_size);
731
+ r00 = _mm256_fmadd_pd(a0, b0, r00);
732
+ r01 = _mm256_fmadd_pd(a0, b1, r01);
733
+ r10 = _mm256_fmadd_pd(a1, b0, r10);
734
+ r11 = _mm256_fmadd_pd(a1, b1, r11);
735
+ }
736
+ }
737
+ else
738
+ {
739
+ /*
740
+ * round_length is multiple of 8
741
+ */
742
+ for(z=0; z<round_length; z+=8, src_a+=8, src_b+=8)
743
+ {
744
+ __m256d a0 = _mm256_load_pd(src_a);
745
+ __m256d a1 = _mm256_load_pd(src_a+block_size);
746
+ __m256d b0 = _mm256_load_pd(src_b);
747
+ __m256d b1 = _mm256_load_pd(src_b+block_size);
748
+ __m256d c0 = _mm256_load_pd(src_a+4);
749
+ __m256d c1 = _mm256_load_pd(src_a+block_size+4);
750
+ __m256d d0 = _mm256_load_pd(src_b+4);
751
+ __m256d d1 = _mm256_load_pd(src_b+block_size+4);
752
+ r00 = _mm256_fmadd_pd(c0, d0, _mm256_fmadd_pd(a0, b0, r00));
753
+ r01 = _mm256_fmadd_pd(c0, d1, _mm256_fmadd_pd(a0, b1, r01));
754
+ r10 = _mm256_fmadd_pd(c1, d0, _mm256_fmadd_pd(a1, b0, r10));
755
+ r11 = _mm256_fmadd_pd(c1, d1, _mm256_fmadd_pd(a1, b1, r11));
756
+ }
757
+ }
758
+ __m256d sum0 = _mm256_hadd_pd(r00,r01);
759
+ __m256d sum1 = _mm256_hadd_pd(r10,r11);
760
+ _mm_store_pd(dst, _mm_add_pd(_mm256_castpd256_pd128(sum0), _mm256_extractf128_pd(sum0,1)));
761
+ _mm_store_pd(dst+dst_stride, _mm_add_pd(_mm256_castpd256_pd128(sum1), _mm256_extractf128_pd(sum1,1)));
762
+ }
763
+
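For reference, a scalar sketch of the 2x2 micro-kernel that ablasf_dotblkh_fma above implements (illustrative only, not code from the gem; plain C types, micro_size fixed at 2 as the routine's comment requires):

#include <stddef.h>

/* dst[i*dst_stride + j] = sum_{z<round_length} src_a[i*block_size + z] * src_b[j*block_size + z]
   for i, j in {0, 1}; the SIMD kernel above produces the same 2x2 block up to
   floating-point reassociation (it overwrites dst, it does not accumulate). */
static void dotblkh_scalar_2x2(const double *src_a, const double *src_b,
                               size_t round_length, size_t block_size,
                               double *dst, size_t dst_stride)
{
    for (size_t i = 0; i < 2; i++)
        for (size_t j = 0; j < 2; j++) {
            double s = 0.0;
            for (size_t z = 0; z < round_length; z++)
                s += src_a[i*block_size + z] * src_b[j*block_size + z];
            dst[i*dst_stride + j] = s;
        }
}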
764
+
765
+ /*************************************************************************
766
+ Solving linear system: propagating computed supernode.
767
+
768
+ Propagates computed supernode to the rest of the RHS using SIMD-friendly
769
+ RHS storage format.
770
+
771
+ INPUT PARAMETERS:
772
+
773
+ OUTPUT PARAMETERS:
774
+
775
+ -- ALGLIB routine --
776
+ 08.09.2021
777
+ Bochkanov Sergey
778
+ *************************************************************************/
779
+ void spchol_propagatefwd_fma(/* Real */ const ae_vector* x,
780
+ ae_int_t cols0,
781
+ ae_int_t blocksize,
782
+ /* Integer */ const ae_vector* superrowidx,
783
+ ae_int_t rbase,
784
+ ae_int_t offdiagsize,
785
+ /* Real */ const ae_vector* rowstorage,
786
+ ae_int_t offss,
787
+ ae_int_t sstride,
788
+ /* Real */ ae_vector* simdbuf,
789
+ ae_int_t simdwidth,
790
+ ae_state *_state)
791
+ {
792
+ ae_int_t k;
793
+
794
+ ae_assert(simdwidth==4, "SPCHOL: unexpected stride in propagatefwd()", _state);
795
+ if( sstride==4 )
796
+ {
797
+ /*
798
+ * blocksize is either 3 or 4
799
+ */
800
+ ae_int_t supported = ae_cpuid();
801
+ if( supported&CPU_FMA )
802
+ {
803
+ __m256d x_simd;
804
+ double *p_mat_row = rowstorage->ptr.p_double+offss+blocksize*4;
805
+ double *p_simd_buf = simdbuf->ptr.p_double;
806
+ ae_int_t *p_rowidx = superrowidx->ptr.p_int+rbase;
807
+ if( blocksize==3 )
808
+ {
809
+ double xx[4];
810
+ xx[0] = x->ptr.p_double[cols0];
811
+ xx[1] = x->ptr.p_double[cols0+1];
812
+ xx[2] = x->ptr.p_double[cols0+2];
813
+ xx[3] = 0;
814
+ x_simd = _mm256_loadu_pd(xx);
815
+ }
816
+ else
817
+ x_simd = _mm256_loadu_pd(x->ptr.p_double+cols0);
818
+ for(k=0; k<offdiagsize; k++)
819
+ {
820
+ double *p_buf_row = p_simd_buf+p_rowidx[k]*4;
821
+ _mm256_store_pd(p_buf_row, _mm256_fnmadd_pd(_mm256_load_pd(p_mat_row), x_simd, _mm256_load_pd(p_buf_row)));
822
+ p_mat_row += 4;
823
+ }
824
+ return;
825
+ }
826
+ }
827
+ if( blocksize==2 && sstride==2 )
828
+ {
829
+ /*
830
+ * blocksize is 2, stride is 2
831
+ */
832
+ ae_int_t supported = ae_cpuid();
833
+ if( supported&CPU_FMA )
834
+ {
835
+ __m128d x_simd = _mm_loadu_pd(x->ptr.p_double+cols0);
836
+ double *p_mat_row = rowstorage->ptr.p_double+offss+2*2;
837
+ double *p_simd_buf = simdbuf->ptr.p_double;
838
+ ae_int_t *p_rowidx = superrowidx->ptr.p_int+rbase;
839
+ for(k=0; k<offdiagsize; k++)
840
+ {
841
+ double *p_buf_row = p_simd_buf+p_rowidx[k]*4;
842
+ _mm_store_pd(p_buf_row, _mm_fnmadd_pd(_mm_load_pd(p_mat_row), x_simd, _mm_load_pd(p_buf_row)));
843
+ p_mat_row += 2;
844
+ }
845
+ return;
846
+ }
847
+ }
848
+ ae_assert(ae_false, "spchol_propagatefwd_fma: unsupported input", _state);
849
+ }
850
+
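For reference, a scalar sketch of the sstride==4 path of spchol_propagatefwd_fma above (illustrative only; plain C types stand in for ae_vector/ae_int_t, and the index array is passed directly rather than through superrowidx->ptr.p_int):

#include <stddef.h>

/* Each of the offdiagsize off-diagonal rows of the supernode subtracts
   row[j]*x[cols0+j] from its 4-wide slot in simdbuf; for blocksize==3 the
   kernel above pads x with a zero, which the j<blocksize bound makes explicit. */
static void propagatefwd_scalar(const double *x, size_t cols0, size_t blocksize,
                                const ptrdiff_t *superrowidx, size_t rbase,
                                size_t offdiagsize, const double *rowstorage,
                                size_t offss, double *simdbuf)
{
    const double *mat_row = rowstorage + offss + blocksize*4;
    for (size_t k = 0; k < offdiagsize; k++, mat_row += 4) {
        double *buf_row = simdbuf + (size_t)superrowidx[rbase + k]*4;
        for (size_t j = 0; j < blocksize; j++)
            buf_row[j] -= mat_row[j] * x[cols0 + j];
    }
}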
851
+ ae_bool spchol_updatekernelabc4_fma(double* rowstorage,
852
+ ae_int_t offss,
853
+ ae_int_t twidth,
854
+ ae_int_t offsu,
855
+ ae_int_t uheight,
856
+ ae_int_t urank,
857
+ ae_int_t urowstride,
858
+ ae_int_t uwidth,
859
+ const double* diagd,
860
+ ae_int_t offsd,
861
+ const ae_int_t* raw2smap,
862
+ const ae_int_t* superrowidx,
863
+ ae_int_t urbase,
864
+ ae_state *_state)
865
+ {
866
+ ae_int_t k;
867
+ ae_int_t targetrow;
868
+ ae_int_t targetcol;
869
+
870
+ /*
871
+ * Filter out unsupported combinations (ones that are too sparse for the non-SIMD code)
872
+ */
873
+ if( twidth<3||twidth>4 )
874
+ {
875
+ return ae_false;
876
+ }
877
+ if( uwidth<1||uwidth>4 )
878
+ {
879
+ return ae_false;
880
+ }
881
+ if( urank>4 )
882
+ {
883
+ return ae_false;
884
+ }
885
+
886
+ /*
887
+ * Shift input arrays to the beginning of the working area.
888
+ * Prepare SIMD masks
889
+ */
890
+ __m256i v_rankmask = _mm256_cmpgt_epi64(_mm256_set_epi64x(urank, urank, urank, urank), _mm256_set_epi64x(3, 2, 1, 0));
891
+ double *update_storage = rowstorage+offsu;
892
+ double *target_storage = rowstorage+offss;
893
+ superrowidx += urbase;
894
+
895
+ /*
896
+ * Load head of the update matrix
897
+ */
898
+ __m256d v_d0123 = _mm256_maskload_pd(diagd+offsd, v_rankmask);
899
+ __m256d u_0_0123 = _mm256_setzero_pd();
900
+ __m256d u_1_0123 = _mm256_setzero_pd();
901
+ __m256d u_2_0123 = _mm256_setzero_pd();
902
+ __m256d u_3_0123 = _mm256_setzero_pd();
903
+ for(k=0; k<=uwidth-1; k++)
904
+ {
905
+ targetcol = raw2smap[superrowidx[k]];
906
+ if( targetcol==0 )
907
+ u_0_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
908
+ if( targetcol==1 )
909
+ u_1_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
910
+ if( targetcol==2 )
911
+ u_2_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
912
+ if( targetcol==3 )
913
+ u_3_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
914
+ }
915
+
916
+ /*
917
+ * Transpose head
918
+ */
919
+ __m256d u01_lo = _mm256_unpacklo_pd(u_0_0123,u_1_0123);
920
+ __m256d u01_hi = _mm256_unpackhi_pd(u_0_0123,u_1_0123);
921
+ __m256d u23_lo = _mm256_unpacklo_pd(u_2_0123,u_3_0123);
922
+ __m256d u23_hi = _mm256_unpackhi_pd(u_2_0123,u_3_0123);
923
+ __m256d u_0123_0 = _mm256_permute2f128_pd(u01_lo, u23_lo, 0x20);
924
+ __m256d u_0123_1 = _mm256_permute2f128_pd(u01_hi, u23_hi, 0x20);
925
+ __m256d u_0123_2 = _mm256_permute2f128_pd(u23_lo, u01_lo, 0x13);
926
+ __m256d u_0123_3 = _mm256_permute2f128_pd(u23_hi, u01_hi, 0x13);
927
+
928
+ /*
929
+ * Run update
930
+ */
931
+ if( urank==1 )
932
+ {
933
+ for(k=0; k<=uheight-1; k++)
934
+ {
935
+ targetrow = raw2smap[superrowidx[k]]*4;
936
+ double *update_row = rowstorage+offsu+k*urowstride;
937
+ _mm256_store_pd(target_storage+targetrow,
938
+ _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+0), u_0123_0,
939
+ _mm256_load_pd(target_storage+targetrow)));
940
+ }
941
+ }
942
+ if( urank==2 )
943
+ {
944
+ for(k=0; k<=uheight-1; k++)
945
+ {
946
+ targetrow = raw2smap[superrowidx[k]]*4;
947
+ double *update_row = rowstorage+offsu+k*urowstride;
948
+ _mm256_store_pd(target_storage+targetrow,
949
+ _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+1), u_0123_1,
950
+ _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+0), u_0123_0,
951
+ _mm256_load_pd(target_storage+targetrow))));
952
+ }
953
+ }
954
+ if( urank==3 )
955
+ {
956
+ for(k=0; k<=uheight-1; k++)
957
+ {
958
+ targetrow = raw2smap[superrowidx[k]]*4;
959
+ double *update_row = rowstorage+offsu+k*urowstride;
960
+ _mm256_store_pd(target_storage+targetrow,
961
+ _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+2), u_0123_2,
962
+ _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+1), u_0123_1,
963
+ _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+0), u_0123_0,
964
+ _mm256_load_pd(target_storage+targetrow)))));
965
+ }
966
+ }
967
+ if( urank==4 )
968
+ {
969
+ for(k=0; k<=uheight-1; k++)
970
+ {
971
+ targetrow = raw2smap[superrowidx[k]]*4;
972
+ double *update_row = rowstorage+offsu+k*urowstride;
973
+ _mm256_store_pd(target_storage+targetrow,
974
+ _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+3), u_0123_3,
975
+ _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+2), u_0123_2,
976
+ _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+1), u_0123_1,
977
+ _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+0), u_0123_0,
978
+ _mm256_load_pd(target_storage+targetrow))))));
979
+ }
980
+ }
981
+ return ae_true;
982
+ }
983
+
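The unpacklo/unpackhi + permute2f128 sequence in the "Transpose head" step above (and again in spchol_updatekernel4444_fma below) is the standard AVX 4x4 transpose of doubles. An illustrative standalone version, assuming an AVX-capable build (e.g. -mavx2 -mfma):

#include <immintrin.h>

/* Transposes a 4x4 block of doubles held in four __m256d registers
   (row r in r[r]) into four column registers (column c in c[c]). */
static void transpose4x4_pd(const __m256d r[4], __m256d c[4])
{
    __m256d t01_lo = _mm256_unpacklo_pd(r[0], r[1]); /* r0[0] r1[0] r0[2] r1[2] */
    __m256d t01_hi = _mm256_unpackhi_pd(r[0], r[1]); /* r0[1] r1[1] r0[3] r1[3] */
    __m256d t23_lo = _mm256_unpacklo_pd(r[2], r[3]);
    __m256d t23_hi = _mm256_unpackhi_pd(r[2], r[3]);
    c[0] = _mm256_permute2f128_pd(t01_lo, t23_lo, 0x20); /* column 0 */
    c[1] = _mm256_permute2f128_pd(t01_hi, t23_hi, 0x20); /* column 1 */
    c[2] = _mm256_permute2f128_pd(t23_lo, t01_lo, 0x13); /* column 2 */
    c[3] = _mm256_permute2f128_pd(t23_hi, t01_hi, 0x13); /* column 3 */
}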
984
+ ae_bool spchol_updatekernel4444_fma(
985
+ double* rowstorage,
986
+ ae_int_t offss,
987
+ ae_int_t sheight,
988
+ ae_int_t offsu,
989
+ ae_int_t uheight,
990
+ const double* diagd,
991
+ ae_int_t offsd,
992
+ const ae_int_t* raw2smap,
993
+ const ae_int_t* superrowidx,
994
+ ae_int_t urbase,
995
+ ae_state *_state)
996
+ {
997
+ ae_int_t k;
998
+ ae_int_t targetrow;
999
+ ae_int_t offsk;
1000
+ __m256d v_negd_u0, v_negd_u1, v_negd_u2, v_negd_u3, v_negd;
1001
+ __m256d v_w0, v_w1, v_w2, v_w3, u01_lo, u01_hi, u23_lo, u23_hi;
1002
+
1003
+ /*
1004
+ * Compute W = -D*transpose(U[0:3])
1005
+ */
1006
+ v_negd = _mm256_mul_pd(_mm256_loadu_pd(diagd+offsd),_mm256_set1_pd(-1.0));
1007
+ v_negd_u0 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+0*4),v_negd);
1008
+ v_negd_u1 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+1*4),v_negd);
1009
+ v_negd_u2 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+2*4),v_negd);
1010
+ v_negd_u3 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+3*4),v_negd);
1011
+ u01_lo = _mm256_unpacklo_pd(v_negd_u0,v_negd_u1);
1012
+ u01_hi = _mm256_unpackhi_pd(v_negd_u0,v_negd_u1);
1013
+ u23_lo = _mm256_unpacklo_pd(v_negd_u2,v_negd_u3);
1014
+ u23_hi = _mm256_unpackhi_pd(v_negd_u2,v_negd_u3);
1015
+ v_w0 = _mm256_permute2f128_pd(u01_lo, u23_lo, 0x20);
1016
+ v_w1 = _mm256_permute2f128_pd(u01_hi, u23_hi, 0x20);
1017
+ v_w2 = _mm256_permute2f128_pd(u23_lo, u01_lo, 0x13);
1018
+ v_w3 = _mm256_permute2f128_pd(u23_hi, u01_hi, 0x13);
1019
+
1020
+ //
1021
+ // Compute update S:= S + row_scatter(U*W)
1022
+ //
1023
+ if( sheight==uheight )
1024
+ {
1025
+ /*
1026
+ * No row scatter, the most efficient code
1027
+ */
1028
+ for(k=0; k<uheight; k++)
1029
+ {
1030
+ targetrow = offss+k*4;
1031
+ offsk = offsu+k*4;
1032
+ _mm256_store_pd(rowstorage+targetrow,
1033
+ _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+3),v_w3,
1034
+ _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+2),v_w2,
1035
+ _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+1),v_w1,
1036
+ _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+0),v_w0,
1037
+ _mm256_load_pd(rowstorage+targetrow))))));
1038
+ }
1039
+ }
1040
+ else
1041
+ {
1042
+ /*
1043
+ * Row scatter is performed, less efficient code using double mapping to determine target row index
1044
+ */
1045
+ for(k=0; k<=uheight-1; k++)
1046
+ {
1047
+ targetrow = offss+raw2smap[superrowidx[urbase+k]]*4;
1048
+ offsk = offsu+k*4;
1049
+ _mm256_store_pd(rowstorage+targetrow,
1050
+ _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+3),v_w3,
1051
+ _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+2),v_w2,
1052
+ _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+1),v_w1,
1053
+ _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+0),v_w0,
1054
+ _mm256_load_pd(rowstorage+targetrow))))));
1055
+ }
1056
+ }
1057
+ return ae_true;
1058
+ }
1059
+
1060
+ /* ALGLIB_NO_FAST_KERNELS, _ALGLIB_HAS_FMA_INTRINSICS */
1061
+ #endif
1062
+
1063
+
1064
+ }
1065
+