alglib4 0.0.0
- checksums.yaml +7 -0
- data/README.md +47 -0
- data/ext/alglib/alglib.cpp +537 -0
- data/ext/alglib/alglib_array_converters.cpp +86 -0
- data/ext/alglib/alglib_array_converters.h +15 -0
- data/ext/alglib/alglib_utils.cpp +10 -0
- data/ext/alglib/alglib_utils.h +6 -0
- data/ext/alglib/alglibinternal.cpp +21749 -0
- data/ext/alglib/alglibinternal.h +2168 -0
- data/ext/alglib/alglibmisc.cpp +9106 -0
- data/ext/alglib/alglibmisc.h +2114 -0
- data/ext/alglib/ap.cpp +20094 -0
- data/ext/alglib/ap.h +7244 -0
- data/ext/alglib/dataanalysis.cpp +52588 -0
- data/ext/alglib/dataanalysis.h +10601 -0
- data/ext/alglib/diffequations.cpp +1342 -0
- data/ext/alglib/diffequations.h +282 -0
- data/ext/alglib/extconf.rb +5 -0
- data/ext/alglib/fasttransforms.cpp +4696 -0
- data/ext/alglib/fasttransforms.h +1018 -0
- data/ext/alglib/integration.cpp +4249 -0
- data/ext/alglib/integration.h +869 -0
- data/ext/alglib/interpolation.cpp +74502 -0
- data/ext/alglib/interpolation.h +12264 -0
- data/ext/alglib/kernels_avx2.cpp +2171 -0
- data/ext/alglib/kernels_avx2.h +201 -0
- data/ext/alglib/kernels_fma.cpp +1065 -0
- data/ext/alglib/kernels_fma.h +137 -0
- data/ext/alglib/kernels_sse2.cpp +735 -0
- data/ext/alglib/kernels_sse2.h +100 -0
- data/ext/alglib/linalg.cpp +65182 -0
- data/ext/alglib/linalg.h +9927 -0
- data/ext/alglib/optimization.cpp +135331 -0
- data/ext/alglib/optimization.h +19235 -0
- data/ext/alglib/solvers.cpp +20488 -0
- data/ext/alglib/solvers.h +4781 -0
- data/ext/alglib/specialfunctions.cpp +10672 -0
- data/ext/alglib/specialfunctions.h +2305 -0
- data/ext/alglib/statistics.cpp +19791 -0
- data/ext/alglib/statistics.h +1359 -0
- data/ext/alglib/stdafx.h +2 -0
- data/gpl2.txt +339 -0
- data/gpl3.txt +674 -0
- data/lib/alglib/version.rb +3 -0
- data/lib/alglib.rb +4 -0
- metadata +101 -0
data/ext/alglib/kernels_fma.cpp  @@ -0,0 +1,1065 @@
/*************************************************************************
ALGLIB 4.04.0 (source code generated 2024-12-21)
Copyright (c) Sergey Bochkanov (ALGLIB project).

>>> SOURCE LICENSE >>>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation (www.fsf.org); either version 2 of the
License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

A copy of the GNU General Public License is available at
http://www.fsf.org/licensing/licenses
>>> END OF LICENSE >>>
*************************************************************************/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#endif
#include "stdafx.h"

//
// Must be defined before we include kernel header
//
#define _ALGLIB_IMPL_DEFINES
#define _ALGLIB_INTEGRITY_CHECKS_ONCE

#include "kernels_fma.h"

// disable some irrelevant warnings
#if (AE_COMPILER==AE_MSVC) && !defined(AE_ALL_WARNINGS)
#pragma warning(disable:4100)
#pragma warning(disable:4127)
#pragma warning(disable:4611)
#pragma warning(disable:4702)
#pragma warning(disable:4996)
#endif

namespace alglib_impl
{


#if !defined(ALGLIB_NO_FAST_KERNELS) && defined(_ALGLIB_HAS_FMA_INTRINSICS)
double rdotv_fma(const ae_int_t n,
    /* Real */ const double* __restrict x,
    /* Real */ const double* __restrict y,
    const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const ae_int_t fmaLen = (avx2len >> 2) << 2;
    const __m256d* __restrict pX = (const __m256d*)(x);
    const __m256d* __restrict pY = (const __m256d*)(y);
    __m256d ans;
    if(fmaLen >= 4) {
        __m256d fmaUnroll[4];
        fmaUnroll[0] = _mm256_mul_pd(pX[0], pY[0]);
        fmaUnroll[1] = _mm256_mul_pd(pX[1], pY[1]);
        fmaUnroll[2] = _mm256_mul_pd(pX[2], pY[2]);
        fmaUnroll[3] = _mm256_mul_pd(pX[3], pY[3]);
        for(i=4; i<fmaLen; i+=4) {
            fmaUnroll[0] = _mm256_fmadd_pd(pX[i], pY[i], fmaUnroll[0]);
            fmaUnroll[1] = _mm256_fmadd_pd(pX[i+1], pY[i+1], fmaUnroll[1]);
            fmaUnroll[2] = _mm256_fmadd_pd(pX[i+2], pY[i+2], fmaUnroll[2]);
            fmaUnroll[3] = _mm256_fmadd_pd(pX[i+3], pY[i+3], fmaUnroll[3]);
        }
        switch(avx2len-fmaLen) {
        case 3:
            fmaUnroll[2] = _mm256_fmadd_pd(pX[i+2], pY[i+2], fmaUnroll[2]);
        case 2:
            fmaUnroll[1] = _mm256_fmadd_pd(pX[i+1], pY[i+1], fmaUnroll[1]);
        case 1:
            fmaUnroll[0] = _mm256_fmadd_pd(pX[i], pY[i], fmaUnroll[0]);
        }
        ans = _mm256_add_pd(
            _mm256_add_pd(fmaUnroll[0], fmaUnroll[1]),
            _mm256_add_pd(fmaUnroll[2], fmaUnroll[3]));
    }
    else {
        ans = _mm256_setzero_pd();
        switch(avx2len) {
        case 3:
            ans = _mm256_mul_pd(pX[2], pY[2]);
        case 2:
            ans = _mm256_fmadd_pd(pX[1], pY[1], ans);
        case 1:
            ans = _mm256_fmadd_pd(pX[0], pY[0], ans);
        }
    }
    const __m128d s = _mm_add_pd(_mm256_extractf128_pd(ans, 0), _mm256_extractf128_pd(ans, 1));
    const double *pComps = (const double*)&s;
    double dot = pComps[0] + pComps[1];
    const ae_int_t tail = avx2len<<2;
    switch(n-tail) {
    case 1: {
        dot += x[tail]*y[tail];
        break;
    }
    case 2: {
        dot += x[tail+0]*y[tail+0];
        dot += x[tail+1]*y[tail+1];
        break;
    }
    case 3: {
        dot += x[tail+0]*y[tail+0];
        dot += x[tail+1]*y[tail+1];
        dot += x[tail+2]*y[tail+2];
        break;
    }
    }
    return dot;
}

double rdotv2_fma(const ae_int_t n,
    /* Real */ const double* __restrict x,
    const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const ae_int_t fmaLen = (avx2len >> 2) << 2;
    const __m256d* __restrict pX = (const __m256d*)(x);
    __m256d ans;
    if(fmaLen >= 4) {
        // TODO: this can be further unrolled to 8 because AVX(2) provides 16 registers
        __m256d fmaUnroll[4];
        fmaUnroll[0] = _mm256_mul_pd(pX[0], pX[0]);
        fmaUnroll[1] = _mm256_mul_pd(pX[1], pX[1]);
        fmaUnroll[2] = _mm256_mul_pd(pX[2], pX[2]);
        fmaUnroll[3] = _mm256_mul_pd(pX[3], pX[3]);
        for(i=4; i<fmaLen; i+=4) {
            fmaUnroll[0] = _mm256_fmadd_pd(pX[i], pX[i], fmaUnroll[0]);
            fmaUnroll[1] = _mm256_fmadd_pd(pX[i+1], pX[i+1], fmaUnroll[1]);
            fmaUnroll[2] = _mm256_fmadd_pd(pX[i+2], pX[i+2], fmaUnroll[2]);
            fmaUnroll[3] = _mm256_fmadd_pd(pX[i+3], pX[i+3], fmaUnroll[3]);
        }
        switch(avx2len-fmaLen) {
        case 3:
            fmaUnroll[2] = _mm256_fmadd_pd(pX[i+2], pX[i+2], fmaUnroll[2]);
        case 2:
            fmaUnroll[1] = _mm256_fmadd_pd(pX[i+1], pX[i+1], fmaUnroll[1]);
        case 1:
            fmaUnroll[0] = _mm256_fmadd_pd(pX[i], pX[i], fmaUnroll[0]);
        }
        ans = _mm256_add_pd(
            _mm256_add_pd(fmaUnroll[0], fmaUnroll[1]),
            _mm256_add_pd(fmaUnroll[2], fmaUnroll[3]));
    }
    else {
        ans = _mm256_setzero_pd();
        switch(avx2len) {
        case 3:
            ans = _mm256_mul_pd(pX[2], pX[2]);
        case 2:
            ans = _mm256_fmadd_pd(pX[1], pX[1], ans);
        case 1:
            ans = _mm256_fmadd_pd(pX[0], pX[0], ans);
        }
    }
    const __m128d s = _mm_add_pd(_mm256_extractf128_pd(ans, 0), _mm256_extractf128_pd(ans, 1));
    const double *pComps = (const double*)&s;
    double dot = pComps[0] + pComps[1];
    const ae_int_t tail = avx2len<<2;
    switch(n-tail) {
    case 1: {
        dot += x[tail]*x[tail];
        break;
    }
    case 2: {
        dot += x[tail+0]*x[tail+0];
        dot += x[tail+1]*x[tail+1];
        break;
    }
    case 3: {
        dot += x[tail+0]*x[tail+0];
        dot += x[tail+1]*x[tail+1];
        dot += x[tail+2]*x[tail+2];
        break;
    }
    }
    return dot;
}
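
/*
 * Summary of rdotv_fma/rdotv2_fma above: both compute a dot product
 * (rdotv2_fma is the special case y==x, i.e. the squared norm)
 *
 *     dot = sum_{i=0..n-1} x[i]*y[i]
 *
 * The main loop consumes 4 __m256d lanes (16 doubles) per iteration with
 * four independent accumulators, the switch statements rely on intentional
 * fall-through to drain the remaining 4-double vectors, the two 128-bit
 * halves of the accumulator are then added, and the last n%4 scalars are
 * folded in separately.
 */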

void raddv_fma(const ae_int_t n,
    const double alpha,
    /* Real */ const double* __restrict y,
    /* Real */ double* __restrict x,
    const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t avx2len = n>>2;
    const __m256d* __restrict pSrc = (const __m256d*)(y);
    __m256d* __restrict pDest = (__m256d*)(x);
    const __m256d avx2alpha = _mm256_set1_pd(alpha);
    for(i=0; i<avx2len; i++) {
        pDest[i] = _mm256_fmadd_pd(avx2alpha, pSrc[i], pDest[i]);
    }
    const ae_int_t tail = avx2len<<2;
    switch(n-tail) {
    case 1:
        *(double*)(pDest+i) += alpha*(*(const double*)(pSrc+i));
        break;
    case 2:
        *(__m128d*)(pDest+i) = _mm_fmadd_pd(_mm256_extractf128_pd(avx2alpha, 0), *(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
        break;
    case 3:
        *(__m128d*)(pDest+i) = _mm_fmadd_pd(_mm256_extractf128_pd(avx2alpha, 0), *(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
        x[tail+2] += alpha*y[tail+2];
        break;
    }
}
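
/*
 * raddv_fma above performs x := x + alpha*y on 4-double AVX lanes; the
 * leftover n%4 elements are handled by the trailing switch: one scalar
 * multiply-add for a single element, a 128-bit _mm_fmadd_pd on the low
 * half of avx2alpha for two elements, and the 128-bit FMA plus one scalar
 * update for three.
 */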

void raddvx_fma3avx_xaligned(const ae_int_t n, const double alpha, const double* __restrict y, double* __restrict x, ae_state *_state)
{
    ae_int_t i;
    const ae_int_t vecLen = (n>>2)<<2;
    const __m256d avx2alpha = _mm256_set1_pd(alpha);
    __m256d* __restrict pDest = (__m256d*)x;
    for(i=0; i<vecLen; i+=4)
    {
        const ae_int_t iDest = i>>2;
        pDest[iDest] = _mm256_fmadd_pd(avx2alpha, _mm256_loadu_pd(y+i), pDest[iDest]);
    }
    switch(n-vecLen) {
    case 1:
        x[i] += alpha*y[i];
        break;
    case 2: {
        const ae_int_t iDest = i>>2;
        *(__m128d*)(pDest+iDest) = _mm_fmadd_pd(_mm256_extractf128_pd(avx2alpha, 0),
            _mm_loadu_pd(y+i), *(const __m128d*)(pDest+iDest));
        break;
    }
    case 3:
    {
        const ae_int_t iDest = i>>2;
        *(__m128d*)(pDest+iDest) = _mm_fmadd_pd(_mm256_extractf128_pd(avx2alpha, 0), _mm_loadu_pd(y+i), *(const __m128d*)(pDest+iDest));
        x[i+2] += alpha*y[i+2];
        break;
    }
    }
}

void raddvx_fma(const ae_int_t n, const double alpha, const double* __restrict y, double* __restrict x, ae_state *_state)
{
    const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
    if( n<=4 )
    {
        ae_int_t i;
        for(i=0; i<=n-1; i++)
            x[i] += alpha*y[i];
        return;
    }
    switch(unal) {
    case 0:
        raddvx_fma3avx_xaligned(n, alpha, y, x, _state);
        return;
    case 8:
        x[2] += alpha*y[2];
    case 16:
        x[1] += alpha*y[1];
    case 24: {
        x[0] += alpha*y[0];
        const ptrdiff_t nDone = 4-(unal>>3);
        raddvx_fma3avx_xaligned(n-nDone, alpha, y+nDone, x+nDone, _state);
        return;
    }
    }
}
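
/*
 * raddvx_fma above is the alignment-dispatching wrapper for
 * raddvx_fma3avx_xaligned. The code assumes x is at least 8-byte aligned
 * (a double array), so unal = (ptrdiff_t)x & 31 can only be 0, 8, 16 or
 * 24. For a nonzero value the switch falls through and peels
 * 4-(unal>>3) leading scalars (e.g. unal==16 peels x[0] and x[1]), after
 * which x+nDone is 32-byte aligned and the vector kernel takes over.
 * Vectors with n<=4 are handled entirely in scalar code.
 */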

void rmuladdv_fma(const ae_int_t n,
    const double* __restrict y,
    const double* __restrict z,
    double* __restrict x,
    const ae_state* _state)
{
    ae_int_t i;
    const ae_int_t avx2len = n>>2;
    const __m256d* __restrict pSrc0 = (const __m256d*)y;
    const __m256d* __restrict pSrc1 = (const __m256d*)z;
    __m256d* __restrict pDest = (__m256d*)x;
    for(i=0; i<avx2len; i++)
        pDest[i] = _mm256_fmadd_pd(pSrc0[i], pSrc1[i], pDest[i]);
    const ae_int_t tail = avx2len<<2;
    for(i=tail; i<n; i++)
        x[i] += y[i]*z[i];
}

void rnegmuladdv_fma(const ae_int_t n,
    const double* __restrict y,
    const double* __restrict z,
    double* __restrict x,
    const ae_state* _state)
{
    ae_int_t i;
    const ae_int_t avx2len = n>>2;
    const __m256d* __restrict pSrc0 = (const __m256d*)y;
    const __m256d* __restrict pSrc1 = (const __m256d*)z;
    __m256d* __restrict pDest = (__m256d*)x;
    for(i=0; i<avx2len; i++)
        pDest[i] = _mm256_fnmadd_pd(pSrc0[i], pSrc1[i], pDest[i]);
    const ae_int_t tail = avx2len<<2;
    for(i=tail; i<n; i++)
        x[i] -= y[i]*z[i];
}

void rcopymuladdv_fma(const ae_int_t n,
    const double* __restrict y,
    const double* __restrict z,
    const double* __restrict x,
    double* __restrict r,
    const ae_state* _state)
{
    ae_int_t i;
    const ae_int_t avx2len = n>>2;
    const __m256d* __restrict pSrc0 = (const __m256d*)y;
    const __m256d* __restrict pSrc1 = (const __m256d*)z;
    const __m256d* __restrict pSrc2 = (const __m256d*)x;
    __m256d* __restrict pDest = (__m256d*)r;
    for(i=0; i<avx2len; i++)
        pDest[i] = _mm256_fmadd_pd(pSrc0[i], pSrc1[i], pSrc2[i]);
    const ae_int_t tail = avx2len<<2;
    for(i=tail; i<n; i++)
        r[i] = x[i]+y[i]*z[i];
}

void rcopynegmuladdv_fma(const ae_int_t n,
    const double* __restrict y,
    const double* __restrict z,
    const double* __restrict x,
    double* __restrict r,
    const ae_state* _state)
{
    ae_int_t i;
    const ae_int_t avx2len = n>>2;
    const __m256d* __restrict pSrc0 = (const __m256d*)y;
    const __m256d* __restrict pSrc1 = (const __m256d*)z;
    const __m256d* __restrict pSrc2 = (const __m256d*)x;
    __m256d* __restrict pDest = (__m256d*)r;
    for(i=0; i<avx2len; i++)
        pDest[i] = _mm256_fnmadd_pd(pSrc0[i], pSrc1[i], pSrc2[i]);
    const ae_int_t tail = avx2len<<2;
    for(i=tail; i<n; i++)
        r[i] = x[i]-y[i]*z[i];
}
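
/*
 * The four kernels above are elementwise fused multiply-add variants over
 * vectors of length n:
 *     rmuladdv_fma:        x[i] += y[i]*z[i]
 *     rnegmuladdv_fma:     x[i] -= y[i]*z[i]
 *     rcopymuladdv_fma:    r[i]  = x[i] + y[i]*z[i]
 *     rcopynegmuladdv_fma: r[i]  = x[i] - y[i]*z[i]
 * Each processes n/4 full __m256d vectors with _mm256_fmadd_pd or
 * _mm256_fnmadd_pd and finishes the remaining elements with a scalar loop.
 */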

void rgemv_straight_fma(const ae_int_t m, const ae_int_t n,
    const double alpha, const ae_matrix* __restrict a,
    const double* __restrict x, double* __restrict y, ae_state* _state)
{
    ae_int_t i;
    ae_int_t j;
    __m256d sum = _mm256_setzero_pd();
    const __m256d* __restrict pX = (const __m256d*)x;
    const ae_int_t nVec = n >> 2;
    const ae_int_t nUnroll = nVec >> 3;
    for(i=0; i<m; i++) {
        const __m256d* __restrict pRow = (const __m256d*)a->ptr.pp_double[i];
        if(nUnroll >= 1) {
            __m256d u0 = _mm256_mul_pd(pRow[0], pX[0]);
            __m256d u1 = _mm256_mul_pd(pRow[1], pX[1]);
            __m256d u2 = _mm256_mul_pd(pRow[2], pX[2]);
            __m256d u3 = _mm256_mul_pd(pRow[3], pX[3]);
            __m256d u4 = _mm256_mul_pd(pRow[4], pX[4]);
            __m256d u5 = _mm256_mul_pd(pRow[5], pX[5]);
            __m256d u6 = _mm256_mul_pd(pRow[6], pX[6]);
            __m256d u7 = _mm256_mul_pd(pRow[7], pX[7]);
            for(j=1; j<nUnroll; j++) {
                const ae_int_t at = j<<3;
                u0 = _mm256_fmadd_pd(pRow[at], pX[at], u0);
                u1 = _mm256_fmadd_pd(pRow[at+1], pX[at+1], u1);
                u2 = _mm256_fmadd_pd(pRow[at+2], pX[at+2], u2);
                u3 = _mm256_fmadd_pd(pRow[at+3], pX[at+3], u3);
                u4 = _mm256_fmadd_pd(pRow[at+4], pX[at+4], u4);
                u5 = _mm256_fmadd_pd(pRow[at+5], pX[at+5], u5);
                u6 = _mm256_fmadd_pd(pRow[at+6], pX[at+6], u6);
                u7 = _mm256_fmadd_pd(pRow[at+7], pX[at+7], u7);
            }
            const ae_int_t at = j<<3;
            switch(nVec-at) {
            case 7:
                u6 = _mm256_fmadd_pd(pX[at+6], pRow[at+6], u6);
            case 6:
                u5 = _mm256_fmadd_pd(pX[at+5], pRow[at+5], u5);
            case 5:
                u4 = _mm256_fmadd_pd(pX[at+4], pRow[at+4], u4);
            case 4:
                u3 = _mm256_fmadd_pd(pX[at+3], pRow[at+3], u3);
            case 3:
                u2 = _mm256_fmadd_pd(pX[at+2], pRow[at+2], u2);
            case 2:
                u1 = _mm256_fmadd_pd(pX[at+1], pRow[at+1], u1);
            case 1:
                u0 = _mm256_fmadd_pd(pX[at+0], pRow[at+0], u0);
            }
            sum = _mm256_add_pd(
                _mm256_add_pd(_mm256_add_pd(u0, u1), _mm256_add_pd(u2, u3)),
                _mm256_add_pd(_mm256_add_pd(u4, u5), _mm256_add_pd(u6, u7)));
        }
        else {
            switch(nVec) {
            case 0:
                sum = _mm256_setzero_pd();
                break;
            case 1:
                sum = _mm256_mul_pd(pX[0], pRow[0]);
                break;
            case 2:
                sum = _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1]));
                break;
            case 3:
                sum = _mm256_fmadd_pd(pX[2], pRow[2],
                    _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1])));
                break;
            case 4:
                sum = _mm256_add_pd(
                    _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1])),
                    _mm256_fmadd_pd(pX[2], pRow[2], _mm256_mul_pd(pX[3], pRow[3])));
                break;
            case 5:
                sum = _mm256_fmadd_pd(pX[4], pRow[4],
                    _mm256_add_pd(
                        _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1])),
                        _mm256_fmadd_pd(pX[2], pRow[2], _mm256_mul_pd(pX[3], pRow[3]))));
                break;
            case 6:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1])),
                        _mm256_fmadd_pd(pX[2], pRow[2], _mm256_mul_pd(pX[3], pRow[3]))),
                    _mm256_fmadd_pd(pX[4], pRow[4], _mm256_mul_pd(pX[5], pRow[5])));
                break;
            case 7:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_fmadd_pd(pX[0], pRow[0], _mm256_mul_pd(pX[1], pRow[1])),
                        _mm256_fmadd_pd(pX[2], pRow[2], _mm256_mul_pd(pX[3], pRow[3]))),
                    _mm256_fmadd_pd(pX[6], pRow[6],
                        _mm256_fmadd_pd(pX[4], pRow[4], _mm256_mul_pd(pX[5], pRow[5]))));
                break;
            }
        }
        const __m128d t = _mm_add_pd(_mm256_extractf128_pd(sum, 0), _mm256_extractf128_pd(sum, 1));
        const double* pComps = (const double*)&t;
        double ans = pComps[0] + pComps[1];
        const ae_int_t tail = nVec<<2;
        for(j=tail; j<n; j++) {
            ans += a->ptr.pp_double[i][j] * x[j];
        }
        y[i] += alpha*ans;
    }
}
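
/*
 * rgemv_straight_fma above computes y[i] += alpha*dot(A[i,0..n-1], x) for
 * each of the m rows. Per row, nUnroll = (n/4)/8 iterations consume eight
 * __m256d vectors (32 doubles) with eight independent FMA accumulators;
 * the first switch (fall-through) drains the remaining full vectors, the
 * second switch builds the sum directly for short rows, and a scalar loop
 * handles the last n%4 columns before scaling by alpha.
 */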

void rgemv_transposed_fma(const ae_int_t m, const ae_int_t n,
    const double alpha, const ae_matrix* __restrict a,
    const double* __restrict x, double* __restrict y, ae_state* _state)
{
    ae_int_t i;
    ae_int_t j;
    __m256d* __restrict pY = (__m256d*)y;
    const ae_int_t nVec = m >> 2;

    for(i=0; i<=n-1; i++)
    {
        const __m256d* __restrict pRow = (const __m256d*)a->ptr.pp_double[i];
        const double v = alpha*x[i];
        const __m256d vV = _mm256_set1_pd(v);
        for(j=0; j<nVec; j++)
        {
            pY[j] = _mm256_fmadd_pd(vV, pRow[j], pY[j]);
        }
        const ae_int_t tail = nVec<<2;
        for(j=tail; j<m; j++) {
            y[j] += v*a->ptr.pp_double[i][j];
        }
    }
}
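
/*
 * rgemv_transposed_fma above computes y += alpha*A^T*x without forming the
 * transpose: for every row i it broadcasts v = alpha*x[i] and accumulates
 * v*A[i,:] into y with vector FMAs (an axpy per row), finishing the last
 * m%4 entries of y in scalar code.
 */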

void rgemvx_straight_fma_xaligned(const ae_int_t m, const ae_int_t n,
    const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
    const ae_int_t ja, const double* __restrict x,
    double* __restrict y, ae_state* _state)
{
    ae_int_t i;
    ae_int_t j;
    const __m256d* __restrict pX = (const __m256d*)x;
    const ae_int_t nVec = n >> 2;
    const ae_int_t nUnroll = nVec >> 3;
    __m256d sum = _mm256_setzero_pd();
    for(i=0; i<m; i++) {
        const __m256d* __restrict pRow = (const __m256d*)(a->ptr.pp_double[i+ia]+ja);
        if(nUnroll >= 1) {
            __m256d u0 = _mm256_mul_pd(ULOAD256PD(pRow[0]), pX[0]);
            __m256d u1 = _mm256_mul_pd(ULOAD256PD(pRow[1]), pX[1]);
            __m256d u2 = _mm256_mul_pd(ULOAD256PD(pRow[2]), pX[2]);
            __m256d u3 = _mm256_mul_pd(ULOAD256PD(pRow[3]), pX[3]);
            __m256d u4 = _mm256_mul_pd(ULOAD256PD(pRow[4]), pX[4]);
            __m256d u5 = _mm256_mul_pd(ULOAD256PD(pRow[5]), pX[5]);
            __m256d u6 = _mm256_mul_pd(ULOAD256PD(pRow[6]), pX[6]);
            __m256d u7 = _mm256_mul_pd(ULOAD256PD(pRow[7]), pX[7]);
            for(j=1; j<nUnroll; j++) {
                const ae_int_t at = j<<3;
                u0 = _mm256_fmadd_pd(ULOAD256PD(pRow[at]), pX[at], u0);
                u1 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+1]), pX[at+1], u1);
                u2 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+2]), pX[at+2], u2);
                u3 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+3]), pX[at+3], u3);
                u4 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+4]), pX[at+4], u4);
                u5 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+5]), pX[at+5], u5);
                u6 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+6]), pX[at+6], u6);
                u7 = _mm256_fmadd_pd(ULOAD256PD(pRow[at+7]), pX[at+7], u7);
            }
            const ae_int_t at = j<<3;
            switch(nVec-at) {
            case 7:
                u6 = _mm256_fmadd_pd(pX[at+6], ULOAD256PD(pRow[at+6]), u6);
            case 6:
                u5 = _mm256_fmadd_pd(pX[at+5], ULOAD256PD(pRow[at+5]), u5);
            case 5:
                u4 = _mm256_fmadd_pd(pX[at+4], ULOAD256PD(pRow[at+4]), u4);
            case 4:
                u3 = _mm256_fmadd_pd(pX[at+3], ULOAD256PD(pRow[at+3]), u3);
            case 3:
                u2 = _mm256_fmadd_pd(pX[at+2], ULOAD256PD(pRow[at+2]), u2);
            case 2:
                u1 = _mm256_fmadd_pd(pX[at+1], ULOAD256PD(pRow[at+1]), u1);
            case 1:
                u0 = _mm256_fmadd_pd(pX[at+0], ULOAD256PD(pRow[at+0]), u0);
            }
            sum = _mm256_add_pd(
                _mm256_add_pd(_mm256_add_pd(u0, u1), _mm256_add_pd(u2, u3)),
                _mm256_add_pd(_mm256_add_pd(u4, u5), _mm256_add_pd(u6, u7)));
        }
        else {
            switch(nVec) {
            case 0:
                sum = _mm256_setzero_pd();
                break;
            case 1:
                sum = _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0]));
                break;
            case 2:
                sum = _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]),
                    _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1])));
                break;
            case 3:
                sum = _mm256_fmadd_pd(pX[2], ULOAD256PD(pRow[2]),
                    _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]), _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))));
                break;
            case 4:
                sum = _mm256_add_pd(
                    _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]), _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
                    _mm256_fmadd_pd(pX[2], ULOAD256PD(pRow[2]), _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3]))));
                break;
            case 5:
                sum = _mm256_fmadd_pd(pX[4], ULOAD256PD(pRow[4]),
                    _mm256_add_pd(
                        _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]), _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
                        _mm256_fmadd_pd(pX[2], ULOAD256PD(pRow[2]), _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))));
                break;
            case 6:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]), _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
                        _mm256_fmadd_pd(pX[2], ULOAD256PD(pRow[2]), _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
                    _mm256_fmadd_pd(pX[4], ULOAD256PD(pRow[4]), _mm256_mul_pd(pX[5], ULOAD256PD(pRow[5]))));
                break;
            case 7:
                sum = _mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_fmadd_pd(pX[0], ULOAD256PD(pRow[0]), _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
                        _mm256_fmadd_pd(pX[2], ULOAD256PD(pRow[2]), _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
                    _mm256_fmadd_pd(pX[6], ULOAD256PD(pRow[6]),
                        _mm256_fmadd_pd(pX[4], ULOAD256PD(pRow[4]), _mm256_mul_pd(pX[5], ULOAD256PD(pRow[5])))));
                break;
            }
        }
        const __m128d t = _mm_add_pd(_mm256_extractf128_pd(sum, 0), _mm256_extractf128_pd(sum, 1));
        const double* pComps = (const double*)&t;
        double ans = pComps[0] + pComps[1];
        const ae_int_t tail = nVec<<2;
        for(j=tail; j<n; j++) {
            ans += a->ptr.pp_double[i+ia][j+ja] * x[j];
        }
        y[i] += alpha*ans;
    }
}

void rgemvx_straight_fma(const ae_int_t m, const ae_int_t n,
    const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
    const ae_int_t ja, const double* __restrict x,
    double* __restrict y, ae_state* _state)
{
    ae_int_t i;
    ae_int_t j;
    if( n<=3 ) {
        for(i=0; i<m; i++) {
            const double *p_a = a->ptr.pp_double[ia+i]+ja;
            double v = 0.0;
            for(j=0; j<n; j++) {
                v += p_a[j] * x[j];
            }
            y[i] += alpha*v;
        }
        return;
    }

    const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
    if(unal == 0)
    {
        rgemvx_straight_fma_xaligned(m, n, alpha, a, ia, ja, x, y, _state);
        return;
    }
    const ptrdiff_t shift = 4-(unal>>3);
    for(i=0; i<m; i++) {
        const double *p_a = a->ptr.pp_double[ia+i]+ja;
        double v = 0.0;
        for(j=0; j<shift; j++) {
            v += p_a[j] * x[j];
        }
        y[i] += alpha*v;
    }
    rgemvx_straight_fma_xaligned(m, n-shift, alpha, a, ia, ja+shift, x+shift, y, _state);
}

void rgemvx_transposed_fma_yaligned(const ae_int_t m, const ae_int_t n,
    const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
    const ae_int_t ja, const double* __restrict x, double* __restrict y,
    ae_state* _state)
{
    ae_int_t i;
    ae_int_t j;
    __m256d* __restrict pY = (__m256d*)y;
    const ae_int_t nVec = m >> 2;

    for(i=0; i<=n-1; i++)
    {
        const __m256d* __restrict pRow = (const __m256d*)(a->ptr.pp_double[i+ia]+ja);
        const double v = alpha*x[i];
        const __m256d vV = _mm256_set1_pd(v);
        for(j=0; j<nVec; j++)
        {
            pY[j] = _mm256_fmadd_pd(vV, ULOAD256PD(pRow[j]), pY[j]);
        }
        const ae_int_t tail = nVec<<2;
        for(j=tail; j<m; j++) {
            y[j] += v*a->ptr.pp_double[i+ia][j+ja];
        }
    }
}

void rgemvx_transposed_fma(const ae_int_t m, const ae_int_t n,
    const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
    const ae_int_t ja, const double* __restrict x, double* __restrict y,
    ae_state* _state)
{
    ae_int_t i;
    ae_int_t j;
    if( m<=3 ) {
        for(i=0; i<n; i++) {
            const double *p_a = a->ptr.pp_double[ia+i]+ja;
            const double v = alpha*x[i];
            for(j=0; j<m; j++) {
                y[j] += v*p_a[j];
            }
        }
        return;
    }

    const ptrdiff_t unal = ((ptrdiff_t)y) & 31;
    if(unal == 0)
    {
        rgemvx_transposed_fma_yaligned(m, n, alpha, a, ia, ja, x, y, _state);
        return;
    }
    const ptrdiff_t shift = 4-(unal>>3);
    for(i=0; i<n; i++) {
        const double *p_a = a->ptr.pp_double[ia+i]+ja;
        const double v = alpha*x[i];
        for(j=0; j<shift; j++) {
            y[j] += v*p_a[j];
        }
    }
    rgemvx_transposed_fma_yaligned(m-shift, n, alpha, a, ia, ja+shift, x, y+shift, _state);
}
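
/*
 * The rgemvx_* kernels above are the submatrix variants: they operate on
 * the block of A starting at row ia, column ja. The *_xaligned/_yaligned
 * versions require the vector they address through __m256d pointers
 * (x for the straight product, y for the transposed one) to be 32-byte
 * aligned, while matrix rows are fetched through ULOAD256PD, apparently an
 * unaligned 256-bit load helper from the kernel headers. The dispatching
 * wrappers measure the misalignment, peel the leading 4-(unal>>3) elements
 * of that vector with scalar code, then call the aligned kernel on the
 * remainder; problems with n<=3 (straight) or m<=3 (transposed) are done
 * entirely in scalar code.
 */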

/*************************************************************************
Computes product A*transpose(B) of two MICRO_SIZE*ROUND_LENGTH rowwise
'horizontal' matrices, stored with stride=block_size, and writes it to the
row-wise matrix C.

ROUND_LENGTH is expected to be properly SIMD-rounded length, as returned
by ablasf_packblkh_avx2().

The present version of the function supports only MICRO_SIZE=2; the
behavior is undefined for other micro sizes.

Requires AVX2, does NOT check its presence.

  -- ALGLIB routine --
     19.07.2021
     Bochkanov Sergey
*************************************************************************/
void ablasf_dotblkh_fma(
    const double *src_a,
    const double *src_b,
    ae_int_t round_length,
    ae_int_t block_size,
    ae_int_t micro_size,
    double *dst,
    ae_int_t dst_stride)
{
    ae_int_t z;
    __m256d r00 = _mm256_setzero_pd(), r01 = _mm256_setzero_pd(), r10 = _mm256_setzero_pd(), r11 = _mm256_setzero_pd();
    if( round_length&0x7 )
    {
        /*
         * round_length is multiple of 4, but not multiple of 8
         */
        for(z=0; z<round_length; z+=4, src_a+=4, src_b+=4)
        {
            __m256d a0 = _mm256_load_pd(src_a);
            __m256d a1 = _mm256_load_pd(src_a+block_size);
            __m256d b0 = _mm256_load_pd(src_b);
            __m256d b1 = _mm256_load_pd(src_b+block_size);
            r00 = _mm256_fmadd_pd(a0, b0, r00);
            r01 = _mm256_fmadd_pd(a0, b1, r01);
            r10 = _mm256_fmadd_pd(a1, b0, r10);
            r11 = _mm256_fmadd_pd(a1, b1, r11);
        }
    }
    else
    {
        /*
         * round_length is multiple of 8
         */
        for(z=0; z<round_length; z+=8, src_a+=8, src_b+=8)
        {
            __m256d a0 = _mm256_load_pd(src_a);
            __m256d a1 = _mm256_load_pd(src_a+block_size);
            __m256d b0 = _mm256_load_pd(src_b);
            __m256d b1 = _mm256_load_pd(src_b+block_size);
            __m256d c0 = _mm256_load_pd(src_a+4);
            __m256d c1 = _mm256_load_pd(src_a+block_size+4);
            __m256d d0 = _mm256_load_pd(src_b+4);
            __m256d d1 = _mm256_load_pd(src_b+block_size+4);
            r00 = _mm256_fmadd_pd(c0, d0, _mm256_fmadd_pd(a0, b0, r00));
            r01 = _mm256_fmadd_pd(c0, d1, _mm256_fmadd_pd(a0, b1, r01));
            r10 = _mm256_fmadd_pd(c1, d0, _mm256_fmadd_pd(a1, b0, r10));
            r11 = _mm256_fmadd_pd(c1, d1, _mm256_fmadd_pd(a1, b1, r11));
        }
    }
    __m256d sum0 = _mm256_hadd_pd(r00,r01);
    __m256d sum1 = _mm256_hadd_pd(r10,r11);
    _mm_store_pd(dst, _mm_add_pd(_mm256_castpd256_pd128(sum0), _mm256_extractf128_pd(sum0,1)));
    _mm_store_pd(dst+dst_stride, _mm_add_pd(_mm256_castpd256_pd128(sum1), _mm256_extractf128_pd(sum1,1)));
}
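
/*
 * Reduction detail for ablasf_dotblkh_fma above: r00, r01, r10, r11 hold
 * the lane-wise partial products of rows a0/a1 against rows b0/b1. Each
 * _mm256_hadd_pd call adds adjacent lanes of two accumulators at once, so
 * after adding the low and high 128-bit halves each __m128d holds a full
 * 1x2 slice of C = A*B^T: {dot(a0,b0), dot(a0,b1)} is stored at dst and
 * {dot(a1,b0), dot(a1,b1)} at dst+dst_stride.
 */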


/*************************************************************************
Solving linear system: propagating computed supernode.

Propagates computed supernode to the rest of the RHS using SIMD-friendly
RHS storage format.

INPUT PARAMETERS:

OUTPUT PARAMETERS:

  -- ALGLIB routine --
     08.09.2021
     Bochkanov Sergey
*************************************************************************/
void spchol_propagatefwd_fma(/* Real */ const ae_vector* x,
    ae_int_t cols0,
    ae_int_t blocksize,
    /* Integer */ const ae_vector* superrowidx,
    ae_int_t rbase,
    ae_int_t offdiagsize,
    /* Real */ const ae_vector* rowstorage,
    ae_int_t offss,
    ae_int_t sstride,
    /* Real */ ae_vector* simdbuf,
    ae_int_t simdwidth,
    ae_state *_state)
{
    ae_int_t k;

    ae_assert(simdwidth==4, "SPCHOL: unexpected stride in propagatefwd()", _state);
    if( sstride==4 )
    {
        /*
         * blocksize is either 3 or 4
         */
        ae_int_t supported = ae_cpuid();
        if( supported&CPU_FMA )
        {
            __m256d x_simd;
            double *p_mat_row = rowstorage->ptr.p_double+offss+blocksize*4;
            double *p_simd_buf = simdbuf->ptr.p_double;
            ae_int_t *p_rowidx = superrowidx->ptr.p_int+rbase;
            if( blocksize==3 )
            {
                double xx[4];
                xx[0] = x->ptr.p_double[cols0];
                xx[1] = x->ptr.p_double[cols0+1];
                xx[2] = x->ptr.p_double[cols0+2];
                xx[3] = 0;
                x_simd = _mm256_loadu_pd(xx);
            }
            else
                x_simd = _mm256_loadu_pd(x->ptr.p_double+cols0);
            for(k=0; k<offdiagsize; k++)
            {
                double *p_buf_row = p_simd_buf+p_rowidx[k]*4;
                _mm256_store_pd(p_buf_row, _mm256_fnmadd_pd(_mm256_load_pd(p_mat_row), x_simd, _mm256_load_pd(p_buf_row)));
                p_mat_row += 4;
            }
            return;
        }
    }
    if( blocksize==2 && sstride==2 )
    {
        /*
         * blocksize is 2, stride is 2
         */
        ae_int_t supported = ae_cpuid();
        if( supported&CPU_FMA )
        {
            __m128d x_simd = _mm_loadu_pd(x->ptr.p_double+cols0);
            double *p_mat_row = rowstorage->ptr.p_double+offss+2*2;
            double *p_simd_buf = simdbuf->ptr.p_double;
            ae_int_t *p_rowidx = superrowidx->ptr.p_int+rbase;
            for(k=0; k<offdiagsize; k++)
            {
                double *p_buf_row = p_simd_buf+p_rowidx[k]*4;
                _mm_store_pd(p_buf_row, _mm_fnmadd_pd(_mm_load_pd(p_mat_row), x_simd, _mm_load_pd(p_buf_row)));
                p_mat_row += 2;
            }
            return;
        }
    }
    ae_assert(ae_false, "spchol_propagatefwd_fma: unsupported input", _state);
}
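
/*
 * spchol_propagatefwd_fma above subtracts the contribution of a computed
 * supernode from the packed RHS buffer, and runs the SIMD path only when
 * ae_cpuid() reports CPU_FMA. Supernode rows are read from rowstorage
 * starting just past the blocksize*sstride diagonal block; for each
 * off-diagonal row k the 4-wide buffer row at simdbuf[rowidx[k]*4] is
 * updated with one 256-bit fnmadd (elementwise buf -= row*x, with x
 * zero-padded to four lanes when blocksize==3), or with one 128-bit
 * fnmadd in the blocksize==2/sstride==2 case. Any other layout trips the
 * final ae_assert.
 */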

ae_bool spchol_updatekernelabc4_fma(double* rowstorage,
    ae_int_t offss,
    ae_int_t twidth,
    ae_int_t offsu,
    ae_int_t uheight,
    ae_int_t urank,
    ae_int_t urowstride,
    ae_int_t uwidth,
    const double* diagd,
    ae_int_t offsd,
    const ae_int_t* raw2smap,
    const ae_int_t* superrowidx,
    ae_int_t urbase,
    ae_state *_state)
{
    ae_int_t k;
    ae_int_t targetrow;
    ae_int_t targetcol;

    /*
     * Filter out unsupported combinations (ones that are too sparse for the non-SIMD code)
     */
    if( twidth<3||twidth>4 )
    {
        return ae_false;
    }
    if( uwidth<1||uwidth>4 )
    {
        return ae_false;
    }
    if( urank>4 )
    {
        return ae_false;
    }

    /*
     * Shift input arrays to the beginning of the working area.
     * Prepare SIMD masks
     */
    __m256i v_rankmask = _mm256_cmpgt_epi64(_mm256_set_epi64x(urank, urank, urank, urank), _mm256_set_epi64x(3, 2, 1, 0));
    double *update_storage = rowstorage+offsu;
    double *target_storage = rowstorage+offss;
    superrowidx += urbase;

    /*
     * Load head of the update matrix
     */
    __m256d v_d0123 = _mm256_maskload_pd(diagd+offsd, v_rankmask);
    __m256d u_0_0123 = _mm256_setzero_pd();
    __m256d u_1_0123 = _mm256_setzero_pd();
    __m256d u_2_0123 = _mm256_setzero_pd();
    __m256d u_3_0123 = _mm256_setzero_pd();
    for(k=0; k<=uwidth-1; k++)
    {
        targetcol = raw2smap[superrowidx[k]];
        if( targetcol==0 )
            u_0_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
        if( targetcol==1 )
            u_1_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
        if( targetcol==2 )
            u_2_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
        if( targetcol==3 )
            u_3_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
    }

    /*
     * Transpose head
     */
    __m256d u01_lo = _mm256_unpacklo_pd(u_0_0123,u_1_0123);
    __m256d u01_hi = _mm256_unpackhi_pd(u_0_0123,u_1_0123);
    __m256d u23_lo = _mm256_unpacklo_pd(u_2_0123,u_3_0123);
    __m256d u23_hi = _mm256_unpackhi_pd(u_2_0123,u_3_0123);
    __m256d u_0123_0 = _mm256_permute2f128_pd(u01_lo, u23_lo, 0x20);
    __m256d u_0123_1 = _mm256_permute2f128_pd(u01_hi, u23_hi, 0x20);
    __m256d u_0123_2 = _mm256_permute2f128_pd(u23_lo, u01_lo, 0x13);
    __m256d u_0123_3 = _mm256_permute2f128_pd(u23_hi, u01_hi, 0x13);

    /*
     * Run update
     */
    if( urank==1 )
    {
        for(k=0; k<=uheight-1; k++)
        {
            targetrow = raw2smap[superrowidx[k]]*4;
            double *update_row = rowstorage+offsu+k*urowstride;
            _mm256_store_pd(target_storage+targetrow,
                _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+0), u_0123_0,
                    _mm256_load_pd(target_storage+targetrow)));
        }
    }
    if( urank==2 )
    {
        for(k=0; k<=uheight-1; k++)
        {
            targetrow = raw2smap[superrowidx[k]]*4;
            double *update_row = rowstorage+offsu+k*urowstride;
            _mm256_store_pd(target_storage+targetrow,
                _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+1), u_0123_1,
                    _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+0), u_0123_0,
                        _mm256_load_pd(target_storage+targetrow))));
        }
    }
    if( urank==3 )
    {
        for(k=0; k<=uheight-1; k++)
        {
            targetrow = raw2smap[superrowidx[k]]*4;
            double *update_row = rowstorage+offsu+k*urowstride;
            _mm256_store_pd(target_storage+targetrow,
                _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+2), u_0123_2,
                    _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+1), u_0123_1,
                        _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+0), u_0123_0,
                            _mm256_load_pd(target_storage+targetrow)))));
        }
    }
    if( urank==4 )
    {
        for(k=0; k<=uheight-1; k++)
        {
            targetrow = raw2smap[superrowidx[k]]*4;
            double *update_row = rowstorage+offsu+k*urowstride;
            _mm256_store_pd(target_storage+targetrow,
                _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+3), u_0123_3,
                    _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+2), u_0123_2,
                        _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+1), u_0123_1,
                            _mm256_fnmadd_pd(_mm256_broadcast_sd(update_row+0), u_0123_0,
                                _mm256_load_pd(target_storage+targetrow))))));
        }
    }
    return ae_true;
}
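
/*
 * The 4x4 transpose in spchol_updatekernelabc4_fma above (and in
 * spchol_updatekernel4444_fma below) is the standard AVX idiom:
 * unpacklo/unpackhi interleave pairs of rows within each 128-bit half, and
 * _mm256_permute2f128_pd with selectors 0x20/0x13 recombines the halves,
 * turning the four row vectors u_k_0123 into four column vectors u_0123_k.
 * The _mm256_maskload_pd calls with v_rankmask zero-fill lanes with index
 * at or above urank, so ranks 1..4 can share the same 4-lane arithmetic.
 */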

ae_bool spchol_updatekernel4444_fma(
    double* rowstorage,
    ae_int_t offss,
    ae_int_t sheight,
    ae_int_t offsu,
    ae_int_t uheight,
    const double* diagd,
    ae_int_t offsd,
    const ae_int_t* raw2smap,
    const ae_int_t* superrowidx,
    ae_int_t urbase,
    ae_state *_state)
{
    ae_int_t k;
    ae_int_t targetrow;
    ae_int_t offsk;
    __m256d v_negd_u0, v_negd_u1, v_negd_u2, v_negd_u3, v_negd;
    __m256d v_w0, v_w1, v_w2, v_w3, u01_lo, u01_hi, u23_lo, u23_hi;

    /*
     * Compute W = -D*transpose(U[0:3])
     */
    v_negd = _mm256_mul_pd(_mm256_loadu_pd(diagd+offsd),_mm256_set1_pd(-1.0));
    v_negd_u0 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+0*4),v_negd);
    v_negd_u1 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+1*4),v_negd);
    v_negd_u2 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+2*4),v_negd);
    v_negd_u3 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+3*4),v_negd);
    u01_lo = _mm256_unpacklo_pd(v_negd_u0,v_negd_u1);
    u01_hi = _mm256_unpackhi_pd(v_negd_u0,v_negd_u1);
    u23_lo = _mm256_unpacklo_pd(v_negd_u2,v_negd_u3);
    u23_hi = _mm256_unpackhi_pd(v_negd_u2,v_negd_u3);
    v_w0 = _mm256_permute2f128_pd(u01_lo, u23_lo, 0x20);
    v_w1 = _mm256_permute2f128_pd(u01_hi, u23_hi, 0x20);
    v_w2 = _mm256_permute2f128_pd(u23_lo, u01_lo, 0x13);
    v_w3 = _mm256_permute2f128_pd(u23_hi, u01_hi, 0x13);

    //
    // Compute update S := S + row_scatter(U*W)
    //
    if( sheight==uheight )
    {
        /*
         * No row scatter, the most efficient code
         */
        for(k=0; k<uheight; k++)
        {
            targetrow = offss+k*4;
            offsk = offsu+k*4;
            _mm256_store_pd(rowstorage+targetrow,
                _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+3),v_w3,
                    _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+2),v_w2,
                        _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+1),v_w1,
                            _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+0),v_w0,
                                _mm256_load_pd(rowstorage+targetrow))))));
        }
    }
    else
    {
        /*
         * Row scatter is performed, less efficient code using double mapping to determine target row index
         */
        for(k=0; k<=uheight-1; k++)
        {
            targetrow = offss+raw2smap[superrowidx[urbase+k]]*4;
            offsk = offsu+k*4;
            _mm256_store_pd(rowstorage+targetrow,
                _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+3),v_w3,
                    _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+2),v_w2,
                        _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+1),v_w1,
                            _mm256_fmadd_pd(_mm256_broadcast_sd(rowstorage+offsk+0),v_w0,
                                _mm256_load_pd(rowstorage+targetrow))))));
        }
    }
    return ae_true;
}

/* ALGLIB_NO_FAST_KERNELS, _ALGLIB_HAS_FMA_INTRINSICS */
#endif


}