alglib4 0.0.0

Files changed (46)
  1. checksums.yaml +7 -0
  2. data/README.md +47 -0
  3. data/ext/alglib/alglib.cpp +537 -0
  4. data/ext/alglib/alglib_array_converters.cpp +86 -0
  5. data/ext/alglib/alglib_array_converters.h +15 -0
  6. data/ext/alglib/alglib_utils.cpp +10 -0
  7. data/ext/alglib/alglib_utils.h +6 -0
  8. data/ext/alglib/alglibinternal.cpp +21749 -0
  9. data/ext/alglib/alglibinternal.h +2168 -0
  10. data/ext/alglib/alglibmisc.cpp +9106 -0
  11. data/ext/alglib/alglibmisc.h +2114 -0
  12. data/ext/alglib/ap.cpp +20094 -0
  13. data/ext/alglib/ap.h +7244 -0
  14. data/ext/alglib/dataanalysis.cpp +52588 -0
  15. data/ext/alglib/dataanalysis.h +10601 -0
  16. data/ext/alglib/diffequations.cpp +1342 -0
  17. data/ext/alglib/diffequations.h +282 -0
  18. data/ext/alglib/extconf.rb +5 -0
  19. data/ext/alglib/fasttransforms.cpp +4696 -0
  20. data/ext/alglib/fasttransforms.h +1018 -0
  21. data/ext/alglib/integration.cpp +4249 -0
  22. data/ext/alglib/integration.h +869 -0
  23. data/ext/alglib/interpolation.cpp +74502 -0
  24. data/ext/alglib/interpolation.h +12264 -0
  25. data/ext/alglib/kernels_avx2.cpp +2171 -0
  26. data/ext/alglib/kernels_avx2.h +201 -0
  27. data/ext/alglib/kernels_fma.cpp +1065 -0
  28. data/ext/alglib/kernels_fma.h +137 -0
  29. data/ext/alglib/kernels_sse2.cpp +735 -0
  30. data/ext/alglib/kernels_sse2.h +100 -0
  31. data/ext/alglib/linalg.cpp +65182 -0
  32. data/ext/alglib/linalg.h +9927 -0
  33. data/ext/alglib/optimization.cpp +135331 -0
  34. data/ext/alglib/optimization.h +19235 -0
  35. data/ext/alglib/solvers.cpp +20488 -0
  36. data/ext/alglib/solvers.h +4781 -0
  37. data/ext/alglib/specialfunctions.cpp +10672 -0
  38. data/ext/alglib/specialfunctions.h +2305 -0
  39. data/ext/alglib/statistics.cpp +19791 -0
  40. data/ext/alglib/statistics.h +1359 -0
  41. data/ext/alglib/stdafx.h +2 -0
  42. data/gpl2.txt +339 -0
  43. data/gpl3.txt +674 -0
  44. data/lib/alglib/version.rb +3 -0
  45. data/lib/alglib.rb +4 -0
  46. metadata +101 -0
data/ext/alglib/kernels_avx2.cpp
@@ -0,0 +1,2171 @@
1
+ /*************************************************************************
2
+ ALGLIB 4.04.0 (source code generated 2024-12-21)
3
+ Copyright (c) Sergey Bochkanov (ALGLIB project).
4
+
5
+ >>> SOURCE LICENSE >>>
6
+ This program is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation (www.fsf.org); either version 2 of the
9
+ License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ A copy of the GNU General Public License is available at
17
+ http://www.fsf.org/licensing/licenses
18
+ >>> END OF LICENSE >>>
19
+ *************************************************************************/
20
+ #ifdef _MSC_VER
21
+ #define _CRT_SECURE_NO_WARNINGS
22
+ #endif
23
+ #include "stdafx.h"
24
+
25
+ //
26
+ // Must be defined before we include kernel header
27
+ //
28
+ #define _ALGLIB_IMPL_DEFINES
29
+ #define _ALGLIB_INTEGRITY_CHECKS_ONCE
30
+
31
+ #include "kernels_avx2.h"
32
+
33
+ // disable some irrelevant warnings
34
+ #if (AE_COMPILER==AE_MSVC) && !defined(AE_ALL_WARNINGS)
35
+ #pragma warning(disable:4100)
36
+ #pragma warning(disable:4127)
37
+ #pragma warning(disable:4611)
38
+ #pragma warning(disable:4702)
39
+ #pragma warning(disable:4996)
40
+ #endif
41
+
42
+ namespace alglib_impl
43
+ {
44
+
45
+
46
+
47
+ #if !defined(ALGLIB_NO_FAST_KERNELS) && defined(_ALGLIB_HAS_AVX2_INTRINSICS)
48
+
49
+ double rdotv_avx2(const ae_int_t n,
50
+ /* Real */ const double* __restrict x,
51
+ /* Real */ const double* __restrict y,
52
+ const ae_state* __restrict _state)
53
+ {
54
+ ae_int_t i;
55
+
56
+ const ae_int_t avx2len = n>>2;
57
+ const ae_int_t fmaLen = (avx2len >> 2) << 2;
58
+ const __m256d* __restrict pX = (const __m256d*)(x);
59
+ const __m256d* __restrict pY = (const __m256d*)(y);
60
+ __m256d ans;
61
+ if(fmaLen >= 4) {
62
+ __m256d fmaUnroll[4];
63
+ fmaUnroll[0] = _mm256_mul_pd(pX[0], pY[0]);
64
+ fmaUnroll[1] = _mm256_mul_pd(pX[1], pY[1]);
65
+ fmaUnroll[2] = _mm256_mul_pd(pX[2], pY[2]);
66
+ fmaUnroll[3] = _mm256_mul_pd(pX[3], pY[3]);
67
+ for(i=4; i<fmaLen; i+=4) {
68
+ fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pY[i]), fmaUnroll[0]);
69
+ fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pY[i+1]), fmaUnroll[1]);
70
+ fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pY[i+2]), fmaUnroll[2]);
71
+ fmaUnroll[3] = _mm256_add_pd(_mm256_mul_pd(pX[i+3], pY[i+3]), fmaUnroll[3]);
72
+ }
73
+ switch(avx2len-fmaLen) {
74
+ case 3:
75
+ fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pY[i+2]), fmaUnroll[2]);
76
+ case 2:
77
+ fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pY[i+1]), fmaUnroll[1]);
78
+ case 1:
79
+ fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pY[i]), fmaUnroll[0]);
80
+ }
81
+ ans = _mm256_add_pd(
82
+ _mm256_add_pd(fmaUnroll[0], fmaUnroll[1]),
83
+ _mm256_add_pd(fmaUnroll[2], fmaUnroll[3]));
84
+ }
85
+ else {
86
+ ans = _mm256_setzero_pd();
87
+ switch(avx2len) {
88
+ case 3:
89
+ ans = _mm256_mul_pd(pX[2], pY[2]);
90
+ case 2:
91
+ ans = _mm256_add_pd(_mm256_mul_pd(pX[1], pY[1]), ans);
92
+ case 1:
93
+ ans = _mm256_add_pd(_mm256_mul_pd(pX[0], pY[0]), ans);
94
+ }
95
+ }
96
+ const __m128d s = _mm_add_pd(_mm256_extractf128_pd(ans, 0), _mm256_extractf128_pd(ans, 1));
97
+ const double *pComps = (const double*)&s;
98
+ double dot = pComps[0] + pComps[1];
99
+ const ae_int_t tail = avx2len<<2;
100
+ switch(n-tail) {
101
+ case 1: {
102
+ dot += x[tail]*y[tail];
103
+ break;
104
+ }
105
+ case 2: {
106
+ dot += x[tail+0]*y[tail+0];
107
+ dot += x[tail+1]*y[tail+1];
108
+ break;
109
+ }
110
+ case 3: {
111
+ dot += x[tail+0]*y[tail+0];
112
+ dot += x[tail+1]*y[tail+1];
113
+ dot += x[tail+2]*y[tail+2];
114
+ break;
115
+ }
116
+ }
117
+ return dot;
118
+ }
119
+
120
+ double rdotv2_avx2(const ae_int_t n,
121
+ /* Real */ const double* __restrict x,
122
+ const ae_state* __restrict _state)
123
+ {
124
+ ae_int_t i;
125
+
126
+ const ae_int_t avx2len = n>>2;
127
+ const ae_int_t fmaLen = (avx2len >> 2) << 2;
128
+ const __m256d* __restrict pX = (const __m256d*)(x);
129
+ __m256d ans;
130
+ if(fmaLen >= 4) {
131
+ // TODO: this can be unrolled further to 8 accumulators because AVX2 provides 16 ymm registers
132
+ __m256d fmaUnroll[4];
133
+ fmaUnroll[0] = _mm256_mul_pd(pX[0], pX[0]);
134
+ fmaUnroll[1] = _mm256_mul_pd(pX[1], pX[1]);
135
+ fmaUnroll[2] = _mm256_mul_pd(pX[2], pX[2]);
136
+ fmaUnroll[3] = _mm256_mul_pd(pX[3], pX[3]);
137
+ for(i=4; i<fmaLen; i+=4) {
138
+ fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pX[i]), fmaUnroll[0]);
139
+ fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pX[i+1]), fmaUnroll[1]);
140
+ fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pX[i+2]), fmaUnroll[2]);
141
+ fmaUnroll[3] = _mm256_add_pd(_mm256_mul_pd(pX[i+3], pX[i+3]), fmaUnroll[3]);
142
+ }
143
+ switch(avx2len-fmaLen) {
144
+ case 3:
145
+ fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pX[i+2]), fmaUnroll[2]);
146
+ case 2:
147
+ fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pX[i+1]), fmaUnroll[1]);
148
+ case 1:
149
+ fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pX[i]), fmaUnroll[0]);
150
+ }
151
+ ans = _mm256_add_pd(
152
+ _mm256_add_pd(fmaUnroll[0], fmaUnroll[1]),
153
+ _mm256_add_pd(fmaUnroll[2], fmaUnroll[3]));
154
+ }
155
+ else {
156
+ ans = _mm256_setzero_pd();
157
+ switch(avx2len) {
158
+ case 3:
159
+ ans = _mm256_mul_pd(pX[2], pX[2]);
160
+ case 2:
161
+ ans = _mm256_add_pd(_mm256_mul_pd(pX[1], pX[1]), ans);
162
+ case 1:
163
+ ans = _mm256_add_pd(_mm256_mul_pd(pX[0], pX[0]), ans);
164
+ }
165
+ }
166
+ const __m128d s = _mm_add_pd(_mm256_extractf128_pd(ans, 0), _mm256_extractf128_pd(ans, 1));
167
+ const double *pComps = (const double*)&s;
168
+ double dot = pComps[0] + pComps[1];
169
+ const ae_int_t tail = avx2len<<2;
170
+ switch(n-tail) {
171
+ case 1: {
172
+ dot += x[tail]*x[tail];
173
+ break;
174
+ }
175
+ case 2: {
176
+ dot += x[tail+0]*x[tail+0];
177
+ dot += x[tail+1]*x[tail+1];
178
+ break;
179
+ }
180
+ case 3: {
181
+ dot += x[tail+0]*x[tail+0];
182
+ dot += x[tail+1]*x[tail+1];
183
+ dot += x[tail+2]*x[tail+2];
184
+ break;
185
+ }
186
+ }
187
+ return dot;
188
+ }
189
+
190
+ void rcopyv_avx2(ae_int_t n,
191
+ /* Real */ const double* __restrict x,
192
+ /* Real */ double* __restrict y,
193
+ ae_state* __restrict _state)
194
+ {
195
+ ae_int_t i;
196
+
197
+ const ae_int_t avx2len = n>>2;
198
+ const ae_int_t tail = avx2len<<2;
199
+ const __m256d* __restrict pSrc = (const __m256d*)(x);
200
+ __m256d* __restrict pDest = (__m256d*)(y);
201
+ for(i=0; i<avx2len; i++) {
202
+ pDest[i] = pSrc[i];
203
+ }
204
+ switch(n-tail) {
205
+ case 1:
206
+ *(double*)(pDest+i) = *(const double*)(pSrc+i);
207
+ break;
208
+ case 2:
209
+ *(__m128d*)(pDest+i) = *(const __m128d*)(pSrc+i);
210
+ break;
211
+ case 3:
212
+ *(__m128d*)(pDest+i) = *(const __m128d*)(pSrc+i);
213
+ y[tail+2] = x[tail+2];
214
+ break;
215
+ }
216
+ }
217
+
218
+ void rcopymulv_avx2(const ae_int_t n,
219
+ const double v,
220
+ /* Real */ const double* __restrict x,
221
+ /* Real */ double* __restrict y,
222
+ const ae_state* __restrict _state)
223
+ {
224
+ ae_int_t i;
225
+
226
+ const ae_int_t avx2len = n>>2;
227
+ const ae_int_t tail = avx2len<<2;
228
+ const __m256d* __restrict pSrc = (const __m256d*)(x);
229
+ __m256d* __restrict pDest = (__m256d*)(y);
230
+ const __m256d avx2v = _mm256_set1_pd(v);
231
+ for(i=0; i<avx2len; i++) {
232
+ pDest[i] = _mm256_mul_pd(avx2v, pSrc[i]);
233
+ }
234
+ switch(n-tail) {
235
+ case 1:
236
+ *(double*)(pDest+i) = v * (*(const double*)(pSrc+i));
237
+ break;
238
+ case 2:
239
+ *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pSrc+i));
240
+ break;
241
+ case 3:
242
+ *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pSrc+i));
243
+ y[tail+2] = v*x[tail+2];
244
+ break;
245
+ }
246
+ }
247
+
248
+ void icopyv_avx2(const ae_int_t n, const ae_int_t* __restrict x,
249
+ ae_int_t* __restrict y, ae_state* __restrict _state)
250
+ {
251
+ const ae_int_t tail = (n*sizeof(ae_int_t)) & 31;
252
+ const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
253
+ __m256i *__restrict pDest = (__m256i*)y;
254
+ const __m256i* __restrict pSrc = (const __m256i*)x;
255
+ const ae_int_t nVec = even>>5;
256
+ ae_int_t i;
257
+ for(i=0; i<nVec; i++) {
258
+ pDest[i] = pSrc[i];
259
+ }
260
+ i = even/sizeof(ae_int_t);
261
+ if(tail & 16) {
262
+ *(__m128i*)(y+i) = *(const __m128i*)(x+i);
263
+ i += 16/sizeof(ae_int_t);
264
+ }
265
+ if(tail & 8) {
266
+ *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
267
+ i += 8/sizeof(ae_int_t);
268
+ }
269
+ if(tail & 4) {
270
+ *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
271
+ }
272
+ }
273
+
274
+ void bcopyv_avx2(const ae_int_t n, const ae_bool* __restrict x,
275
+ ae_bool* __restrict y, ae_state* __restrict _state)
276
+ {
277
+ const ae_int_t tail = n & 31;
278
+ const ae_int_t even = n-tail;
279
+ __m256i *__restrict pDest = (__m256i*)y;
280
+ const __m256i* __restrict pSrc = (const __m256i*)x;
281
+ const ae_int_t nVec = even>>5;
282
+ ae_int_t i;
283
+ for(i=0; i<nVec; i++) {
284
+ pDest[i] = pSrc[i];
285
+ }
286
+ i = even;
287
+ if(tail & 16) {
288
+ *(__m128i*)(y+i) = *(const __m128i*)(x+i);
289
+ i += 16;
290
+ }
291
+ if(tail & 8) {
292
+ *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
293
+ i += 8;
294
+ }
295
+ if(tail & 4) {
296
+ *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
297
+ i += 4;
298
+ }
299
+ if(tail & 2) {
300
+ *(y+i+0) = *(x+i+0);
301
+ *(y+i+1) = *(x+i+1);
302
+ i += 2;
303
+ }
304
+ if(tail & 1) {
305
+ *(y+i) = *(x+i);
306
+ }
307
+ }
308
+
309
+ void rsetv_avx2(const ae_int_t n,
310
+ const double v,
311
+ /* Real */ double* __restrict x,
312
+ const ae_state* __restrict _state)
313
+ {
314
+ ae_int_t i;
315
+
316
+ const ae_int_t avx2len = n>>2;
317
+ __m256d* __restrict pDest = (__m256d*)(x);
318
+ const __m256d avx2v = _mm256_set1_pd(v);
319
+ for(i=0; i<avx2len; i++) {
320
+ pDest[i] = avx2v;
321
+ }
322
+ const ae_int_t tail = avx2len<<2;
323
+ switch(n-tail) {
324
+ case 1:
325
+ *(double*)(pDest+i) = v;
326
+ break;
327
+ case 2:
328
+ *(__m128d*)(pDest+i) = _mm256_extractf128_pd(avx2v, 0);
329
+ break;
330
+ case 3:
331
+ *(__m128d*)(pDest+i) = _mm256_extractf128_pd(avx2v, 0);
332
+ x[tail+2] = v;
333
+ break;
334
+ }
335
+ }
336
+
337
+ void rsetvx_avx2(const ae_int_t n, const double v, double* __restrict x,
338
+ const ae_state* __restrict _state)
339
+ {
340
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
341
+ if( n<=4 )
342
+ {
343
+ ae_int_t j;
344
+ for(j=0; j<=n-1; j++)
345
+ x[j] = v;
346
+ return;
347
+ }
348
+ switch(unal)
349
+ {
350
+ case 0:
351
+ rsetv_avx2(n, v, x, _state);
352
+ return;
353
+ case 8:
354
+ x[2] = v;
355
+ case 16:
356
+ x[1] = v;
357
+ case 24:
358
+ {
359
+ x[0] = v;
360
+ const ptrdiff_t nDone = 4-(unal>>3);
361
+ rsetv_avx2(n-nDone, v, x+nDone, _state);
362
+ return;
363
+ }
364
+ }
365
+ }
366
+
367
+ void isetv_avx2(const ae_int_t n, const ae_int_t v,
368
+ ae_int_t* __restrict x, ae_state* __restrict _state)
369
+ {
370
+ const ae_int_t tail = (n*sizeof(ae_int_t)) & 31;
371
+ const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
372
+ __m256i *__restrict pDest = (__m256i*)x;
373
+ const __m256i avx2v = ((sizeof(v) == 4) ? _mm256_set1_epi32((ae_int32_t)v) : _mm256_set1_epi64x(v));
374
+ const ae_int_t nVec = even>>5;
375
+ ae_int_t i;
376
+ for(i=0; i<nVec; i++) {
377
+ pDest[i] = avx2v;
378
+ }
379
+ memmove(pDest+i, &avx2v, tail);
380
+ }
381
+
382
+ void bsetv_avx2(const ae_int_t n, const ae_bool v, ae_bool* __restrict x,
383
+ ae_state* __restrict _state)
384
+ {
385
+ const ae_int_t tail = n & 31;
386
+ const ae_int_t even = n-tail;
387
+ __m256i *__restrict pDest = (__m256i*)x;
388
+ const __m256i avx2v = _mm256_set1_epi8(v);
389
+ const ae_int_t nVec = even>>5;
390
+ ae_int_t i;
391
+ for(i=0; i<nVec; i++) {
392
+ pDest[i] = avx2v;
393
+ }
394
+ /* _mm256_extracti128_si256() has too high a latency on recent processors (Skylake+), so the tail is handled with memset() instead */
395
+ memset(x+even, v, tail);
396
+ }
397
+
398
+ void rmulv_avx2(const ae_int_t n, const double v, double* __restrict x,
399
+ const ae_state* __restrict _state)
400
+ {
401
+ ae_int_t i;
402
+
403
+ const ae_int_t avx2len = n>>2;
404
+ __m256d* __restrict pDest = (__m256d*)(x);
405
+ const __m256d avx2v = _mm256_set1_pd(v);
406
+ for(i=0; i<avx2len; i++) {
407
+ pDest[i] = _mm256_mul_pd(avx2v, pDest[i]);
408
+ }
409
+ const ae_int_t tail = avx2len<<2;
410
+ switch(n-tail) {
411
+ case 1:
412
+ *(double*)(pDest+i) = v * (*(const double*)(pDest+i));
413
+ break;
414
+ case 2:
415
+ *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pDest+i));
416
+ break;
417
+ case 3:
418
+ *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pDest+i));
419
+ x[tail+2] *= v;
420
+ break;
421
+ }
422
+ }
423
+
424
+ void rsqrtv_avx2(const ae_int_t n, double* __restrict x, const ae_state* __restrict _state)
425
+ {
426
+ ae_int_t i;
427
+
428
+ const ae_int_t avx2len = n>>2;
429
+ const ae_int_t tail = avx2len<<2;
430
+ __m256d* __restrict pDest = (__m256d*)(x);
431
+ for(i=0; i<avx2len; i++)
432
+ pDest[i] = _mm256_sqrt_pd(pDest[i]);
433
+ for(i=tail; i<n; i++)
434
+ x[i] = sqrt(x[i]);
435
+ }
436
+
437
+ void rmulvx_avx2(const ae_int_t n, const double v, double* __restrict x,
438
+ const ae_state* __restrict _state)
439
+ {
440
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
441
+ if( n<=4 )
442
+ {
443
+ ae_int_t i;
444
+ for(i=0; i<=n-1; i++)
445
+ x[i] *= v;
446
+ return;
447
+ }
448
+ switch(unal) {
449
+ case 0:
450
+ rmulv_avx2(n, v, x, _state);
451
+ return;
452
+ case 8:
453
+ x[2] = v*x[2];
454
+ case 16:
455
+ x[1] = v*x[1];
456
+ case 24: {
457
+ x[0] = v*x[0];
458
+ const ptrdiff_t nDone = 4-(unal>>3);
459
+ rmulv_avx2(n-nDone, v, x+nDone, _state);
460
+ return;
461
+ }
462
+ }
463
+ }
464
+
465
+ void raddv_avx2(const ae_int_t n,
466
+ const double alpha,
467
+ /* Real */ const double* __restrict y,
468
+ /* Real */ double* __restrict x,
469
+ const ae_state* __restrict _state)
470
+ {
471
+ ae_int_t i;
472
+
473
+ const ae_int_t avx2len = n>>2;
474
+ const __m256d* __restrict pSrc = (const __m256d*)(y);
475
+ __m256d* __restrict pDest = (__m256d*)(x);
476
+ const __m256d avx2alpha = _mm256_set1_pd(alpha);
477
+ for(i=0; i<avx2len; i++) {
478
+ pDest[i] = _mm256_add_pd(_mm256_mul_pd(avx2alpha, pSrc[i]), pDest[i]);
479
+ }
480
+ const ae_int_t tail = avx2len<<2;
481
+ switch(n-tail) {
482
+ case 1:
483
+ *(double*)(pDest+i) = alpha * (*(const double*)(pSrc+i))
484
+ + (*(const double*)(pDest+i));
485
+ break;
486
+ case 2:
487
+ *(__m128d*)(pDest+i) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0), *(const __m128d*)(pSrc+i)),*(const __m128d*)(pDest+i));
488
+ break;
489
+ case 3:
490
+ *(__m128d*)(pDest+i) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0), *(const __m128d*)(pSrc+i)),*(const __m128d*)(pDest+i));
491
+ x[tail+2] += alpha*y[tail+2];
492
+ break;
493
+ }
494
+ }
495
+
496
+ void raddvx_avx_xaligned(const ae_int_t n, const double alpha, const double* __restrict y, double* __restrict x, ae_state *_state)
497
+ {
498
+ ae_int_t i;
499
+ const ae_int_t vecLen = (n>>2)<<2;
500
+ const __m256d avx2alpha = _mm256_set1_pd(alpha);
501
+ __m256d* __restrict pDest = (__m256d*)x;
502
+ for(i=0; i<vecLen; i+=4)
503
+ {
504
+ const ae_int_t iDest = i>>2;
505
+ pDest[iDest] = _mm256_add_pd(_mm256_mul_pd(avx2alpha, _mm256_loadu_pd(y+i)), pDest[iDest]);
506
+ }
507
+ switch(n-vecLen) {
508
+ case 1:
509
+ x[i] += alpha*y[i];
510
+ break;
511
+ case 2: {
512
+ const ae_int_t iDest = i>>2;
513
+ *(__m128d*)(pDest+iDest) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0),_mm_loadu_pd(y+i)),*(const __m128d*)(pDest+iDest));
514
+ break;
515
+ }
516
+ case 3:
517
+ {
518
+ const ae_int_t iDest = i>>2;
519
+ *(__m128d*)(pDest+iDest) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0),_mm_loadu_pd(y+i)),*(const __m128d*)(pDest+iDest));
520
+ x[i+2] += alpha*y[i+2];
521
+ break;
522
+ }
523
+ }
524
+ }
525
+
526
+ void raddvx_avx2(const ae_int_t n, const double alpha, const double* __restrict y,
527
+ double* __restrict x, ae_state *_state)
528
+ {
529
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
530
+ if( n<=4 )
531
+ {
532
+ ae_int_t i;
533
+ for(i=0; i<=n-1; i++)
534
+ x[i] += alpha*y[i];
535
+ return;
536
+ }
537
+ switch(unal)
538
+ {
539
+ case 0:
540
+ raddvx_avx_xaligned(n, alpha, y, x, _state);
541
+ return;
542
+ case 8:
543
+ x[2] += alpha*y[2];
544
+ case 16:
545
+ x[1] += alpha*y[1];
546
+ case 24:
547
+ {
548
+ x[0] += alpha*y[0];
549
+ const ptrdiff_t nDone = 4-(unal>>3);
550
+ raddvx_avx_xaligned(n-nDone, alpha, y+nDone, x+nDone, _state);
551
+ return;
552
+ }
553
+ }
554
+ }
555
+
556
+ void rmergemulv_avx2(ae_int_t n,
557
+ /* Real */ const double* __restrict y,
558
+ /* Real */ double* __restrict x,
559
+ const ae_state* __restrict _state)
560
+ {
561
+ ae_int_t i;
562
+
563
+ const ae_int_t avx2len = n>>2;
564
+ const __m256d* __restrict pSrc = (const __m256d*)(y);
565
+ __m256d* __restrict pDest = (__m256d*)(x);
566
+ for(i=0; i<avx2len; i++) {
567
+ pDest[i] = _mm256_mul_pd(pSrc[i], pDest[i]);
568
+ }
569
+ const ae_int_t tail = avx2len<<2;
570
+ switch(n-tail) {
571
+ case 1:
572
+ *(double*)(pDest+i) = *(const double*)(pSrc+i)
573
+ * (*(const double*)(pDest+i));
574
+ break;
575
+ case 2:
576
+ *(__m128d*)(pDest+i) = _mm_mul_pd(
577
+ *(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
578
+ break;
579
+ case 3: {
580
+ *(__m128d*)(pDest+i) = _mm_mul_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
581
+ ((double*)(pDest+i))[2] *= ((const double*)(pSrc+i))[2];
582
+ break;
583
+ }
584
+ }
585
+ }
586
+
587
+ void rmergedivv_avx2(ae_int_t n,
588
+ /* Real */ const double* __restrict y,
589
+ /* Real */ double* __restrict x,
590
+ const ae_state* __restrict _state)
591
+ {
592
+ ae_int_t i;
593
+
594
+ const ae_int_t avx2len = n>>2;
595
+ const __m256d* __restrict pSrc = (const __m256d*)(y);
596
+ __m256d* __restrict pDest = (__m256d*)(x);
597
+ for(i=0; i<avx2len; i++) {
598
+ pDest[i] = _mm256_div_pd(pDest[i], pSrc[i]);
599
+ }
600
+ const ae_int_t tail = avx2len<<2;
601
+ switch(n-tail) {
602
+ case 1:
603
+ *(double*)(pDest+i) = (*(const double*)(pDest+i)) / (*(const double*)(pSrc+i));
604
+ break;
605
+ case 2:
606
+ *(__m128d*)(pDest+i) = _mm_div_pd(*(const __m128d*)(pDest+i), *(const __m128d*)(pSrc+i));
607
+ break;
608
+ case 3: {
609
+ *(__m128d*)(pDest+i) = _mm_div_pd(*(const __m128d*)(pDest+i), *(const __m128d*)(pSrc+i));
610
+ ((double*)(pDest+i))[2] /= ((const double*)(pSrc+i))[2];
611
+ break;
612
+ }
613
+ }
614
+ }
615
+
616
+ void rmergemaxv_avx2(ae_int_t n,
617
+ /* Real */ const double* __restrict y,
618
+ /* Real */ double* __restrict x,
619
+ ae_state* __restrict _state)
620
+ {
621
+ ae_int_t i;
622
+
623
+ const ae_int_t avx2len = n>>2;
624
+ const __m256d* __restrict pSrc = (const __m256d*)(y);
625
+ __m256d* __restrict pDest = (__m256d*)(x);
626
+ for(i=0; i<avx2len; i++) {
627
+ pDest[i] = _mm256_max_pd(pSrc[i], pDest[i]);
628
+ }
629
+ const ae_int_t tail = avx2len<<2;
630
+ switch(n-tail) {
631
+ case 1:
632
+ *(double*)(pDest+i) = *(const double*)(pSrc+i)>*(const double*)(pDest+i) ? *(const double*)(pSrc+i) : *(const double*)(pDest+i);
633
+ break;
634
+ case 2:
635
+ *(__m128d*)(pDest+i) = _mm_max_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
636
+ break;
637
+ case 3:
638
+ {
639
+ double s2 = ((const double*)(pSrc+i))[2];
640
+ double *d2 = ((double*)(pDest+i))+2;
641
+ *(__m128d*)(pDest+i) = _mm_max_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
642
+ *d2 = s2>*d2 ? s2 : *d2;
643
+ break;
644
+ }
645
+ }
646
+ }
647
+
648
+ void rmergeminv_avx2(ae_int_t n,
649
+ /* Real */ const double* __restrict y,
650
+ /* Real */ double* __restrict x,
651
+ ae_state* __restrict _state)
652
+ {
653
+ ae_int_t i;
654
+
655
+ const ae_int_t avx2len = n>>2;
656
+ const __m256d* __restrict pSrc = (const __m256d*)(y);
657
+ __m256d* __restrict pDest = (__m256d*)(x);
658
+ for(i=0; i<avx2len; i++) {
659
+ pDest[i] = _mm256_min_pd(pSrc[i], pDest[i]);
660
+ }
661
+ const ae_int_t tail = avx2len<<2;
662
+ switch(n-tail) {
663
+ case 1:
664
+ *(double*)(pDest+i) = ae_minreal(*(const double*)(pSrc+i),
665
+ *(const double*)(pDest+i), _state);
666
+ break;
667
+ case 2:
668
+ *(__m128d*)(pDest+i) = _mm_min_pd(
669
+ *(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
670
+ break;
671
+ case 3: {
672
+ double s2 = ((const double*)(pSrc+i))[2];
673
+ double *d2 = ((double*)(pDest+i))+2;
674
+ *(__m128d*)(pDest+i) = _mm_min_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
675
+ *d2 = s2<*d2 ? s2 : *d2;
676
+ break;
677
+ }
678
+ }
679
+ }
680
+
681
+ double rmaxv_avx2(ae_int_t n, /* Real */ const double* __restrict x, ae_state* __restrict _state)
682
+ {
683
+ ae_int_t i;
684
+ const ae_int_t avx2len = n>>2;
685
+ const ae_int_t tail = avx2len<<2;
686
+ const __m256d* __restrict pSrc = (const __m256d*)(x);
687
+ if( n<=4 )
688
+ {
689
+ double result;
690
+ if(n == 0)
691
+ return 0.0;
692
+ result = x[0];
693
+ for(i=1; i<=n-1; i++)
694
+ {
695
+ double v = x[i];
696
+ if( v>result )
697
+ result = v;
698
+ }
699
+ return result;
700
+ }
701
+ __m256d curMax = pSrc[0];
702
+ for(i=1; i<avx2len; i++) {
703
+ curMax = _mm256_max_pd(curMax, pSrc[i]);
704
+ }
705
+ const __m128d sseMax = _mm_max_pd(_mm256_extractf128_pd(curMax, 0), _mm256_extractf128_pd(curMax, 1));
706
+ const double *pComps = (const double *)&sseMax;
707
+ double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
708
+ const double *p_tail = (const double*)(pSrc+i);
709
+ switch(n-tail)
710
+ {
711
+ case 1:
712
+ {
713
+ dMax = p_tail[0]>dMax ? p_tail[0] : dMax;
714
+ break;
715
+ }
716
+ case 2: {
717
+ dMax = p_tail[0]>dMax ? p_tail[0] : dMax;
718
+ dMax = p_tail[1]>dMax ? p_tail[1] : dMax;
719
+ break;
720
+ }
721
+ case 3: {
722
+ dMax = p_tail[0]>dMax ? p_tail[0] : dMax;
723
+ dMax = p_tail[1]>dMax ? p_tail[1] : dMax;
724
+ dMax = p_tail[2]>dMax ? p_tail[2] : dMax;
725
+ break;
726
+ }
727
+ }
728
+ return dMax;
729
+ }
730
+
731
+ double rmaxabsv_avx2(ae_int_t n, /* Real */ const double* __restrict x, ae_state* __restrict _state)
732
+ {
733
+ const __m256d signMask = _mm256_set1_pd(-0.); // -0.0 has only the sign bit (bit 63) set
734
+ const ae_int_t avx2len = n>>2;
735
+ const __m256d* __restrict pSrc = (const __m256d*)(x);
736
+ if( n<=4 )
737
+ {
738
+ double result;
739
+ ae_int_t i;
740
+ result = 0;
741
+ for(i=0; i<=n-1; i++)
742
+ {
743
+ double v = fabs(x[i]);
744
+ if( v>result )
745
+ result = v;
746
+ }
747
+ return result;
748
+ }
749
+ __m256d curMax = _mm256_andnot_pd(signMask, pSrc[0]); // abs
750
+ ae_int_t i;
751
+ for(i=1; i<avx2len; i++) {
752
+ curMax = _mm256_max_pd(curMax, _mm256_andnot_pd(signMask, pSrc[i])); // abs
753
+ }
754
+ const ae_int_t tail = avx2len<<2;
755
+ const __m128d sseMax = _mm_max_pd(_mm256_extractf128_pd(curMax, 0), _mm256_extractf128_pd(curMax, 1));
756
+ const double *p_tail = (const double*)(pSrc+i);
757
+ const double *pComps = (const double *)&sseMax;
758
+ double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
759
+ switch(n-tail)
760
+ {
761
+ case 1:
762
+ {
763
+ double a0 = fabs(p_tail[0]);
764
+ dMax = a0>dMax ? a0 : dMax;
765
+ break;
766
+ }
767
+ case 2:
768
+ {
769
+ double a0 = fabs(p_tail[0]);
770
+ double a1 = fabs(p_tail[1]);
771
+ dMax = a0>dMax ? a0 : dMax;
772
+ dMax = a1>dMax ? a1 : dMax;
773
+ break;
774
+ }
775
+ case 3:
776
+ {
777
+ double a0 = fabs(p_tail[0]);
778
+ double a1 = fabs(p_tail[1]);
779
+ double a2 = fabs(p_tail[2]);
780
+ dMax = a0>dMax ? a0 : dMax;
781
+ dMax = a1>dMax ? a1 : dMax;
782
+ dMax = a2>dMax ? a2 : dMax;
783
+ break;
784
+ }
785
+ }
786
+ return dMax;
787
+ }
788
+
789
+ static void rcopyvx_avx2_xaligned(const ae_int_t n, const double* __restrict x,
790
+ double* __restrict y, ae_state *_state)
791
+ {
792
+ ae_int_t i;
793
+ const ae_int_t vecLen = (n>>2)<<2;
794
+ const __m256d* __restrict pSrc = (const __m256d*)x;
795
+ for(i=0; i<vecLen; i+=4) {
796
+ const ae_int_t iSrc = i>>2;
797
+ _mm256_storeu_pd(y+i, pSrc[iSrc]);
798
+ }
799
+ switch(n-vecLen) {
800
+ case 1:
801
+ y[i] = x[i];
802
+ break;
803
+ case 2: {
804
+ const ae_int_t iSrc = i>>2;
805
+ _mm_storeu_pd(y+i, *(const __m128d*)(pSrc+iSrc));
806
+ break;
807
+ }
808
+ case 3: {
809
+ const ae_int_t iSrc = i>>2;
810
+ const __m256d t = pSrc[iSrc];
811
+ _mm_storeu_pd(y+i, _mm256_extractf128_pd(t, 0));
812
+ y[i+2] = *(((const double*)&t)+2);
813
+ break;
814
+ }
815
+ }
816
+ }
817
+
818
+ void rcopyvx_avx2(const ae_int_t n, const double* __restrict x,
819
+ double* __restrict y, ae_state *_state)
820
+ {
821
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
822
+ if( n<=4 )
823
+ {
824
+ ae_int_t j;
825
+ for(j=0; j<n; j++)
826
+ y[j] = x[j];
827
+ return;
828
+ }
829
+ switch(unal) {
830
+ case 0:
831
+ rcopyvx_avx2_xaligned(n, x, y, _state);
832
+ return;
833
+ case 8:
834
+ y[2] = x[2];
835
+ case 16:
836
+ y[1] = x[1];
837
+ case 24: {
838
+ y[0] = x[0];
839
+ const ptrdiff_t nDone = 4-(unal>>3);
840
+ rcopyvx_avx2_xaligned(n-nDone, x+nDone, y+nDone, _state);
841
+ return;
842
+ }
843
+ }
844
+ }
845
+
846
+ static void icopyvx_avx2_xaligned(const ae_int_t n, const ae_int_t* __restrict x,
847
+ ae_int_t* __restrict y, ae_state* __restrict _state)
848
+ {
849
+ const ae_int_t tail = (n*sizeof(ae_int_t)) & 31;
850
+ const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
851
+ const __m256i* __restrict pSrc = (const __m256i*)x;
852
+ const ae_int_t nVec = even>>5;
853
+ const ae_int_t shift_by = 3-sizeof(ae_int_t)/8;
854
+ ae_int_t i;
855
+ for(i=0; i<nVec; i++) {
856
+ const ae_int_t j = i<<shift_by;
857
+ _mm256_storeu_si256((__m256i*)(y+j), pSrc[i]);
858
+ }
859
+ i = even/sizeof(ae_int_t);
860
+ if(tail & 16) {
861
+ _mm_storeu_si128((__m128i*)(y+i), *(const __m128i*)(x+i));
862
+ i += 16/sizeof(ae_int_t);
863
+ }
864
+ if(tail & 8) {
865
+ *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
866
+ i += 8/sizeof(ae_int_t);
867
+ }
868
+ if(tail & 4) {
869
+ *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
870
+ }
871
+ }
872
+
873
+ void icopyvx_avx2(const ae_int_t n, const ae_int_t* __restrict x,
874
+ ae_int_t* __restrict y, ae_state* __restrict _state)
875
+ {
876
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
877
+ if( n<=8 )
878
+ {
879
+ ae_int_t j;
880
+ for(j=0; j<=n-1; j++)
881
+ y[j] = x[j];
882
+ return;
883
+ }
884
+ if(unal == 0)
885
+ {
886
+ icopyvx_avx2_xaligned(n, x, y, _state);
887
+ return;
888
+ }
889
+ const ae_int_t offset = 32-unal;
890
+ memmove(y, x, offset);
891
+ const ae_int_t nDone = offset / sizeof(ae_int_t);
892
+ icopyvx_avx2_xaligned(n-nDone, x+nDone, y+nDone, _state);
893
+ }
894
+
895
+ void rgemv_straight_avx2(const ae_int_t m, const ae_int_t n,
896
+ const double alpha, const ae_matrix* __restrict a,
897
+ const double* __restrict x, double* __restrict y, ae_state* _state)
898
+ {
899
+ ae_int_t i;
900
+ ae_int_t j;
901
+ const __m256d* __restrict pX = (const __m256d*)x;
902
+ const ae_int_t nVec = n >> 2;
903
+ const ae_int_t nUnroll = nVec >> 3;
904
+ __m256d sum = _mm256_setzero_pd();
905
+ for(i=0; i<m; i++) {
906
+ const __m256d* __restrict pRow = (const __m256d*)a->ptr.pp_double[i];
907
+ if(nUnroll >= 1) {
908
+ __m256d u0 = _mm256_mul_pd(pRow[0], pX[0]);
909
+ __m256d u1 = _mm256_mul_pd(pRow[1], pX[1]);
910
+ __m256d u2 = _mm256_mul_pd(pRow[2], pX[2]);
911
+ __m256d u3 = _mm256_mul_pd(pRow[3], pX[3]);
912
+ __m256d u4 = _mm256_mul_pd(pRow[4], pX[4]);
913
+ __m256d u5 = _mm256_mul_pd(pRow[5], pX[5]);
914
+ __m256d u6 = _mm256_mul_pd(pRow[6], pX[6]);
915
+ __m256d u7 = _mm256_mul_pd(pRow[7], pX[7]);
916
+ for(j=1; j<nUnroll; j++) {
917
+ const ae_int_t at = j<<3;
918
+ u0 = _mm256_add_pd(u0, _mm256_mul_pd(pRow[at], pX[at]));
919
+ u1 = _mm256_add_pd(u1, _mm256_mul_pd(pRow[at+1], pX[at+1]));
920
+ u2 = _mm256_add_pd(u2, _mm256_mul_pd(pRow[at+2], pX[at+2]));
921
+ u3 = _mm256_add_pd(u3, _mm256_mul_pd(pRow[at+3], pX[at+3]));
922
+ u4 = _mm256_add_pd(u4, _mm256_mul_pd(pRow[at+4], pX[at+4]));
923
+ u5 = _mm256_add_pd(u5, _mm256_mul_pd(pRow[at+5], pX[at+5]));
924
+ u6 = _mm256_add_pd(u6, _mm256_mul_pd(pRow[at+6], pX[at+6]));
925
+ u7 = _mm256_add_pd(u7, _mm256_mul_pd(pRow[at+7], pX[at+7]));
926
+ }
927
+ const ae_int_t at = j<<3;
928
+ switch(nVec-at) {
929
+ case 7:
930
+ u6 = _mm256_add_pd(_mm256_mul_pd(pX[at+6], pRow[at+6]), u6);
931
+ case 6:
932
+ u5 = _mm256_add_pd(_mm256_mul_pd(pX[at+5], pRow[at+5]), u5);
933
+ case 5:
934
+ u4 = _mm256_add_pd(_mm256_mul_pd(pX[at+4], pRow[at+4]), u4);
935
+ case 4:
936
+ u3 = _mm256_add_pd(_mm256_mul_pd(pX[at+3], pRow[at+3]), u3);
937
+ case 3:
938
+ u2 = _mm256_add_pd(_mm256_mul_pd(pX[at+2], pRow[at+2]), u2);
939
+ case 2:
940
+ u1 = _mm256_add_pd(_mm256_mul_pd(pX[at+1], pRow[at+1]), u1);
941
+ case 1:
942
+ u0 = _mm256_add_pd(_mm256_mul_pd(pX[at+0], pRow[at+0]), u0);
943
+ }
944
+ sum = _mm256_add_pd(
945
+ _mm256_add_pd(_mm256_add_pd(u0, u1), _mm256_add_pd(u2, u3)),
946
+ _mm256_add_pd(_mm256_add_pd(u4, u5), _mm256_add_pd(u6, u7)));
947
+ }
948
+ else {
949
+ switch(nVec) {
950
+ case 0:
951
+ sum = _mm256_setzero_pd();
952
+ break;
953
+ case 1:
954
+ sum = _mm256_mul_pd(pX[0], pRow[0]);
955
+ break;
956
+ case 2:
957
+ sum = _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1]));
958
+ break;
959
+ case 3:
960
+ sum = _mm256_add_pd(
961
+ _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
962
+ _mm256_mul_pd(pX[2], pRow[2]));
963
+ break;
964
+ case 4:
965
+ sum = _mm256_add_pd(
966
+ _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
967
+ _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3])));
968
+ break;
969
+ case 5:
970
+ sum = _mm256_add_pd(
971
+ _mm256_add_pd(
972
+ _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
973
+ _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3]))),
974
+ _mm256_mul_pd(pX[4], pRow[4]));
975
+ break;
976
+ case 6:
977
+ sum = _mm256_add_pd(
978
+ _mm256_add_pd(
979
+ _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
980
+ _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3]))),
981
+ _mm256_add_pd(_mm256_mul_pd(pX[4], pRow[4]), _mm256_mul_pd(pX[5], pRow[5])));
982
+ break;
983
+ case 7:
984
+ sum = _mm256_add_pd(
985
+ _mm256_add_pd(
986
+ _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
987
+ _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3]))),
988
+ _mm256_add_pd(
989
+ _mm256_add_pd(_mm256_mul_pd(pX[4], pRow[4]), _mm256_mul_pd(pX[5], pRow[5])),
990
+ _mm256_mul_pd(pX[6], pRow[6])));
991
+ break;
992
+ }
993
+ }
994
+ const __m128d t = _mm_add_pd(_mm256_extractf128_pd(sum, 0), _mm256_extractf128_pd(sum, 1));
995
+ const double* pComps = (const double*)&t;
996
+ double ans = pComps[0] + pComps[1];
997
+ const ae_int_t tail = nVec<<2;
998
+ for(j=tail; j<n; j++) {
999
+ ans += a->ptr.pp_double[i][j] * x[j];
1000
+ }
1001
+ y[i] += alpha*ans;
1002
+ }
1003
+ }
1004
+
1005
+ void rgemv_transposed_avx2(const ae_int_t m, const ae_int_t n,
1006
+ const double alpha, const ae_matrix* __restrict a,
1007
+ const double* __restrict x, double* __restrict y, ae_state* _state)
1008
+ {
1009
+ ae_int_t i;
1010
+ ae_int_t j;
1011
+ __m256d* __restrict pY = (__m256d*)y;
1012
+ const ae_int_t nVec = m >> 2;
1013
+
1014
+ for(i=0; i<=n-1; i++)
1015
+ {
1016
+ const __m256d* __restrict pRow = (const __m256d*)a->ptr.pp_double[i];
1017
+ const double v = alpha*x[i];
1018
+ const __m256d vV = _mm256_set1_pd(v);
1019
+ for(j=0; j<nVec; j++)
1020
+ {
1021
+ pY[j] = _mm256_add_pd(_mm256_mul_pd(vV, pRow[j]), pY[j]);
1022
+ }
1023
+ const ae_int_t tail = nVec<<2;
1024
+ for(j=tail; j<m; j++) {
1025
+ y[j] += v*a->ptr.pp_double[i][j];
1026
+ }
1027
+ }
1028
+ }
1029
+
1030
+ void rgemvx_straight_avx2_xaligned(const ae_int_t m, const ae_int_t n,
1031
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
1032
+ const ae_int_t ja, const double* __restrict x,
1033
+ double* __restrict y, ae_state* _state)
1034
+ {
1035
+ ae_int_t i;
1036
+ ae_int_t j;
1037
+ const __m256d* __restrict pX = (const __m256d*)x;
1038
+ const ae_int_t nVec = n >> 2;
1039
+ const ae_int_t nUnroll = nVec >> 3;
1040
+ __m256d sum = _mm256_setzero_pd();
1041
+ for(i=0; i<m; i++) {
1042
+ const __m256d* __restrict pRow = (const __m256d*)(a->ptr.pp_double[i+ia]+ja);
1043
+ if(nUnroll >= 1) {
1044
+ __m256d u0 = _mm256_mul_pd(ULOAD256PD(pRow[0]), pX[0]);
1045
+ __m256d u1 = _mm256_mul_pd(ULOAD256PD(pRow[1]), pX[1]);
1046
+ __m256d u2 = _mm256_mul_pd(ULOAD256PD(pRow[2]), pX[2]);
1047
+ __m256d u3 = _mm256_mul_pd(ULOAD256PD(pRow[3]), pX[3]);
1048
+ __m256d u4 = _mm256_mul_pd(ULOAD256PD(pRow[4]), pX[4]);
1049
+ __m256d u5 = _mm256_mul_pd(ULOAD256PD(pRow[5]), pX[5]);
1050
+ __m256d u6 = _mm256_mul_pd(ULOAD256PD(pRow[6]), pX[6]);
1051
+ __m256d u7 = _mm256_mul_pd(ULOAD256PD(pRow[7]), pX[7]);
1052
+ for(j=1; j<nUnroll; j++) {
1053
+ const ae_int_t at = j<<3;
1054
+ u0 = _mm256_add_pd(u0, _mm256_mul_pd(ULOAD256PD(pRow[at]), pX[at]));
1055
+ u1 = _mm256_add_pd(u1, _mm256_mul_pd(ULOAD256PD(pRow[at+1]), pX[at+1]));
1056
+ u2 = _mm256_add_pd(u2, _mm256_mul_pd(ULOAD256PD(pRow[at+2]), pX[at+2]));
1057
+ u3 = _mm256_add_pd(u3, _mm256_mul_pd(ULOAD256PD(pRow[at+3]), pX[at+3]));
1058
+ u4 = _mm256_add_pd(u4, _mm256_mul_pd(ULOAD256PD(pRow[at+4]), pX[at+4]));
1059
+ u5 = _mm256_add_pd(u5, _mm256_mul_pd(ULOAD256PD(pRow[at+5]), pX[at+5]));
1060
+ u6 = _mm256_add_pd(u6, _mm256_mul_pd(ULOAD256PD(pRow[at+6]), pX[at+6]));
1061
+ u7 = _mm256_add_pd(u7, _mm256_mul_pd(ULOAD256PD(pRow[at+7]), pX[at+7]));
1062
+ }
1063
+ const ae_int_t at = j<<3;
1064
+ switch(nVec-at) {
1065
+ case 7:
1066
+ u6 = _mm256_add_pd(_mm256_mul_pd(pX[at+6], ULOAD256PD(pRow[at+6])), u6);
1067
+ case 6:
1068
+ u5 = _mm256_add_pd(_mm256_mul_pd(pX[at+5], ULOAD256PD(pRow[at+5])), u5);
1069
+ case 5:
1070
+ u4 = _mm256_add_pd(_mm256_mul_pd(pX[at+4], ULOAD256PD(pRow[at+4])), u4);
1071
+ case 4:
1072
+ u3 = _mm256_add_pd(_mm256_mul_pd(pX[at+3], ULOAD256PD(pRow[at+3])), u3);
1073
+ case 3:
1074
+ u2 = _mm256_add_pd(_mm256_mul_pd(pX[at+2], ULOAD256PD(pRow[at+2])), u2);
1075
+ case 2:
1076
+ u1 = _mm256_add_pd(_mm256_mul_pd(pX[at+1], ULOAD256PD(pRow[at+1])), u1);
1077
+ case 1:
1078
+ u0 = _mm256_add_pd(_mm256_mul_pd(pX[at+0], ULOAD256PD(pRow[at+0])), u0);
1079
+ }
1080
+ sum = _mm256_add_pd(
1081
+ _mm256_add_pd(_mm256_add_pd(u0, u1), _mm256_add_pd(u2, u3)),
1082
+ _mm256_add_pd(_mm256_add_pd(u4, u5), _mm256_add_pd(u6, u7)));
1083
+ }
1084
+ else {
1085
+ switch(nVec) {
1086
+ case 0:
1087
+ sum = _mm256_setzero_pd();
1088
+ break;
1089
+ case 1:
1090
+ sum = _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0]));
1091
+ break;
1092
+ case 2:
1093
+ sum = _mm256_add_pd(
1094
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1095
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1])));
1096
+ break;
1097
+ case 3:
1098
+ sum = _mm256_add_pd(
1099
+ _mm256_add_pd(
1100
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1101
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
1102
+ _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])));
1103
+ break;
1104
+ case 4:
1105
+ sum = _mm256_add_pd(
1106
+ _mm256_add_pd(
1107
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1108
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
1109
+ _mm256_add_pd(
1110
+ _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
1111
+ _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3]))));
1112
+ break;
1113
+ case 5:
1114
+ sum = _mm256_add_pd(
1115
+ _mm256_add_pd(
1116
+ _mm256_add_pd(
1117
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1118
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
1119
+ _mm256_add_pd(
1120
+ _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
1121
+ _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
1122
+ _mm256_mul_pd(pX[4], ULOAD256PD(pRow[4])));
1123
+ break;
1124
+ case 6:
1125
+ sum = _mm256_add_pd(
1126
+ _mm256_add_pd(
1127
+ _mm256_add_pd(
1128
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1129
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
1130
+ _mm256_add_pd(
1131
+ _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
1132
+ _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
1133
+ _mm256_add_pd(
1134
+ _mm256_mul_pd(pX[4], ULOAD256PD(pRow[4])),
1135
+ _mm256_mul_pd(pX[5], ULOAD256PD(pRow[5]))));
1136
+ break;
1137
+ case 7:
1138
+ sum = _mm256_add_pd(
1139
+ _mm256_add_pd(
1140
+ _mm256_add_pd(
1141
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1142
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
1143
+ _mm256_add_pd(
1144
+ _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
1145
+ _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
1146
+ _mm256_add_pd(
1147
+ _mm256_add_pd(
1148
+ _mm256_mul_pd(pX[4], ULOAD256PD(pRow[4])),
1149
+ _mm256_mul_pd(pX[5], ULOAD256PD(pRow[5]))),
1150
+ _mm256_mul_pd(pX[6], ULOAD256PD(pRow[6]))));
1151
+ break;
1152
+ }
1153
+ }
1154
+ const __m128d t = _mm_add_pd(_mm256_extractf128_pd(sum, 0), _mm256_extractf128_pd(sum, 1));
1155
+ const double* pComps = (const double*)&t;
1156
+ double ans = pComps[0] + pComps[1];
1157
+ const ae_int_t tail = nVec<<2;
1158
+ for(j=tail; j<n; j++) {
1159
+ ans += a->ptr.pp_double[i+ia][j+ja] * x[j];
1160
+ }
1161
+ y[i] += alpha*ans;
1162
+ }
1163
+ }
1164
+
1165
+ void rgemvx_straight_avx2(const ae_int_t m, const ae_int_t n,
1166
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
1167
+ const ae_int_t ja, const double* __restrict x,
1168
+ double* __restrict y, ae_state* _state)
1169
+ {
1170
+ ae_int_t i;
1171
+ ae_int_t j;
1172
+ if( n<=3 ) {
1173
+ for(i=0; i<m; i++) {
1174
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
1175
+ double v = 0.0;
1176
+ for(j=0; j<n; j++) {
1177
+ v += p_a[j] * x[j];
1178
+ }
1179
+ y[i] += alpha*v;
1180
+ }
1181
+ return;
1182
+ }
1183
+
1184
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
1185
+ if(unal == 0)
1186
+ {
1187
+ rgemvx_straight_avx2_xaligned(m, n, alpha, a, ia, ja, x, y, _state);
1188
+ return;
1189
+ }
1190
+ const ptrdiff_t shift = 4-(unal>>3);
1191
+ for(i=0; i<m; i++) {
1192
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
1193
+ double v = 0.0;
1194
+ for(j=0; j<shift; j++) {
1195
+ v += p_a[j] * x[j];
1196
+ }
1197
+ y[i] += alpha*v;
1198
+ }
1199
+ rgemvx_straight_avx2_xaligned(m, n-shift, alpha, a, ia, ja+shift, x+shift, y, _state);
1200
+ }
1201
+
1202
+ void rgemvx_transposed_avx2_yaligned(const ae_int_t m, const ae_int_t n,
1203
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
1204
+ const ae_int_t ja, const double* __restrict x, double* __restrict y,
1205
+ ae_state* _state)
1206
+ {
1207
+ ae_int_t i;
1208
+ ae_int_t j;
1209
+ __m256d* __restrict pY = (__m256d*)y;
1210
+ const ae_int_t nVec = m >> 2;
1211
+
1212
+ for(i=0; i<=n-1; i++)
1213
+ {
1214
+ const __m256d* __restrict pRow = (const __m256d*)(a->ptr.pp_double[i+ia]+ja);
1215
+ const double v = alpha*x[i];
1216
+ const __m256d vV = _mm256_set1_pd(v);
1217
+ for(j=0; j<nVec; j++)
1218
+ {
1219
+ pY[j] = _mm256_add_pd(_mm256_mul_pd(vV, ULOAD256PD(pRow[j])), pY[j]);
1220
+ }
1221
+ const ae_int_t tail = nVec<<2;
1222
+ for(j=tail; j<m; j++) {
1223
+ y[j] += v*a->ptr.pp_double[i+ia][j+ja];
1224
+ }
1225
+ }
1226
+ }
1227
+
1228
+ void rgemvx_transposed_avx2(const ae_int_t m, const ae_int_t n,
1229
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
1230
+ const ae_int_t ja, const double* __restrict x, double* __restrict y,
1231
+ ae_state* _state)
1232
+ {
1233
+ ae_int_t i;
1234
+ ae_int_t j;
1235
+ if( m<=3 ) {
1236
+ for(i=0; i<n; i++) {
1237
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
1238
+ const double v = alpha*x[i];
1239
+ for(j=0; j<m; j++) {
1240
+ y[j] += v*p_a[j];
1241
+ }
1242
+ }
1243
+ return;
1244
+ }
1245
+
1246
+ const ptrdiff_t unal = ((ptrdiff_t)y) & 31;
1247
+ if(unal == 0)
1248
+ {
1249
+ rgemvx_transposed_avx2_yaligned(m, n, alpha, a, ia, ja, x, y, _state);
1250
+ return;
1251
+ }
1252
+ const ptrdiff_t shift = 4-(unal>>3);
1253
+ for(i=0; i<n; i++) {
1254
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
1255
+ const double v = alpha*x[i];
1256
+ for(j=0; j<shift; j++) {
1257
+ y[j] += v*p_a[j];
1258
+ }
1259
+ }
1260
+ rgemvx_transposed_avx2_yaligned(m-shift, n, alpha, a, ia, ja+shift, x, y+shift, _state);
1261
+ }
1262
+
1263
+ /*************************************************************************
1264
+ Block packing function for fast rGEMM. Loads a long WIDTH*LENGTH submatrix
1265
+ with LENGTH<=BLOCK_SIZE and WIDTH<=MICRO_SIZE into contiguous MICRO_SIZE*
1266
+ BLOCK_SIZE row-wise 'horizontal' storage (hence H in the function name).
1267
+
1268
+ The matrix occupies the first ROUND_LENGTH columns of the storage (with LENGTH
1269
+ being rounded up to the nearest SIMD granularity). ROUND_LENGTH is returned
1270
+ as the result. It is guaranteed that ROUND_LENGTH depends only on LENGTH, and
1271
+ that it will be the same for all function calls.
1272
+
1273
+ Unused rows and columns in the [LENGTH,ROUND_LENGTH) range are filled with zeros;
1274
+ unused columns in the [ROUND_LENGTH,BLOCK_SIZE) range are ignored.
1275
+
1276
+ * op=0 means that the source is a WIDTH*LENGTH matrix stored with src_stride
1277
+ stride. The matrix is NOT transposed on load.
1278
+ * op=1 means that the source is a LENGTH*WIDTH matrix stored with src_stride
1279
+ stride and is loaded with transposition.
1280
+ * the present version of the function supports only MICRO_SIZE=2; the behavior
1281
+ is undefined for other micro sizes.
1282
+ * the target is properly aligned; the source can be unaligned.
1283
+
1284
+ Requires AVX2, does NOT check its presence.
1285
+
1286
+ The function comes in two versions: one with a variable opsrc_length
1287
+ and another with opsrc_length==block_size==32.
1288
+
1289
+ -- ALGLIB routine --
1290
+ 19.07.2021
1291
+ Bochkanov Sergey
1292
+ *************************************************************************/
1293
+ ae_int_t ablasf_packblkh_avx2(
1294
+ const double *src,
1295
+ ae_int_t src_stride,
1296
+ ae_int_t op,
1297
+ ae_int_t opsrc_length,
1298
+ ae_int_t opsrc_width,
1299
+ double *dst,
1300
+ ae_int_t block_size,
1301
+ ae_int_t micro_size)
1302
+ {
1303
+ ae_int_t i;
1304
+
1305
+ /*
1306
+ * Write to the storage
1307
+ */
1308
+ if( op==0 )
1309
+ {
1310
+ /*
1311
+ * Copy without transposition
1312
+ */
1313
+ const ae_int_t len8=(opsrc_length>>3)<<3;
1314
+ const double *src1 = src+src_stride;
1315
+ double *dst1 = dst+block_size;
1316
+ if( opsrc_width==2 )
1317
+ {
1318
+ /*
1319
+ * Width=2
1320
+ */
1321
+ for(i=0; i<len8; i+=8)
1322
+ {
1323
+ _mm256_store_pd(dst+i, _mm256_loadu_pd(src+i));
1324
+ _mm256_store_pd(dst+i+4, _mm256_loadu_pd(src+i+4));
1325
+ _mm256_store_pd(dst1+i, _mm256_loadu_pd(src1+i));
1326
+ _mm256_store_pd(dst1+i+4, _mm256_loadu_pd(src1+i+4));
1327
+ }
1328
+ for(i=len8; i<opsrc_length; i++)
1329
+ {
1330
+ dst[i] = src[i];
1331
+ dst1[i] = src1[i];
1332
+ }
1333
+ }
1334
+ else
1335
+ {
1336
+ /*
1337
+ * Width=1, pad by zeros
1338
+ */
1339
+ __m256d vz = _mm256_setzero_pd();
1340
+ for(i=0; i<len8; i+=8)
1341
+ {
1342
+ _mm256_store_pd(dst+i, _mm256_loadu_pd(src+i));
1343
+ _mm256_store_pd(dst+i+4, _mm256_loadu_pd(src+i+4));
1344
+ _mm256_store_pd(dst1+i, vz);
1345
+ _mm256_store_pd(dst1+i+4, vz);
1346
+ }
1347
+ for(i=len8; i<opsrc_length; i++)
1348
+ {
1349
+ dst[i] = src[i];
1350
+ dst1[i] = 0.0;
1351
+ }
1352
+ }
1353
+ }
1354
+ else
1355
+ {
1356
+ /*
1357
+ * Copy with transposition
1358
+ */
1359
+ const ae_int_t stride2 = src_stride<<1;
1360
+ const ae_int_t stride3 = src_stride+stride2;
1361
+ const ae_int_t stride4 = src_stride<<2;
1362
+ const ae_int_t len4=(opsrc_length>>2)<<2;
1363
+ const double *srci = src;
1364
+ double *dst1 = dst+block_size;
1365
+ if( opsrc_width==2 )
1366
+ {
1367
+ /*
1368
+ * Width=2
1369
+ */
1370
+ for(i=0; i<len4; i+=4)
1371
+ {
1372
+ __m128d s0 = _mm_loadu_pd(srci), s1 = _mm_loadu_pd(srci+src_stride);
1373
+ __m128d s2 = _mm_loadu_pd(srci+stride2), s3 = _mm_loadu_pd(srci+stride3);
1374
+ _mm_store_pd(dst+i, _mm_unpacklo_pd(s0,s1));
1375
+ _mm_store_pd(dst1+i, _mm_unpackhi_pd(s0,s1));
1376
+ _mm_store_pd(dst+i+2, _mm_unpacklo_pd(s2,s3));
1377
+ _mm_store_pd(dst1+i+2, _mm_unpackhi_pd(s2,s3));
1378
+ srci += stride4;
1379
+ }
1380
+ for(i=len4; i<opsrc_length; i++)
1381
+ {
1382
+ dst[i] = srci[0];
1383
+ dst1[i] = srci[1];
1384
+ srci += src_stride;
1385
+ }
1386
+ }
1387
+ else
1388
+ {
1389
+ /*
1390
+ * Width=1, pad by zeros
1391
+ */
1392
+ __m128d vz = _mm_setzero_pd();
1393
+ for(i=0; i<len4; i+=4)
1394
+ {
1395
+ __m128d s0 = _mm_load_sd(srci), s1 = _mm_load_sd(srci+src_stride);
1396
+ __m128d s2 = _mm_load_sd(srci+stride2), s3 = _mm_load_sd(srci+stride3);
1397
+ _mm_store_pd(dst+i, _mm_unpacklo_pd(s0,s1));
1398
+ _mm_store_pd(dst+i+2, _mm_unpacklo_pd(s2,s3));
1399
+ _mm_store_pd(dst1+i, vz);
1400
+ _mm_store_pd(dst1+i+2, vz);
1401
+ srci += stride4;
1402
+ }
1403
+ for(i=len4; i<opsrc_length; i++)
1404
+ {
1405
+ dst[i] = srci[0];
1406
+ dst1[i] = 0.0;
1407
+ srci += src_stride;
1408
+ }
1409
+ }
1410
+ }
1411
+
1412
+ /*
1413
+ * Pad by zeros, if needed
1414
+ */
1415
+ ae_int_t round_length = ((opsrc_length+3)>>2)<<2;
1416
+ for(i=opsrc_length; i<round_length; i++)
1417
+ {
1418
+ dst[i] = 0;
1419
+ dst[i+block_size] = 0;
1420
+ }
1421
+ return round_length;
1422
+ }
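For readers of the diff: the packing layout documented above ablasf_packblkh_avx2() can be summarized by the scalar sketch below. It is an illustration only (packblkh_reference is a hypothetical name, not part of the ALGLIB sources or of this gem) and assumes micro_size==2, the only value the SIMD routine supports.

static ae_int_t packblkh_reference(
    const double *src, ae_int_t src_stride, ae_int_t op,
    ae_int_t opsrc_length, ae_int_t opsrc_width,
    double *dst, ae_int_t block_size)
{
    /* LENGTH rounded up to the 4-element SIMD granularity */
    const ae_int_t round_length = ((opsrc_length+3)>>2)<<2;
    ae_int_t i, j;
    for(j=0; j<2; j++)                          /* micro_size==2 target rows */
        for(i=0; i<round_length; i++)
        {
            double v = 0.0;
            if( i<opsrc_length && j<opsrc_width )
                v = (op==0) ? src[j*src_stride+i]  /* op=0: direct load     */
                            : src[i*src_stride+j]; /* op=1: transposed load */
            dst[j*block_size+i] = v;  /* row-wise 'horizontal' storage, zero-padded */
        }
    return round_length;
}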
1423
+
1424
+ ae_int_t ablasf_packblkh32_avx2(
1425
+ const double *src,
1426
+ ae_int_t src_stride,
1427
+ ae_int_t op,
1428
+ ae_int_t ignore_opsrc_length,
1429
+ ae_int_t opsrc_width,
1430
+ double *dst,
1431
+ ae_int_t ignore_block_size,
1432
+ ae_int_t micro_size)
1433
+ {
1434
+ ae_int_t i;
1435
+
1436
+ /*
1437
+ * Write to the storage
1438
+ */
1439
+ if( op==0 )
1440
+ {
1441
+ /*
1442
+ * Copy without transposition
1443
+ */
1444
+ const double *src1 = src+src_stride;
1445
+ double *dst1 = dst+32;
1446
+ if( opsrc_width==2 )
1447
+ {
1448
+ /*
1449
+ * Width=2
1450
+ */
1451
+ for(i=0; i<32; i+=8)
1452
+ {
1453
+ _mm256_store_pd(dst+i, _mm256_loadu_pd(src+i));
1454
+ _mm256_store_pd(dst+i+4, _mm256_loadu_pd(src+i+4));
1455
+ _mm256_store_pd(dst1+i, _mm256_loadu_pd(src1+i));
1456
+ _mm256_store_pd(dst1+i+4, _mm256_loadu_pd(src1+i+4));
1457
+ }
1458
+ }
1459
+ else
1460
+ {
1461
+ /*
1462
+ * Width=1, pad by zeros
1463
+ */
1464
+ __m256d vz = _mm256_setzero_pd();
1465
+ for(i=0; i<32; i+=8)
1466
+ {
1467
+ _mm256_store_pd(dst+i, _mm256_loadu_pd(src+i));
1468
+ _mm256_store_pd(dst+i+4, _mm256_loadu_pd(src+i+4));
1469
+ _mm256_store_pd(dst1+i, vz);
1470
+ _mm256_store_pd(dst1+i+4, vz);
1471
+ }
1472
+ }
1473
+ }
1474
+ else
1475
+ {
1476
+ /*
1477
+ * Copy with transposition
1478
+ */
1479
+ const ae_int_t stride2 = src_stride<<1;
1480
+ const ae_int_t stride3 = src_stride+stride2;
1481
+ const ae_int_t stride4 = src_stride<<2;
1482
+ const double *srci = src;
1483
+ double *dst1 = dst+32;
1484
+ if( opsrc_width==2 )
1485
+ {
1486
+ /*
1487
+ * Width=2
1488
+ */
1489
+ for(i=0; i<32; i+=4)
1490
+ {
1491
+ __m128d s0 = _mm_loadu_pd(srci), s1 = _mm_loadu_pd(srci+src_stride);
1492
+ __m128d s2 = _mm_loadu_pd(srci+stride2), s3 = _mm_loadu_pd(srci+stride3);
1493
+ _mm_store_pd(dst+i, _mm_unpacklo_pd(s0,s1));
1494
+ _mm_store_pd(dst1+i, _mm_unpackhi_pd(s0,s1));
1495
+ _mm_store_pd(dst+i+2, _mm_unpacklo_pd(s2,s3));
1496
+ _mm_store_pd(dst1+i+2, _mm_unpackhi_pd(s2,s3));
1497
+ srci += stride4;
1498
+ }
1499
+ }
1500
+ else
1501
+ {
1502
+ /*
1503
+ * Width=1, pad by zeros
1504
+ */
1505
+ __m128d vz = _mm_setzero_pd();
1506
+ for(i=0; i<32; i+=4)
1507
+ {
1508
+ __m128d s0 = _mm_load_sd(srci), s1 = _mm_load_sd(srci+src_stride);
1509
+ __m128d s2 = _mm_load_sd(srci+stride2), s3 = _mm_load_sd(srci+stride3);
1510
+ _mm_store_pd(dst+i, _mm_unpacklo_pd(s0,s1));
1511
+ _mm_store_pd(dst+i+2, _mm_unpacklo_pd(s2,s3));
1512
+ _mm_store_pd(dst1+i, vz);
1513
+ _mm_store_pd(dst1+i+2, vz);
1514
+ srci += stride4;
1515
+ }
1516
+ }
1517
+ }
1518
+ return 32;
1519
+ }
1520
+
1521
+ /*************************************************************************
1522
+ Computes the product A*transpose(B) of two MICRO_SIZE*ROUND_LENGTH row-wise
1523
+ 'horizontal' matrices, stored with stride=block_size, and writes it to the
1524
+ row-wise matrix C.
1525
+
1526
+ ROUND_LENGTH is expected to be a properly SIMD-rounded length, as returned
1527
+ by ablasf_packblkh_avx2().
1528
+
1529
+ The present version of the function supports only MICRO_SIZE=2; the behavior
1530
+ is undefined for other micro sizes.
1531
+
1532
+ Requires AVX2, does NOT check its presence.
1533
+
1534
+ -- ALGLIB routine --
1535
+ 19.07.2021
1536
+ Bochkanov Sergey
1537
+ *************************************************************************/
1538
+ void ablasf_dotblkh_avx2(
1539
+ const double *src_a,
1540
+ const double *src_b,
1541
+ ae_int_t round_length,
1542
+ ae_int_t block_size,
1543
+ ae_int_t micro_size,
1544
+ double *dst,
1545
+ ae_int_t dst_stride)
1546
+ {
1547
+ ae_int_t z;
1548
+ __m256d r00 = _mm256_setzero_pd(), r01 = _mm256_setzero_pd(), r10 = _mm256_setzero_pd(), r11 = _mm256_setzero_pd();
1549
+ if( round_length&0x7 )
1550
+ {
1551
+ /*
1552
+ * round_length is a multiple of 4, but not a multiple of 8
1553
+ */
1554
+ for(z=0; z<round_length; z+=4, src_a+=4, src_b+=4)
1555
+ {
1556
+ __m256d a0 = _mm256_load_pd(src_a);
1557
+ __m256d a1 = _mm256_load_pd(src_a+block_size);
1558
+ __m256d b0 = _mm256_load_pd(src_b);
1559
+ __m256d b1 = _mm256_load_pd(src_b+block_size);
1560
+ r00 = _mm256_add_pd(_mm256_mul_pd(a0, b0), r00);
1561
+ r01 = _mm256_add_pd(_mm256_mul_pd(a0, b1), r01);
1562
+ r10 = _mm256_add_pd(_mm256_mul_pd(a1, b0), r10);
1563
+ r11 = _mm256_add_pd(_mm256_mul_pd(a1, b1), r11);
1564
+ }
1565
+ }
1566
+ else
1567
+ {
1568
+ /*
1569
+ * round_length is a multiple of 8
1570
+ */
1571
+ for(z=0; z<round_length; z+=8, src_a+=8, src_b+=8)
1572
+ {
1573
+ __m256d a0 = _mm256_load_pd(src_a);
1574
+ __m256d a1 = _mm256_load_pd(src_a+block_size);
1575
+ __m256d b0 = _mm256_load_pd(src_b);
1576
+ __m256d b1 = _mm256_load_pd(src_b+block_size);
1577
+ __m256d c0 = _mm256_load_pd(src_a+4);
1578
+ __m256d c1 = _mm256_load_pd(src_a+block_size+4);
1579
+ __m256d d0 = _mm256_load_pd(src_b+4);
1580
+ __m256d d1 = _mm256_load_pd(src_b+block_size+4);
1581
+ r00 = _mm256_add_pd(_mm256_add_pd(r00, _mm256_mul_pd(a0, b0)), _mm256_mul_pd(c0, d0));
1582
+ r01 = _mm256_add_pd(_mm256_add_pd(r01, _mm256_mul_pd(a0, b1)), _mm256_mul_pd(c0, d1));
1583
+ r10 = _mm256_add_pd(_mm256_add_pd(r10, _mm256_mul_pd(a1, b0)), _mm256_mul_pd(c1, d0));
1584
+ r11 = _mm256_add_pd(_mm256_add_pd(r11, _mm256_mul_pd(a1, b1)), _mm256_mul_pd(c1, d1));
1585
+ }
1586
+ }
1587
+ __m256d sum0 = _mm256_hadd_pd(r00,r01);
1588
+ __m256d sum1 = _mm256_hadd_pd(r10,r11);
1589
+ _mm_store_pd(dst, _mm_add_pd(_mm256_castpd256_pd128(sum0), _mm256_extractf128_pd(sum0,1)));
1590
+ _mm_store_pd(dst+dst_stride, _mm_add_pd(_mm256_castpd256_pd128(sum1), _mm256_extractf128_pd(sum1,1)));
1591
+ }
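As a reading aid, a scalar equivalent of the 2x2 micro-kernel above follows (dotblkh_reference is a hypothetical name, illustration only): each entry of the 2x2 output is the dot product of one packed row of A with one packed row of B over the SIMD-rounded length. In the AVX2 version the accumulators r00..r11 hold these four sums in vector form; the two _mm256_hadd_pd() calls plus the final 128-bit adds collapse them to scalars and write the result with two 16-byte stores.

static void dotblkh_reference(
    const double *src_a, const double *src_b,
    ae_int_t round_length, ae_int_t block_size,
    double *dst, ae_int_t dst_stride)
{
    ae_int_t r, c, z;
    for(r=0; r<2; r++)            /* rows of A (micro_size==2) */
        for(c=0; c<2; c++)        /* rows of B (micro_size==2) */
        {
            double s = 0.0;
            for(z=0; z<round_length; z++)
                s += src_a[r*block_size+z]*src_b[c*block_size+z];
            dst[r*dst_stride+c] = s;   /* C := A*transpose(B) */
        }
}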
1592
+
1593
+ /*************************************************************************
1594
+ Y := alpha*X + beta*Y
1595
+
1596
+ Requires AVX2, does NOT check its presence.
1597
+
1598
+ -- ALGLIB routine --
1599
+ 19.07.2021
1600
+ Bochkanov Sergey
1601
+ *************************************************************************/
1602
+ void ablasf_daxpby_avx2(
1603
+ ae_int_t n,
1604
+ double alpha,
1605
+ const double *src,
1606
+ double beta,
1607
+ double *dst)
1608
+ {
1609
+ if( beta==1.0 )
1610
+ {
1611
+ /*
1612
+ * The most optimized case: DST := alpha*SRC + DST
1613
+ *
1614
+ * First, we process leading elements with generic C code until DST is aligned.
1615
+ * Then, we process the central part, assuming that DST is properly aligned.
1616
+ * Finally, we process the tail.
1617
+ */
1618
+ ae_int_t i, n4;
1619
+ __m256d avx_alpha = _mm256_set1_pd(alpha);
1620
+ while( n>0 && (((ptrdiff_t)dst)&31) )
1621
+ {
1622
+ *dst += alpha*(*src);
1623
+ n--;
1624
+ dst++;
1625
+ src++;
1626
+ }
1627
+ n4=(n>>2)<<2;
1628
+ for(i=0; i<n4; i+=4)
1629
+ {
1630
+ __m256d r = _mm256_add_pd(_mm256_mul_pd(avx_alpha, _mm256_loadu_pd(src+i)), _mm256_load_pd(dst+i));
1631
+ _mm256_store_pd(dst+i, r);
1632
+ }
1633
+ for(i=n4; i<n; i++)
1634
+ dst[i] = alpha*src[i]+dst[i];
1635
+ }
1636
+ else if( beta!=0.0 )
1637
+ {
1638
+ /*
1639
+ * Well optimized: DST := alpha*SRC + beta*DST
1640
+ */
1641
+ ae_int_t i, n4;
1642
+ __m256d avx_alpha = _mm256_set1_pd(alpha);
1643
+ __m256d avx_beta = _mm256_set1_pd(beta);
1644
+ while( n>0 && (((ptrdiff_t)dst)&31) )
1645
+ {
1646
+ *dst = alpha*(*src) + beta*(*dst);
1647
+ n--;
1648
+ dst++;
1649
+ src++;
1650
+ }
1651
+ n4=(n>>2)<<2;
1652
+ for(i=0; i<n4; i+=4)
1653
+ {
1654
+ __m256d r = _mm256_add_pd(
1655
+ _mm256_mul_pd(avx_alpha, _mm256_loadu_pd(src+i)),
1656
+ _mm256_mul_pd(avx_beta,_mm256_load_pd(dst+i)));
1657
+ _mm256_store_pd(dst+i, r);
1658
+ }
1659
+ for(i=n4; i<n; i++)
1660
+ dst[i] = alpha*src[i]+beta*dst[i];
1661
+ }
1662
+ else
1663
+ {
1664
+ /*
1665
+ * Easy case: DST := alpha*SRC
1666
+ */
1667
+ ae_int_t i;
1668
+ for(i=0; i<n; i++)
1669
+ dst[i] = alpha*src[i];
1670
+ }
1671
+ }
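A minimal usage sketch for the routine above (daxpby_usage_example is a hypothetical helper, shown only to illustrate the calling convention; it assumes the host CPU supports AVX2, which the kernel itself does not verify). Depending on how dst happens to be aligned, the call exercises the scalar head, the aligned vector body and the scalar tail.

static void daxpby_usage_example(void)
{
    double x[7] = {1, 2, 3, 4, 5, 6, 7};
    double y[7] = {1, 1, 1, 1, 1, 1, 1};

    /* y := 0.5*x + 2.0*y; with these inputs each y[i] becomes 0.5*x[i] + 2.0 */
    ablasf_daxpby_avx2(7, 0.5, x, 2.0, y);
}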
1672
+
1673
+ ae_bool spchol_updatekernelabc4_avx2(double* rowstorage,
1674
+ ae_int_t offss,
1675
+ ae_int_t twidth,
1676
+ ae_int_t offsu,
1677
+ ae_int_t uheight,
1678
+ ae_int_t urank,
1679
+ ae_int_t urowstride,
1680
+ ae_int_t uwidth,
1681
+ const double* diagd,
1682
+ ae_int_t offsd,
1683
+ const ae_int_t* raw2smap,
1684
+ const ae_int_t* superrowidx,
1685
+ ae_int_t urbase,
1686
+ ae_state *_state)
1687
+ {
1688
+ ae_int_t k;
1689
+ ae_int_t targetrow;
1690
+ ae_int_t targetcol;
1691
+
1692
+ /*
1693
+ * Filter out unsupported combinations (ones that are too sparse for the non-SIMD code)
1694
+ */
1695
+ if( twidth<3||twidth>4 )
1696
+ {
1697
+ return ae_false;
1698
+ }
1699
+ if( uwidth<1||uwidth>4 )
1700
+ {
1701
+ return ae_false;
1702
+ }
1703
+ if( urank>4 )
1704
+ {
1705
+ return ae_false;
1706
+ }
1707
+
1708
+ /*
1709
+ * Shift input arrays to the beginning of the working area.
1710
+ * Prepare SIMD masks
1711
+ */
1712
+ __m256i v_rankmask = _mm256_cmpgt_epi64(_mm256_set_epi64x(urank, urank, urank, urank), _mm256_set_epi64x(3, 2, 1, 0));
1713
+ double *update_storage = rowstorage+offsu;
1714
+ double *target_storage = rowstorage+offss;
1715
+ superrowidx += urbase;
1716
+
1717
+ /*
1718
+ * Load head of the update matrix
1719
+ */
1720
+ __m256d v_d0123 = _mm256_maskload_pd(diagd+offsd, v_rankmask);
1721
+ __m256d u_0_0123 = _mm256_setzero_pd();
1722
+ __m256d u_1_0123 = _mm256_setzero_pd();
1723
+ __m256d u_2_0123 = _mm256_setzero_pd();
1724
+ __m256d u_3_0123 = _mm256_setzero_pd();
1725
+ for(k=0; k<=uwidth-1; k++)
1726
+ {
1727
+ targetcol = raw2smap[superrowidx[k]];
1728
+ if( targetcol==0 )
1729
+ u_0_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
1730
+ if( targetcol==1 )
1731
+ u_1_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
1732
+ if( targetcol==2 )
1733
+ u_2_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
1734
+ if( targetcol==3 )
1735
+ u_3_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
1736
+ }
1737
+
1738
+ /*
1739
+ * Transpose head
1740
+ */
1741
+ __m256d u01_lo = _mm256_unpacklo_pd(u_0_0123,u_1_0123);
1742
+ __m256d u01_hi = _mm256_unpackhi_pd(u_0_0123,u_1_0123);
1743
+ __m256d u23_lo = _mm256_unpacklo_pd(u_2_0123,u_3_0123);
1744
+ __m256d u23_hi = _mm256_unpackhi_pd(u_2_0123,u_3_0123);
1745
+ __m256d u_0123_0 = _mm256_permute2f128_pd(u01_lo, u23_lo, 0x20);
1746
+ __m256d u_0123_1 = _mm256_permute2f128_pd(u01_hi, u23_hi, 0x20);
1747
+ __m256d u_0123_2 = _mm256_permute2f128_pd(u23_lo, u01_lo, 0x13);
1748
+ __m256d u_0123_3 = _mm256_permute2f128_pd(u23_hi, u01_hi, 0x13);
1749
+
1750
+ /*
1751
+ * Run update
1752
+ */
1753
+ if( urank==1 )
1754
+ {
1755
+ for(k=0; k<=uheight-1; k++)
1756
+ {
1757
+ targetrow = raw2smap[superrowidx[k]]*4;
1758
+ double *update_row = rowstorage+offsu+k*urowstride;
1759
+ _mm256_store_pd(target_storage+targetrow,
1760
+ _mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
1761
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
1762
+ }
1763
+ }
1764
+ if( urank==2 )
1765
+ {
1766
+ for(k=0; k<=uheight-1; k++)
1767
+ {
1768
+ targetrow = raw2smap[superrowidx[k]]*4;
1769
+ double *update_row = rowstorage+offsu+k*urowstride;
1770
+ _mm256_store_pd(target_storage+targetrow,
1771
+ _mm256_sub_pd(_mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
1772
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+1), u_0123_1)),
1773
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
1774
+ }
1775
+ }
1776
+ if( urank==3 )
1777
+ {
1778
+ for(k=0; k<=uheight-1; k++)
1779
+ {
1780
+ targetrow = raw2smap[superrowidx[k]]*4;
1781
+ double *update_row = rowstorage+offsu+k*urowstride;
1782
+ _mm256_store_pd(target_storage+targetrow,
1783
+ _mm256_sub_pd(_mm256_sub_pd(_mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
1784
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+2), u_0123_2)),
1785
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+1), u_0123_1)),
1786
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
1787
+ }
1788
+ }
1789
+ if( urank==4 )
1790
+ {
1791
+ for(k=0; k<=uheight-1; k++)
1792
+ {
1793
+ targetrow = raw2smap[superrowidx[k]]*4;
1794
+ double *update_row = rowstorage+offsu+k*urowstride;
1795
+ _mm256_store_pd(target_storage+targetrow,
1796
+ _mm256_sub_pd(_mm256_sub_pd(_mm256_sub_pd(_mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
1797
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+3), u_0123_3)),
1798
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+2), u_0123_2)),
1799
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+1), u_0123_1)),
1800
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
1801
+ }
1802
+ }
1803
+ return ae_true;
1804
+ }
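
The "Transpose head" block above (and the analogous W computation in the next kernel) uses the standard unpacklo/unpackhi + permute2f128 idiom for transposing a 4x4 block of doubles. A self-contained sketch of just that idiom, with our own helper name, is shown below.

/* Standalone illustration (not ALGLIB code): transpose a 4x4 block of doubles
   held in four __m256d registers, rows r0..r3 in, columns c0..c3 out. */
#include <immintrin.h>

static void transpose4x4_pd(__m256d r0, __m256d r1, __m256d r2, __m256d r3,
                            __m256d *c0, __m256d *c1, __m256d *c2, __m256d *c3)
{
    __m256d lo01 = _mm256_unpacklo_pd(r0, r1);      /* a0 b0 a2 b2 */
    __m256d hi01 = _mm256_unpackhi_pd(r0, r1);      /* a1 b1 a3 b3 */
    __m256d lo23 = _mm256_unpacklo_pd(r2, r3);      /* c0 d0 c2 d2 */
    __m256d hi23 = _mm256_unpackhi_pd(r2, r3);      /* c1 d1 c3 d3 */
    *c0 = _mm256_permute2f128_pd(lo01, lo23, 0x20); /* a0 b0 c0 d0 */
    *c1 = _mm256_permute2f128_pd(hi01, hi23, 0x20); /* a1 b1 c1 d1 */
    *c2 = _mm256_permute2f128_pd(lo23, lo01, 0x13); /* a2 b2 c2 d2 */
    *c3 = _mm256_permute2f128_pd(hi23, hi01, 0x13); /* a3 b3 c3 d3 */
}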
1805
+
1806
+ ae_bool spchol_updatekernel4444_avx2(
1807
+ double* rowstorage,
1808
+ ae_int_t offss,
1809
+ ae_int_t sheight,
1810
+ ae_int_t offsu,
1811
+ ae_int_t uheight,
1812
+ const double* diagd,
1813
+ ae_int_t offsd,
1814
+ const ae_int_t* raw2smap,
1815
+ const ae_int_t* superrowidx,
1816
+ ae_int_t urbase,
1817
+ ae_state *_state)
1818
+ {
1819
+ ae_int_t k;
1820
+ ae_int_t targetrow;
1821
+ ae_int_t offsk;
1822
+ __m256d v_negd_u0, v_negd_u1, v_negd_u2, v_negd_u3, v_negd;
1823
+ __m256d v_w0, v_w1, v_w2, v_w3, u01_lo, u01_hi, u23_lo, u23_hi;
1824
+
1825
+ /*
1826
+ * Compute W = -D*transpose(U[0:3])
1827
+ */
1828
+ v_negd = _mm256_mul_pd(_mm256_loadu_pd(diagd+offsd),_mm256_set1_pd(-1.0));
1829
+ v_negd_u0 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+0*4),v_negd);
1830
+ v_negd_u1 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+1*4),v_negd);
1831
+ v_negd_u2 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+2*4),v_negd);
1832
+ v_negd_u3 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+3*4),v_negd);
1833
+ u01_lo = _mm256_unpacklo_pd(v_negd_u0,v_negd_u1);
1834
+ u01_hi = _mm256_unpackhi_pd(v_negd_u0,v_negd_u1);
1835
+ u23_lo = _mm256_unpacklo_pd(v_negd_u2,v_negd_u3);
1836
+ u23_hi = _mm256_unpackhi_pd(v_negd_u2,v_negd_u3);
1837
+ v_w0 = _mm256_permute2f128_pd(u01_lo, u23_lo, 0x20);
1838
+ v_w1 = _mm256_permute2f128_pd(u01_hi, u23_hi, 0x20);
1839
+ v_w2 = _mm256_permute2f128_pd(u23_lo, u01_lo, 0x13);
1840
+ v_w3 = _mm256_permute2f128_pd(u23_hi, u01_hi, 0x13);
1841
+
1842
+ //
1843
+ // Compute update S := S + row_scatter(U*W)
1844
+ //
1845
+ if( sheight==uheight )
1846
+ {
1847
+ /*
1848
+ * No row scatter, the most efficient code
1849
+ */
1850
+ for(k=0; k<=uheight-1; k++)
1851
+ {
1852
+ __m256d target;
1853
+
1854
+ targetrow = offss+k*4;
1855
+ offsk = offsu+k*4;
1856
+
1857
+ target = _mm256_load_pd(rowstorage+targetrow);
1858
+ target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+0),v_w0),target);
1859
+ target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+1),v_w1),target);
1860
+ target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+2),v_w2),target);
1861
+ target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+3),v_w3),target);
1862
+ _mm256_store_pd(rowstorage+targetrow, target);
1863
+ }
1864
+ }
1865
+ else
1866
+ {
1867
+ /*
1868
+ * Row scatter is performed; this less efficient code uses a double mapping to determine the target row index
1869
+ */
1870
+ for(k=0; k<=uheight-1; k++)
1871
+ {
1872
+ __m256d v_uk0, v_uk1, v_uk2, v_uk3, target;
1873
+
1874
+ targetrow = offss+raw2smap[superrowidx[urbase+k]]*4;
1875
+ offsk = offsu+k*4;
1876
+
1877
+ target = _mm256_load_pd(rowstorage+targetrow);
1878
+ v_uk0 = _mm256_broadcast_sd(rowstorage+offsk+0);
1879
+ v_uk1 = _mm256_broadcast_sd(rowstorage+offsk+1);
1880
+ v_uk2 = _mm256_broadcast_sd(rowstorage+offsk+2);
1881
+ v_uk3 = _mm256_broadcast_sd(rowstorage+offsk+3);
1882
+ target = _mm256_add_pd(_mm256_mul_pd(v_uk0,v_w0),target);
1883
+ target = _mm256_add_pd(_mm256_mul_pd(v_uk1,v_w1),target);
1884
+ target = _mm256_add_pd(_mm256_mul_pd(v_uk2,v_w2),target);
1885
+ target = _mm256_add_pd(_mm256_mul_pd(v_uk3,v_w3),target);
1886
+ _mm256_store_pd(rowstorage+targetrow, target);
1887
+ }
1888
+ }
1889
+ return ae_true;
1890
+ }
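
Ignoring the row scatter and assuming contiguous 4-wide rows (the first branch above), the update this kernel performs is S[k][c] -= sum over j of U[k][j]*d[j]*U[c][j]. A scalar sketch of that arithmetic, under those simplifying assumptions and with hypothetical names:

/* Scalar sketch of the 4x4x4x4 update above (illustrative only, not ALGLIB API):
   rows of S and U are stored as 4 contiguous doubles, no row scatter. */
static void updatekernel4444_reference(double *s, const double *u, const double *d, int uheight)
{
    for(int k=0; k<uheight; k++)
        for(int c=0; c<4; c++)
            for(int j=0; j<4; j++)
                s[k*4+c] -= u[k*4+j]*d[j]*u[c*4+j];
}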
1891
+
1892
+
1893
+ /*************************************************************************
1894
+ Fast kernel for biharmonic panel with NY=1
1895
+
1896
+ INPUT PARAMETERS:
1897
+ D0, D1, D2 - evaluation point minus (Panel.C0,Panel.C1,Panel.C2)
1898
+
1899
+ OUTPUT PARAMETERS:
1900
+ F - model value
1901
+ InvPowRPPlus1 - 1/(R^(P+1))
1902
+
1903
+ -- ALGLIB --
1904
+ Copyright 26.08.2022 by Sergey Bochkanov
1905
+ *************************************************************************/
1906
+ ae_bool rbfv3farfields_bhpaneleval1fastkernel16_avx2(double d0,
1907
+ double d1,
1908
+ double d2,
1909
+ const double* pnma,
1910
+ const double* pnmb,
1911
+ const double* pmmcdiag,
1912
+ const double* ynma,
1913
+ const double* tblrmodmn,
1914
+ double* f,
1915
+ double* invpowrpplus1,
1916
+ ae_state *_state)
1917
+ {
1918
+ ae_int_t n;
1919
+ double r, r2, r01, invr;
1920
+ double sintheta, costheta;
1921
+ ae_complex expiphi, expiphi2, expiphi3, expiphi4;
1922
+ ae_int_t jj;
1923
+ ae_bool result;
1924
+
1925
+ *f = 0.0;
1926
+ *invpowrpplus1 = 0.0;
1927
+ result = ae_true;
1928
+
1929
+ /*
1930
+ * Convert to spherical polar coordinates.
1931
+ *
1932
+ * NOTE: we make sure that R is non-zero by adding an extremely small perturbation
1933
+ */
1934
+ r2 = d0*d0+d1*d1+d2*d2+ae_minrealnumber;
1935
+ r = ae_sqrt(r2, _state);
1936
+ r01 = ae_sqrt(d0*d0+d1*d1+ae_minrealnumber, _state);
1937
+ costheta = d2/r;
1938
+ sintheta = r01/r;
1939
+ expiphi.x = d0/r01;
1940
+ expiphi.y = d1/r01;
1941
+ invr = (double)1/r;
1942
+
1943
+ /*
1944
+ * prepare precomputed quantities
1945
+ */
1946
+ double powsintheta2 = sintheta*sintheta;
1947
+ double powsintheta3 = powsintheta2*sintheta;
1948
+ double powsintheta4 = powsintheta2*powsintheta2;
1949
+ expiphi2.x = expiphi.x*expiphi.x-expiphi.y*expiphi.y;
1950
+ expiphi2.y = 2*expiphi.x*expiphi.y;
1951
+ expiphi3.x = expiphi2.x*expiphi.x-expiphi2.y*expiphi.y;
1952
+ expiphi3.y = expiphi2.x*expiphi.y+expiphi.x*expiphi2.y;
1953
+ expiphi4.x = expiphi2.x*expiphi2.x-expiphi2.y*expiphi2.y;
1954
+ expiphi4.y = 2*expiphi2.x*expiphi2.y;
1955
+
1956
+ /*
1957
+ * Compute far field expansion for a cluster of basis functions f=r
1958
+ *
1959
+ * NOTE: the original paper by Beatson et al. uses f=r as the basis function,
1960
+ * whilst ALGLIB uses f=-r due to the conditional positive definiteness requirement.
1961
+ * We will perform the conversion later.
1962
+ */
1963
+ __m256d v_costheta = _mm256_set1_pd(costheta);
1964
+ __m256d v_r2 = _mm256_set1_pd(r2);
1965
+ __m256d v_f = _mm256_setzero_pd();
1966
+ __m256d v_invr = _mm256_set1_pd(invr);
1967
+ __m256d v_powsinthetaj = _mm256_set_pd(powsintheta3, powsintheta2, sintheta, 1.0);
1968
+ __m256d v_powsintheta4 = _mm256_set1_pd(powsintheta4);
1969
+ __m256d v_expijphix = _mm256_set_pd(expiphi3.x, expiphi2.x, expiphi.x, 1.0);
1970
+ __m256d v_expijphiy = _mm256_set_pd(expiphi3.y, expiphi2.y, expiphi.y, 0.0);
1971
+ __m256d v_expi4phix = _mm256_set1_pd(expiphi4.x);
1972
+ __m256d v_expi4phiy = _mm256_set1_pd(expiphi4.y);
1973
+ *f = (double)(0);
1974
+ for(jj=0; jj<4; jj++)
1975
+ {
1976
+ __m256d pnm_cur = _mm256_setzero_pd(), pnm_prev = _mm256_setzero_pd(), pnm_new;
1977
+ __m256d v_powrminusj1 = _mm256_set1_pd(invr);
1978
+ for(n=0; n<jj*4; n++)
1979
+ v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
1980
+ for(n=jj*4; n<16; n++)
1981
+ {
1982
+ ae_int_t j0=jj*4;
1983
+
1984
+
1985
+ pnm_new = _mm256_mul_pd(v_powsinthetaj, _mm256_load_pd(pmmcdiag+n*16+j0));
1986
+ pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(v_costheta,_mm256_mul_pd(pnm_cur,_mm256_load_pd(pnma+n*16+j0))));
1987
+ pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(pnm_prev,_mm256_load_pd(pnmb+n*16+j0)));
1988
+ pnm_prev = pnm_cur;
1989
+ pnm_cur = pnm_new;
1990
+
1991
+ __m256d v_tmp = _mm256_mul_pd(pnm_cur, _mm256_load_pd(ynma+n*16+j0));
1992
+ __m256d v_sphericalx = _mm256_mul_pd(v_tmp, v_expijphix);
1993
+ __m256d v_sphericaly = _mm256_mul_pd(v_tmp, v_expijphiy);
1994
+
1995
+ __m256d v_summnx = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(tblrmodmn+n*64+j0+32)),_mm256_load_pd(tblrmodmn+n*64+j0));
1996
+ __m256d v_summny = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(tblrmodmn+n*64+j0+48)),_mm256_load_pd(tblrmodmn+n*64+j0+16));
1997
+
1998
+ __m256d v_z = _mm256_sub_pd(_mm256_mul_pd(v_sphericalx,v_summnx),_mm256_mul_pd(v_sphericaly,v_summny));
1999
+
2000
+ v_f = _mm256_add_pd(v_f, _mm256_mul_pd(v_powrminusj1, v_z));
2001
+ v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
2002
+ }
2003
+ __m256d v_expijphix_new = _mm256_sub_pd(_mm256_mul_pd(v_expijphix,v_expi4phix),_mm256_mul_pd(v_expijphiy,v_expi4phiy));
2004
+ __m256d v_expijphiy_new = _mm256_add_pd(_mm256_mul_pd(v_expijphix,v_expi4phiy),_mm256_mul_pd(v_expijphiy,v_expi4phix));
2005
+ v_powsinthetaj = _mm256_mul_pd(v_powsinthetaj, v_powsintheta4);
2006
+ v_expijphix = v_expijphix_new;
2007
+ v_expijphiy = v_expijphiy_new;
2008
+ }
2009
+
2010
+ double ttt[4];
2011
+ _mm256_storeu_pd(ttt, v_f);
2012
+ for(int k=0; k<4; k++)
2013
+ *f += ttt[k];
2014
+
2015
+ double r4 = r2*r2;
2016
+ double r8 = r4*r4;
2017
+ double r16 = r8*r8;
2018
+ *invpowrpplus1 = 1/r16;
2019
+
2020
+ return result;
2021
+ }
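
The order-advancing trick above keeps one AVX2 lane per expansion order modulo 4 and steps four orders at a time: the lane vector of e^{i*j*phi} values is multiplied by e^{i*4*phi}, and the sin(theta)^j lane vector by sin(theta)^4. A scalar sketch of the complex half of that recurrence (illustrative only; cplx and cmul are our names, not ALGLIB types):

/* Fill out[n] with e^{i*n*phi} for n=0..15, advancing each of 4 lanes by 4 orders
   per outer iteration, exactly as the vectorized loop above does. */
#include <math.h>

typedef struct { double x, y; } cplx;       /* x = Re, y = Im, mirrors ae_complex layout */

static cplx cmul(cplx a, cplx b)
{
    cplx r;
    r.x = a.x*b.x - a.y*b.y;
    r.y = a.x*b.y + a.y*b.x;
    return r;
}

static void demo_expiphi_orders(double phi, cplx out[16])
{
    cplx z1, z2, z4, lane[4];
    z1.x = cos(phi); z1.y = sin(phi);       /* e^{i*phi}   */
    z2 = cmul(z1, z1);                      /* e^{i*2*phi} */
    z4 = cmul(z2, z2);                      /* e^{i*4*phi} */
    lane[0].x = 1.0; lane[0].y = 0.0;
    lane[1] = z1;
    lane[2] = z2;
    lane[3] = cmul(z2, z1);
    for(int block=0; block<4; block++)
        for(int c=0; c<4; c++)
        {
            out[block*4+c] = lane[c];       /* == e^{i*(4*block+c)*phi} */
            lane[c] = cmul(lane[c], z4);    /* advance this lane by 4 orders */
        }
}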
2022
+
2023
+
2024
+ /*************************************************************************
2025
+ Fast kernel for biharmonic panel with general NY
2026
+
2027
+ INPUT PARAMETERS:
2028
+ D0, D1, D2 - evaluation point minus (Panel.C0,Panel.C1,Panel.C2)
2029
+
2030
+ OUTPUT PARAMETERS:
2031
+ F - array[NY], model value
2032
+ InvPowRPPlus1 - 1/(R^(P+1))
2033
+
2034
+ -- ALGLIB --
2035
+ Copyright 26.08.2022 by Sergey Bochkanov
2036
+ *************************************************************************/
2037
+ ae_bool rbfv3farfields_bhpanelevalfastkernel16_avx2(double d0,
2038
+ double d1,
2039
+ double d2,
2040
+ ae_int_t ny,
2041
+ const double* pnma,
2042
+ const double* pnmb,
2043
+ const double* pmmcdiag,
2044
+ const double* ynma,
2045
+ const double* tblrmodmn,
2046
+ double* f,
2047
+ double* invpowrpplus1,
2048
+ ae_state *_state)
2049
+ {
2050
+ ae_int_t n;
2051
+ double r, r2, r01, invr;
2052
+ double sintheta, costheta;
2053
+ ae_complex expiphi, expiphi2, expiphi3, expiphi4;
2054
+ ae_int_t jj;
2055
+
2056
+ /*
2057
+ * Local buffer which is large enough for NY up to 16
2058
+ */
2059
+ __m256d v_f[16];
2060
+ if( ny>16 )
2061
+ return ae_false;
2062
+ for(int k=0; k<ny; k++)
2063
+ {
2064
+ v_f[k] = _mm256_setzero_pd();
2065
+ f[k] = 0.0;
2066
+ }
2067
+
2068
+ /*
2069
+ * Convert to spherical polar coordinates.
2070
+ *
2071
+ * NOTE: we make sure that R is non-zero by adding an extremely small perturbation
2072
+ */
2073
+ r2 = d0*d0+d1*d1+d2*d2+ae_minrealnumber;
2074
+ r = ae_sqrt(r2, _state);
2075
+ r01 = ae_sqrt(d0*d0+d1*d1+ae_minrealnumber, _state);
2076
+ costheta = d2/r;
2077
+ sintheta = r01/r;
2078
+ expiphi.x = d0/r01;
2079
+ expiphi.y = d1/r01;
2080
+ invr = (double)1/r;
2081
+
2082
+ /*
2083
+ * prepare precomputed quantities
2084
+ */
2085
+ double powsintheta2 = sintheta*sintheta;
2086
+ double powsintheta3 = powsintheta2*sintheta;
2087
+ double powsintheta4 = powsintheta2*powsintheta2;
2088
+ expiphi2.x = expiphi.x*expiphi.x-expiphi.y*expiphi.y;
2089
+ expiphi2.y = 2*expiphi.x*expiphi.y;
2090
+ expiphi3.x = expiphi2.x*expiphi.x-expiphi2.y*expiphi.y;
2091
+ expiphi3.y = expiphi2.x*expiphi.y+expiphi.x*expiphi2.y;
2092
+ expiphi4.x = expiphi2.x*expiphi2.x-expiphi2.y*expiphi2.y;
2093
+ expiphi4.y = 2*expiphi2.x*expiphi2.y;
2094
+
2095
+ /*
2096
+ * Compute far field expansion for a cluster of basis functions f=r
2097
+ *
2098
+ * NOTE: the original paper by Beatson et al. uses f=r as the basis function,
2099
+ * whilst ALGLIB uses f=-r due to the conditional positive definiteness requirement.
2100
+ * We will perform the conversion later.
2101
+ */
2102
+ __m256d v_costheta = _mm256_set1_pd(costheta);
2103
+ __m256d v_r2 = _mm256_set1_pd(r2);
2104
+ __m256d v_invr = _mm256_set1_pd(invr);
2105
+ __m256d v_powsinthetaj = _mm256_set_pd(powsintheta3, powsintheta2, sintheta, 1.0);
2106
+ __m256d v_powsintheta4 = _mm256_set1_pd(powsintheta4);
2107
+ __m256d v_expijphix = _mm256_set_pd(expiphi3.x, expiphi2.x, expiphi.x, 1.0);
2108
+ __m256d v_expijphiy = _mm256_set_pd(expiphi3.y, expiphi2.y, expiphi.y, 0.0);
2109
+ __m256d v_expi4phix = _mm256_set1_pd(expiphi4.x);
2110
+ __m256d v_expi4phiy = _mm256_set1_pd(expiphi4.y);
2111
+ *f = (double)(0);
2112
+ for(jj=0; jj<4; jj++)
2113
+ {
2114
+ __m256d pnm_cur = _mm256_setzero_pd(), pnm_prev = _mm256_setzero_pd(), pnm_new;
2115
+ __m256d v_powrminusj1 = _mm256_set1_pd(invr);
2116
+ for(n=0; n<jj*4; n++)
2117
+ v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
2118
+ for(n=jj*4; n<16; n++)
2119
+ {
2120
+ ae_int_t j0=jj*4;
2121
+
2122
+ pnm_new = _mm256_mul_pd(v_powsinthetaj, _mm256_load_pd(pmmcdiag+n*16+j0));
2123
+ pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(v_costheta,_mm256_mul_pd(pnm_cur,_mm256_load_pd(pnma+n*16+j0))));
2124
+ pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(pnm_prev,_mm256_load_pd(pnmb+n*16+j0)));
2125
+ pnm_prev = pnm_cur;
2126
+ pnm_cur = pnm_new;
2127
+
2128
+ __m256d v_tmp = _mm256_mul_pd(pnm_cur, _mm256_load_pd(ynma+n*16+j0));
2129
+ __m256d v_sphericalx = _mm256_mul_pd(v_tmp, v_expijphix);
2130
+ __m256d v_sphericaly = _mm256_mul_pd(v_tmp, v_expijphiy);
2131
+
2132
+ const double *p_rmodmn = tblrmodmn+n*64+j0;
2133
+ for(int k=0; k<ny; k++)
2134
+ {
2135
+ __m256d v_summnx = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(p_rmodmn+32)),_mm256_load_pd(p_rmodmn));
2136
+ __m256d v_summny = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(p_rmodmn+48)),_mm256_load_pd(p_rmodmn+16));
2137
+ __m256d v_z = _mm256_sub_pd(_mm256_mul_pd(v_sphericalx,v_summnx),_mm256_mul_pd(v_sphericaly,v_summny));
2138
+ v_f[k] = _mm256_add_pd(v_f[k], _mm256_mul_pd(v_powrminusj1, v_z));
2139
+ p_rmodmn += 1024;
2140
+ }
2141
+ v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
2142
+ }
2143
+ __m256d v_expijphix_new = _mm256_sub_pd(_mm256_mul_pd(v_expijphix,v_expi4phix),_mm256_mul_pd(v_expijphiy,v_expi4phiy));
2144
+ __m256d v_expijphiy_new = _mm256_add_pd(_mm256_mul_pd(v_expijphix,v_expi4phiy),_mm256_mul_pd(v_expijphiy,v_expi4phix));
2145
+ v_powsinthetaj = _mm256_mul_pd(v_powsinthetaj, v_powsintheta4);
2146
+ v_expijphix = v_expijphix_new;
2147
+ v_expijphiy = v_expijphiy_new;
2148
+ }
2149
+
2150
+ for(int t=0; t<ny; t++)
2151
+ {
2152
+ double ttt[4];
2153
+ _mm256_storeu_pd(ttt, v_f[t]);
2154
+ for(int k=0; k<4; k++)
2155
+ f[t] += ttt[k];
2156
+ }
2157
+
2158
+ double r4 = r2*r2;
2159
+ double r8 = r4*r4;
2160
+ double r16 = r8*r8;
2161
+ *invpowrpplus1 = 1/r16;
2162
+
2163
+ return ae_true;
2164
+ }
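
Both far-field kernels finish by storing each __m256d accumulator into a temporary array and summing it with a scalar loop. An equivalent in-register horizontal sum, shown here only as an alternative sketch and not as what the code above does, would be:

/* Illustrative helper (not ALGLIB code): horizontal sum of a __m256d. */
#include <immintrin.h>

static double hsum256_pd(__m256d v)
{
    __m128d lo  = _mm256_castpd256_pd128(v);
    __m128d hi  = _mm256_extractf128_pd(v, 1);
    __m128d s   = _mm_add_pd(lo, hi);            /* (v0+v2, v1+v3) */
    __m128d swp = _mm_unpackhi_pd(s, s);         /* (v1+v3, v1+v3) */
    return _mm_cvtsd_f64(_mm_add_sd(s, swp));    /* v0+v1+v2+v3    */
}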
2165
+
2166
+ /* ALGLIB_NO_FAST_KERNELS, _ALGLIB_HAS_AVX2_INTRINSICS */
2167
+ #endif
2168
+
2169
+
2170
+ }
2171
+