alglib4 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/README.md +47 -0
  3. data/ext/alglib/alglib.cpp +537 -0
  4. data/ext/alglib/alglib_array_converters.cpp +86 -0
  5. data/ext/alglib/alglib_array_converters.h +15 -0
  6. data/ext/alglib/alglib_utils.cpp +10 -0
  7. data/ext/alglib/alglib_utils.h +6 -0
  8. data/ext/alglib/alglibinternal.cpp +21749 -0
  9. data/ext/alglib/alglibinternal.h +2168 -0
  10. data/ext/alglib/alglibmisc.cpp +9106 -0
  11. data/ext/alglib/alglibmisc.h +2114 -0
  12. data/ext/alglib/ap.cpp +20094 -0
  13. data/ext/alglib/ap.h +7244 -0
  14. data/ext/alglib/dataanalysis.cpp +52588 -0
  15. data/ext/alglib/dataanalysis.h +10601 -0
  16. data/ext/alglib/diffequations.cpp +1342 -0
  17. data/ext/alglib/diffequations.h +282 -0
  18. data/ext/alglib/extconf.rb +5 -0
  19. data/ext/alglib/fasttransforms.cpp +4696 -0
  20. data/ext/alglib/fasttransforms.h +1018 -0
  21. data/ext/alglib/integration.cpp +4249 -0
  22. data/ext/alglib/integration.h +869 -0
  23. data/ext/alglib/interpolation.cpp +74502 -0
  24. data/ext/alglib/interpolation.h +12264 -0
  25. data/ext/alglib/kernels_avx2.cpp +2171 -0
  26. data/ext/alglib/kernels_avx2.h +201 -0
  27. data/ext/alglib/kernels_fma.cpp +1065 -0
  28. data/ext/alglib/kernels_fma.h +137 -0
  29. data/ext/alglib/kernels_sse2.cpp +735 -0
  30. data/ext/alglib/kernels_sse2.h +100 -0
  31. data/ext/alglib/linalg.cpp +65182 -0
  32. data/ext/alglib/linalg.h +9927 -0
  33. data/ext/alglib/optimization.cpp +135331 -0
  34. data/ext/alglib/optimization.h +19235 -0
  35. data/ext/alglib/solvers.cpp +20488 -0
  36. data/ext/alglib/solvers.h +4781 -0
  37. data/ext/alglib/specialfunctions.cpp +10672 -0
  38. data/ext/alglib/specialfunctions.h +2305 -0
  39. data/ext/alglib/statistics.cpp +19791 -0
  40. data/ext/alglib/statistics.h +1359 -0
  41. data/ext/alglib/stdafx.h +2 -0
  42. data/gpl2.txt +339 -0
  43. data/gpl3.txt +674 -0
  44. data/lib/alglib/version.rb +3 -0
  45. data/lib/alglib.rb +4 -0
  46. metadata +101 -0
data/ext/alglib/kernels_avx2.cpp
@@ -0,0 +1,2171 @@
1
+ /*************************************************************************
2
+ ALGLIB 4.04.0 (source code generated 2024-12-21)
3
+ Copyright (c) Sergey Bochkanov (ALGLIB project).
4
+
5
+ >>> SOURCE LICENSE >>>
6
+ This program is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation (www.fsf.org); either version 2 of the
9
+ License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ A copy of the GNU General Public License is available at
17
+ http://www.fsf.org/licensing/licenses
18
+ >>> END OF LICENSE >>>
19
+ *************************************************************************/
20
+ #ifdef _MSC_VER
21
+ #define _CRT_SECURE_NO_WARNINGS
22
+ #endif
23
+ #include "stdafx.h"
24
+
25
+ //
26
+ // Must be defined before we include kernel header
27
+ //
28
+ #define _ALGLIB_IMPL_DEFINES
29
+ #define _ALGLIB_INTEGRITY_CHECKS_ONCE
30
+
31
+ #include "kernels_avx2.h"
32
+
33
+ // disable some irrelevant warnings
34
+ #if (AE_COMPILER==AE_MSVC) && !defined(AE_ALL_WARNINGS)
35
+ #pragma warning(disable:4100)
36
+ #pragma warning(disable:4127)
37
+ #pragma warning(disable:4611)
38
+ #pragma warning(disable:4702)
39
+ #pragma warning(disable:4996)
40
+ #endif
41
+
42
+ namespace alglib_impl
43
+ {
44
+
45
+
46
+
47
+ #if !defined(ALGLIB_NO_FAST_KERNELS) && defined(_ALGLIB_HAS_AVX2_INTRINSICS)
48
+
49
+ double rdotv_avx2(const ae_int_t n,
50
+ /* Real */ const double* __restrict x,
51
+ /* Real */ const double* __restrict y,
52
+ const ae_state* __restrict _state)
53
+ {
54
+ ae_int_t i;
55
+
56
+ const ae_int_t avx2len = n>>2;
57
+ const ae_int_t fmaLen = (avx2len >> 2) << 2;
58
+ const __m256d* __restrict pX = (const __m256d*)(x);
59
+ const __m256d* __restrict pY = (const __m256d*)(y);
60
+ __m256d ans;
61
+ if(fmaLen >= 4) {
62
+ __m256d fmaUnroll[4];
63
+ fmaUnroll[0] = _mm256_mul_pd(pX[0], pY[0]);
64
+ fmaUnroll[1] = _mm256_mul_pd(pX[1], pY[1]);
65
+ fmaUnroll[2] = _mm256_mul_pd(pX[2], pY[2]);
66
+ fmaUnroll[3] = _mm256_mul_pd(pX[3], pY[3]);
67
+ for(i=4; i<fmaLen; i+=4) {
68
+ fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pY[i]), fmaUnroll[0]);
69
+ fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pY[i+1]), fmaUnroll[1]);
70
+ fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pY[i+2]), fmaUnroll[2]);
71
+ fmaUnroll[3] = _mm256_add_pd(_mm256_mul_pd(pX[i+3], pY[i+3]), fmaUnroll[3]);
72
+ }
73
+ switch(avx2len-fmaLen) {
74
+ case 3:
75
+ fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pY[i+2]), fmaUnroll[2]);
76
+ case 2:
77
+ fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pY[i+1]), fmaUnroll[1]);
78
+ case 1:
79
+ fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pY[i]), fmaUnroll[0]);
80
+ }
81
+ ans = _mm256_add_pd(
82
+ _mm256_add_pd(fmaUnroll[0], fmaUnroll[1]),
83
+ _mm256_add_pd(fmaUnroll[2], fmaUnroll[3]));
84
+ }
85
+ else {
86
+ ans = _mm256_setzero_pd();
87
+ switch(avx2len) {
88
+ case 3:
89
+ ans = _mm256_mul_pd(pX[2], pY[2]);
90
+ case 2:
91
+ ans = _mm256_add_pd(_mm256_mul_pd(pX[1], pY[1]), ans);
92
+ case 1:
93
+ ans = _mm256_add_pd(_mm256_mul_pd(pX[0], pY[0]), ans);
94
+ }
95
+ }
96
+ const __m128d s = _mm_add_pd(_mm256_extractf128_pd(ans, 0), _mm256_extractf128_pd(ans, 1));
97
+ const double *pComps = (const double*)&s;
98
+ double dot = pComps[0] + pComps[1];
99
+ const ae_int_t tail = avx2len<<2;
100
+ switch(n-tail) {
101
+ case 1: {
102
+ dot += x[tail]*y[tail];
103
+ break;
104
+ }
105
+ case 2: {
106
+ dot += x[tail+0]*y[tail+0];
107
+ dot += x[tail+1]*y[tail+1];
108
+ break;
109
+ }
110
+ case 3: {
111
+ dot += x[tail+0]*y[tail+0];
112
+ dot += x[tail+1]*y[tail+1];
113
+ dot += x[tail+2]*y[tail+2];
114
+ break;
115
+ }
116
+ }
117
+ return dot;
118
+ }
119
+
120
+ double rdotv2_avx2(const ae_int_t n,
121
+ /* Real */ const double* __restrict x,
122
+ const ae_state* __restrict _state)
123
+ {
124
+ ae_int_t i;
125
+
126
+ const ae_int_t avx2len = n>>2;
127
+ const ae_int_t fmaLen = (avx2len >> 2) << 2;
128
+ const __m256d* __restrict pX = (const __m256d*)(x);
129
+ __m256d ans;
130
+ if(fmaLen >= 4) {
131
+ //TODO: this can be further unrolled to 8 because AVX(2) provides 16 registers
132
+ __m256d fmaUnroll[4];
133
+ fmaUnroll[0] = _mm256_mul_pd(pX[0], pX[0]);
134
+ fmaUnroll[1] = _mm256_mul_pd(pX[1], pX[1]);
135
+ fmaUnroll[2] = _mm256_mul_pd(pX[2], pX[2]);
136
+ fmaUnroll[3] = _mm256_mul_pd(pX[3], pX[3]);
137
+ for(i=4; i<fmaLen; i+=4) {
138
+ fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pX[i]), fmaUnroll[0]);
139
+ fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pX[i+1]), fmaUnroll[1]);
140
+ fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pX[i+2]), fmaUnroll[2]);
141
+ fmaUnroll[3] = _mm256_add_pd(_mm256_mul_pd(pX[i+3], pX[i+3]), fmaUnroll[3]);
142
+ }
143
+ switch(avx2len-fmaLen) {
144
+ case 3:
145
+ fmaUnroll[2] = _mm256_add_pd(_mm256_mul_pd(pX[i+2], pX[i+2]), fmaUnroll[2]);
146
+ case 2:
147
+ fmaUnroll[1] = _mm256_add_pd(_mm256_mul_pd(pX[i+1], pX[i+1]), fmaUnroll[1]);
148
+ case 1:
149
+ fmaUnroll[0] = _mm256_add_pd(_mm256_mul_pd(pX[i], pX[i]), fmaUnroll[0]);
150
+ }
151
+ ans = _mm256_add_pd(
152
+ _mm256_add_pd(fmaUnroll[0], fmaUnroll[1]),
153
+ _mm256_add_pd(fmaUnroll[2], fmaUnroll[3]));
154
+ }
155
+ else {
156
+ ans = _mm256_setzero_pd();
157
+ switch(avx2len) {
158
+ case 3:
159
+ ans = _mm256_mul_pd(pX[2], pX[2]);
160
+ case 2:
161
+ ans = _mm256_add_pd(_mm256_mul_pd(pX[1], pX[1]), ans);
162
+ case 1:
163
+ ans = _mm256_add_pd(_mm256_mul_pd(pX[0], pX[0]), ans);
164
+ }
165
+ }
166
+ const __m128d s = _mm_add_pd(_mm256_extractf128_pd(ans, 0), _mm256_extractf128_pd(ans, 1));
167
+ const double *pComps = (const double*)&s;
168
+ double dot = pComps[0] + pComps[1];
169
+ const ae_int_t tail = avx2len<<2;
170
+ switch(n-tail) {
171
+ case 1: {
172
+ dot += x[tail]*x[tail];
173
+ break;
174
+ }
175
+ case 2: {
176
+ dot += x[tail+0]*x[tail+0];
177
+ dot += x[tail+1]*x[tail+1];
178
+ break;
179
+ }
180
+ case 3: {
181
+ dot += x[tail+0]*x[tail+0];
182
+ dot += x[tail+1]*x[tail+1];
183
+ dot += x[tail+2]*x[tail+2];
184
+ break;
185
+ }
186
+ }
187
+ return dot;
188
+ }
189
+
190
+ void rcopyv_avx2(ae_int_t n,
191
+ /* Real */ const double* __restrict x,
192
+ /* Real */ double* __restrict y,
193
+ ae_state* __restrict _state)
194
+ {
195
+ ae_int_t i;
196
+
197
+ const ae_int_t avx2len = n>>2;
198
+ const ae_int_t tail = avx2len<<2;
199
+ const __m256d* __restrict pSrc = (const __m256d*)(x);
200
+ __m256d* __restrict pDest = (__m256d*)(y);
201
+ for(i=0; i<avx2len; i++) {
202
+ pDest[i] = pSrc[i];
203
+ }
204
+ switch(n-tail) {
205
+ case 1:
206
+ *(double*)(pDest+i) = *(const double*)(pSrc+i);
207
+ break;
208
+ case 2:
209
+ *(__m128d*)(pDest+i) = *(const __m128d*)(pSrc+i);
210
+ break;
211
+ case 3:
212
+ *(__m128d*)(pDest+i) = *(const __m128d*)(pSrc+i);
213
+ y[tail+2] = x[tail+2];
214
+ break;
215
+ }
216
+ }
217
+
218
+ void rcopymulv_avx2(const ae_int_t n,
219
+ const double v,
220
+ /* Real */ const double* __restrict x,
221
+ /* Real */ double* __restrict y,
222
+ const ae_state* __restrict _state)
223
+ {
224
+ ae_int_t i;
225
+
226
+ const ae_int_t avx2len = n>>2;
227
+ const ae_int_t tail = avx2len<<2;
228
+ const __m256d* __restrict pSrc = (const __m256d*)(x);
229
+ __m256d* __restrict pDest = (__m256d*)(y);
230
+ const __m256d avx2v = _mm256_set1_pd(v);
231
+ for(i=0; i<avx2len; i++) {
232
+ pDest[i] = _mm256_mul_pd(avx2v, pSrc[i]);
233
+ }
234
+ switch(n-tail) {
235
+ case 1:
236
+ *(double*)(pDest+i) = v * (*(const double*)(pSrc+i));
237
+ break;
238
+ case 2:
239
+ *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pSrc+i));
240
+ break;
241
+ case 3:
242
+ *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pSrc+i));
243
+ y[tail+2] = v*x[tail+2];
244
+ break;
245
+ }
246
+ }
247
+
248
+ void icopyv_avx2(const ae_int_t n, const ae_int_t* __restrict x,
249
+ ae_int_t* __restrict y, ae_state* __restrict _state)
250
+ {
251
+ const ae_int_t tail = (n*sizeof(ae_int_t)) & 31;
252
+ const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
253
+ __m256i *__restrict pDest = (__m256i*)y;
254
+ const __m256i* __restrict pSrc = (const __m256i*)x;
255
+ const ae_int_t nVec = even>>5;
256
+ ae_int_t i;
257
+ for(i=0; i<nVec; i++) {
258
+ pDest[i] = pSrc[i];
259
+ }
260
+ i = even/sizeof(ae_int_t);
261
+ if(tail & 16) {
262
+ *(__m128i*)(y+i) = *(const __m128i*)(x+i);
263
+ i += 16/sizeof(ae_int_t);
264
+ }
265
+ if(tail & 8) {
266
+ *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
267
+ i += 8/sizeof(ae_int_t);
268
+ }
269
+ if(tail & 4) {
270
+ *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
271
+ }
272
+ }
273
+
274
+ void bcopyv_avx2(const ae_int_t n, const ae_bool* __restrict x,
275
+ ae_bool* __restrict y, ae_state* __restrict _state)
276
+ {
277
+ const ae_int_t tail = n & 31;
278
+ const ae_int_t even = n-tail;
279
+ __m256i *__restrict pDest = (__m256i*)y;
280
+ const __m256i* __restrict pSrc = (const __m256i*)x;
281
+ const ae_int_t nVec = even>>5;
282
+ ae_int_t i;
283
+ for(i=0; i<nVec; i++) {
284
+ pDest[i] = pSrc[i];
285
+ }
286
+ i = even;
287
+ if(tail & 16) {
288
+ *(__m128i*)(y+i) = *(const __m128i*)(x+i);
289
+ i += 16;
290
+ }
291
+ if(tail & 8) {
292
+ *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
293
+ i += 8;
294
+ }
295
+ if(tail & 4) {
296
+ *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
297
+ i += 4;
298
+ }
299
+ if(tail & 2) {
300
+ *(y+i+0) = *(x+i+0);
301
+ *(y+i+1) = *(x+i+1);
302
+ i += 2;
303
+ }
304
+ if(tail & 1) {
305
+ *(y+i) = *(x+i);
306
+ }
307
+ }
308
+
309
+ void rsetv_avx2(const ae_int_t n,
310
+ const double v,
311
+ /* Real */ double* __restrict x,
312
+ const ae_state* __restrict _state)
313
+ {
314
+ ae_int_t i;
315
+
316
+ const ae_int_t avx2len = n>>2;
317
+ __m256d* __restrict pDest = (__m256d*)(x);
318
+ const __m256d avx2v = _mm256_set1_pd(v);
319
+ for(i=0; i<avx2len; i++) {
320
+ pDest[i] = avx2v;
321
+ }
322
+ const ae_int_t tail = avx2len<<2;
323
+ switch(n-tail) {
324
+ case 1:
325
+ *(double*)(pDest+i) = v;
326
+ break;
327
+ case 2:
328
+ *(__m128d*)(pDest+i) = _mm256_extractf128_pd(avx2v, 0);
329
+ break;
330
+ case 3:
331
+ *(__m128d*)(pDest+i) = _mm256_extractf128_pd(avx2v, 0);
332
+ x[tail+2] = v;
333
+ break;
334
+ }
335
+ }
336
+
337
+ void rsetvx_avx2(const ae_int_t n, const double v, double* __restrict x,
338
+ const ae_state* __restrict _state)
339
+ {
340
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
341
+ if( n<=4 )
342
+ {
343
+ ae_int_t j;
344
+ for(j=0; j<=n-1; j++)
345
+ x[j] = v;
346
+ return;
347
+ }
348
+ switch(unal)
349
+ {
350
+ case 0:
351
+ rsetv_avx2(n, v, x, _state);
352
+ return;
353
+ case 8:
354
+ x[2] = v;
355
+ case 16:
356
+ x[1] = v;
357
+ case 24:
358
+ {
359
+ x[0] = v;
360
+ const ptrdiff_t nDone = 4-(unal>>3);
361
+ rsetv_avx2(n-nDone, v, x+nDone, _state);
362
+ return;
363
+ }
364
+ }
365
+ }
366
+
367
+ void isetv_avx2(const ae_int_t n, const ae_int_t v,
368
+ ae_int_t* __restrict x, ae_state* __restrict _state)
369
+ {
370
+ const ae_int_t tail = (n*sizeof(ae_int_t)) & 31;
371
+ const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
372
+ __m256i *__restrict pDest = (__m256i*)x;
373
+ const __m256i avx2v = ((sizeof(v) == 4) ? _mm256_set1_epi32((ae_int32_t)v) : _mm256_set1_epi64x(v));
374
+ const ae_int_t nVec = even>>5;
375
+ ae_int_t i;
376
+ for(i=0; i<nVec; i++) {
377
+ pDest[i] = avx2v;
378
+ }
379
+ memmove(pDest+i, &avx2v, tail);
380
+ }
381
+
382
+ void bsetv_avx2(const ae_int_t n, const ae_bool v, ae_bool* __restrict x,
383
+ ae_state* __restrict _state)
384
+ {
385
+ const ae_int_t tail = n & 31;
386
+ const ae_int_t even = n-tail;
387
+ __m256i *__restrict pDest = (__m256i*)x;
388
+ const __m256i avx2v = _mm256_set1_epi8(v);
389
+ const ae_int_t nVec = even>>5;
390
+ ae_int_t i;
391
+ for(i=0; i<nVec; i++) {
392
+ pDest[i] = avx2v;
393
+ }
394
+ /* _mm256_extracti128_si256() has too high a latency on the latest processors (Skylake+) */
395
+ memset(x+even, v, tail);
396
+ }
397
+
398
+ void rmulv_avx2(const ae_int_t n, const double v, double* __restrict x,
399
+ const ae_state* __restrict _state)
400
+ {
401
+ ae_int_t i;
402
+
403
+ const ae_int_t avx2len = n>>2;
404
+ __m256d* __restrict pDest = (__m256d*)(x);
405
+ const __m256d avx2v = _mm256_set1_pd(v);
406
+ for(i=0; i<avx2len; i++) {
407
+ pDest[i] = _mm256_mul_pd(avx2v, pDest[i]);
408
+ }
409
+ const ae_int_t tail = avx2len<<2;
410
+ switch(n-tail) {
411
+ case 1:
412
+ *(double*)(pDest+i) = v * (*(const double*)(pDest+i));
413
+ break;
414
+ case 2:
415
+ *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pDest+i));
416
+ break;
417
+ case 3:
418
+ *(__m128d*)(pDest+i) = _mm_mul_pd(_mm256_extractf128_pd(avx2v, 0), *(const __m128d*)(pDest+i));
419
+ x[tail+2] *= v;
420
+ break;
421
+ }
422
+ }
423
+
424
+ void rsqrtv_avx2(const ae_int_t n, double* __restrict x, const ae_state* __restrict _state)
425
+ {
426
+ ae_int_t i;
427
+
428
+ const ae_int_t avx2len = n>>2;
429
+ const ae_int_t tail = avx2len<<2;
430
+ __m256d* __restrict pDest = (__m256d*)(x);
431
+ for(i=0; i<avx2len; i++)
432
+ pDest[i] = _mm256_sqrt_pd(pDest[i]);
433
+ for(i=tail; i<n; i++)
434
+ x[i] = sqrt(x[i]);
435
+ }
436
+
437
+ void rmulvx_avx2(const ae_int_t n, const double v, double* __restrict x,
438
+ const ae_state* __restrict _state)
439
+ {
440
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
441
+ if( n<=4 )
442
+ {
443
+ ae_int_t i;
444
+ for(i=0; i<=n-1; i++)
445
+ x[i] *= v;
446
+ return;
447
+ }
448
+ switch(unal) {
449
+ case 0:
450
+ rmulv_avx2(n, v, x, _state);
451
+ return;
452
+ case 8:
453
+ x[2] = v*x[2];
454
+ case 16:
455
+ x[1] = v*x[1];
456
+ case 24: {
457
+ x[0] = v*x[0];
458
+ const ptrdiff_t nDone = 4-(unal>>3);
459
+ rmulv_avx2(n-nDone, v, x+nDone, _state);
460
+ return;
461
+ }
462
+ }
463
+ }
464
+
465
+ void raddv_avx2(const ae_int_t n,
466
+ const double alpha,
467
+ /* Real */ const double* __restrict y,
468
+ /* Real */ double* __restrict x,
469
+ const ae_state* __restrict _state)
470
+ {
471
+ ae_int_t i;
472
+
473
+ const ae_int_t avx2len = n>>2;
474
+ const __m256d* __restrict pSrc = (const __m256d*)(y);
475
+ __m256d* __restrict pDest = (__m256d*)(x);
476
+ const __m256d avx2alpha = _mm256_set1_pd(alpha);
477
+ for(i=0; i<avx2len; i++) {
478
+ pDest[i] = _mm256_add_pd(_mm256_mul_pd(avx2alpha, pSrc[i]), pDest[i]);
479
+ }
480
+ const ae_int_t tail = avx2len<<2;
481
+ switch(n-tail) {
482
+ case 1:
483
+ *(double*)(pDest+i) = alpha * (*(const double*)(pSrc+i))
484
+ + (*(const double*)(pDest+i));
485
+ break;
486
+ case 2:
487
+ *(__m128d*)(pDest+i) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0), *(const __m128d*)(pSrc+i)),*(const __m128d*)(pDest+i));
488
+ break;
489
+ case 3:
490
+ *(__m128d*)(pDest+i) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0), *(const __m128d*)(pSrc+i)),*(const __m128d*)(pDest+i));
491
+ x[tail+2] += alpha*y[tail+2];
492
+ break;
493
+ }
494
+ }
495
+
496
+ void raddvx_avx_xaligned(const ae_int_t n, const double alpha, const double* __restrict y, double* __restrict x, ae_state *_state)
497
+ {
498
+ ae_int_t i;
499
+ const ae_int_t vecLen = (n>>2)<<2;
500
+ const __m256d avx2alpha = _mm256_set1_pd(alpha);
501
+ __m256d* __restrict pDest = (__m256d*)x;
502
+ for(i=0; i<vecLen; i+=4)
503
+ {
504
+ const ae_int_t iDest = i>>2;
505
+ pDest[iDest] = _mm256_add_pd(_mm256_mul_pd(avx2alpha, _mm256_loadu_pd(y+i)), pDest[iDest]);
506
+ }
507
+ switch(n-vecLen) {
508
+ case 1:
509
+ x[i] += alpha*y[i];
510
+ break;
511
+ case 2: {
512
+ const ae_int_t iDest = i>>2;
513
+ *(__m128d*)(pDest+iDest) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0),_mm_loadu_pd(y+i)),*(const __m128d*)(pDest+iDest));
514
+ break;
515
+ }
516
+ case 3:
517
+ {
518
+ const ae_int_t iDest = i>>2;
519
+ *(__m128d*)(pDest+iDest) = _mm_add_pd(_mm_mul_pd(_mm256_extractf128_pd(avx2alpha, 0),_mm_loadu_pd(y+i)),*(const __m128d*)(pDest+iDest));
520
+ x[i+2] += alpha*y[i+2];
521
+ break;
522
+ }
523
+ }
524
+ }
525
+
526
+ void raddvx_avx2(const ae_int_t n, const double alpha, const double* __restrict y,
527
+ double* __restrict x, ae_state *_state)
528
+ {
529
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
530
+ if( n<=4 )
531
+ {
532
+ ae_int_t i;
533
+ for(i=0; i<=n-1; i++)
534
+ x[i] += alpha*y[i];
535
+ return;
536
+ }
537
+ switch(unal)
538
+ {
539
+ case 0:
540
+ raddvx_avx_xaligned(n, alpha, y, x, _state);
541
+ return;
542
+ case 8:
543
+ x[2] += alpha*y[2];
544
+ case 16:
545
+ x[1] += alpha*y[1];
546
+ case 24:
547
+ {
548
+ x[0] += alpha*y[0];
549
+ const ptrdiff_t nDone = 4-(unal>>3);
550
+ raddvx_avx_xaligned(n-nDone, alpha, y+nDone, x+nDone, _state);
551
+ return;
552
+ }
553
+ }
554
+ }
555
+
556
+ void rmergemulv_avx2(ae_int_t n,
557
+ /* Real */ const double* __restrict y,
558
+ /* Real */ double* __restrict x,
559
+ const ae_state* __restrict _state)
560
+ {
561
+ ae_int_t i;
562
+
563
+ const ae_int_t avx2len = n>>2;
564
+ const __m256d* __restrict pSrc = (const __m256d*)(y);
565
+ __m256d* __restrict pDest = (__m256d*)(x);
566
+ for(i=0; i<avx2len; i++) {
567
+ pDest[i] = _mm256_mul_pd(pSrc[i], pDest[i]);
568
+ }
569
+ const ae_int_t tail = avx2len<<2;
570
+ switch(n-tail) {
571
+ case 1:
572
+ *(double*)(pDest+i) = *(const double*)(pSrc+i)
573
+ * (*(const double*)(pDest+i));
574
+ break;
575
+ case 2:
576
+ *(__m128d*)(pDest+i) = _mm_mul_pd(
577
+ *(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
578
+ break;
579
+ case 3: {
580
+ *(__m128d*)(pDest+i) = _mm_mul_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
581
+ ((double*)(pDest+i))[2] *= ((const double*)(pSrc+i))[2];
582
+ break;
583
+ }
584
+ }
585
+ }
586
+
587
+ void rmergedivv_avx2(ae_int_t n,
588
+ /* Real */ const double* __restrict y,
589
+ /* Real */ double* __restrict x,
590
+ const ae_state* __restrict _state)
591
+ {
592
+ ae_int_t i;
593
+
594
+ const ae_int_t avx2len = n>>2;
595
+ const __m256d* __restrict pSrc = (const __m256d*)(y);
596
+ __m256d* __restrict pDest = (__m256d*)(x);
597
+ for(i=0; i<avx2len; i++) {
598
+ pDest[i] = _mm256_div_pd(pDest[i], pSrc[i]);
599
+ }
600
+ const ae_int_t tail = avx2len<<2;
601
+ switch(n-tail) {
602
+ case 1:
603
+ *(double*)(pDest+i) = (*(const double*)(pDest+i)) / (*(const double*)(pSrc+i));
604
+ break;
605
+ case 2:
606
+ *(__m128d*)(pDest+i) = _mm_div_pd(*(const __m128d*)(pDest+i), *(const __m128d*)(pSrc+i));
607
+ break;
608
+ case 3: {
609
+ *(__m128d*)(pDest+i) = _mm_div_pd(*(const __m128d*)(pDest+i), *(const __m128d*)(pSrc+i));
610
+ ((double*)(pDest+i))[2] /= ((const double*)(pSrc+i))[2];
611
+ break;
612
+ }
613
+ }
614
+ }
615
+
616
+ void rmergemaxv_avx2(ae_int_t n,
617
+ /* Real */ const double* __restrict y,
618
+ /* Real */ double* __restrict x,
619
+ ae_state* __restrict _state)
620
+ {
621
+ ae_int_t i;
622
+
623
+ const ae_int_t avx2len = n>>2;
624
+ const __m256d* __restrict pSrc = (const __m256d*)(y);
625
+ __m256d* __restrict pDest = (__m256d*)(x);
626
+ for(i=0; i<avx2len; i++) {
627
+ pDest[i] = _mm256_max_pd(pSrc[i], pDest[i]);
628
+ }
629
+ const ae_int_t tail = avx2len<<2;
630
+ switch(n-tail) {
631
+ case 1:
632
+ *(double*)(pDest+i) = *(const double*)(pSrc+i)>*(const double*)(pDest+i) ? *(const double*)(pSrc+i) : *(const double*)(pDest+i);
633
+ break;
634
+ case 2:
635
+ *(__m128d*)(pDest+i) = _mm_max_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
636
+ break;
637
+ case 3:
638
+ {
639
+ double s2 = ((const double*)(pSrc+i))[2];
640
+ double *d2 = ((double*)(pDest+i))+2;
641
+ *(__m128d*)(pDest+i) = _mm_max_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
642
+ *d2 = s2>*d2 ? s2 : *d2;
643
+ break;
644
+ }
645
+ }
646
+ }
647
+
648
+ void rmergeminv_avx2(ae_int_t n,
649
+ /* Real */ const double* __restrict y,
650
+ /* Real */ double* __restrict x,
651
+ ae_state* __restrict _state)
652
+ {
653
+ ae_int_t i;
654
+
655
+ const ae_int_t avx2len = n>>2;
656
+ const __m256d* __restrict pSrc = (const __m256d*)(y);
657
+ __m256d* __restrict pDest = (__m256d*)(x);
658
+ for(i=0; i<avx2len; i++) {
659
+ pDest[i] = _mm256_min_pd(pSrc[i], pDest[i]);
660
+ }
661
+ const ae_int_t tail = avx2len<<2;
662
+ switch(n-tail) {
663
+ case 1:
664
+ *(double*)(pDest+i) = ae_minreal(*(const double*)(pSrc+i),
665
+ *(const double*)(pDest+i), _state);
666
+ break;
667
+ case 2:
668
+ *(__m128d*)(pDest+i) = _mm_min_pd(
669
+ *(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
670
+ break;
671
+ case 3: {
672
+ double s2 = ((const double*)(pSrc+i))[2];
673
+ double *d2 = ((double*)(pDest+i))+2;
674
+ *(__m128d*)(pDest+i) = _mm_min_pd(*(const __m128d*)(pSrc+i), *(const __m128d*)(pDest+i));
675
+ *d2 = s2<*d2 ? s2 : *d2;
676
+ break;
677
+ }
678
+ }
679
+ }
680
+
681
+ double rmaxv_avx2(ae_int_t n, /* Real */ const double* __restrict x, ae_state* __restrict _state)
682
+ {
683
+ ae_int_t i;
684
+ const ae_int_t avx2len = n>>2;
685
+ const ae_int_t tail = avx2len<<2;
686
+ const __m256d* __restrict pSrc = (const __m256d*)(x);
687
+ if( n<=4 )
688
+ {
689
+ double result;
690
+ if(n == 0)
691
+ return 0.0;
692
+ result = x[0];
693
+ for(i=1; i<=n-1; i++)
694
+ {
695
+ double v = x[i];
696
+ if( v>result )
697
+ result = v;
698
+ }
699
+ return result;
700
+ }
701
+ __m256d curMax = pSrc[0];
702
+ for(i=1; i<avx2len; i++) {
703
+ curMax = _mm256_max_pd(curMax, pSrc[i]);
704
+ }
705
+ const __m128d sseMax = _mm_max_pd(_mm256_extractf128_pd(curMax, 0), _mm256_extractf128_pd(curMax, 1));
706
+ const double *pComps = (const double *)&sseMax;
707
+ double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
708
+ const double *p_tail = (const double*)(pSrc+i);
709
+ switch(n-tail)
710
+ {
711
+ case 1:
712
+ {
713
+ dMax = p_tail[0]>dMax ? p_tail[0] : dMax;
714
+ break;
715
+ }
716
+ case 2: {
717
+ dMax = p_tail[0]>dMax ? p_tail[0] : dMax;
718
+ dMax = p_tail[1]>dMax ? p_tail[1] : dMax;
719
+ break;
720
+ }
721
+ case 3: {
722
+ dMax = p_tail[0]>dMax ? p_tail[0] : dMax;
723
+ dMax = p_tail[1]>dMax ? p_tail[1] : dMax;
724
+ dMax = p_tail[2]>dMax ? p_tail[2] : dMax;
725
+ break;
726
+ }
727
+ }
728
+ return dMax;
729
+ }
730
+
731
+ double rmaxabsv_avx2(ae_int_t n, /* Real */ const double* __restrict x, ae_state* __restrict _state)
732
+ {
733
+ const __m256d signMask = _mm256_set1_pd(-0.); // -0. = 1 << 63
734
+ const ae_int_t avx2len = n>>2;
735
+ const __m256d* __restrict pSrc = (const __m256d*)(x);
736
+ if( n<=4 )
737
+ {
738
+ double result;
739
+ ae_int_t i;
740
+ result = 0;
741
+ for(i=0; i<=n-1; i++)
742
+ {
743
+ double v = fabs(x[i]);
744
+ if( v>result )
745
+ result = v;
746
+ }
747
+ return result;
748
+ }
749
+ __m256d curMax = _mm256_andnot_pd(signMask, pSrc[0]); // abs
750
+ ae_int_t i;
751
+ for(i=1; i<avx2len; i++) {
752
+ curMax = _mm256_max_pd(curMax, _mm256_andnot_pd(signMask, pSrc[i])); // abs
753
+ }
754
+ const ae_int_t tail = avx2len<<2;
755
+ const __m128d sseMax = _mm_max_pd(_mm256_extractf128_pd(curMax, 0), _mm256_extractf128_pd(curMax, 1));
756
+ const double *p_tail = (const double*)(pSrc+i);
757
+ const double *pComps = (const double *)&sseMax;
758
+ double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
759
+ switch(n-tail)
760
+ {
761
+ case 1:
762
+ {
763
+ double a0 = fabs(p_tail[0]);
764
+ dMax = a0>dMax ? a0 : dMax;
765
+ break;
766
+ }
767
+ case 2:
768
+ {
769
+ double a0 = fabs(p_tail[0]);
770
+ double a1 = fabs(p_tail[1]);
771
+ dMax = a0>dMax ? a0 : dMax;
772
+ dMax = a1>dMax ? a1 : dMax;
773
+ break;
774
+ }
775
+ case 3:
776
+ {
777
+ double a0 = fabs(p_tail[0]);
778
+ double a1 = fabs(p_tail[1]);
779
+ double a2 = fabs(p_tail[2]);
780
+ dMax = a0>dMax ? a0 : dMax;
781
+ dMax = a1>dMax ? a1 : dMax;
782
+ dMax = a2>dMax ? a2 : dMax;
783
+ break;
784
+ }
785
+ }
786
+ return dMax;
787
+ }
788
+
789
+ static void rcopyvx_avx2_xaligned(const ae_int_t n, const double* __restrict x,
790
+ double* __restrict y, ae_state *_state)
791
+ {
792
+ ae_int_t i;
793
+ const ae_int_t vecLen = (n>>2)<<2;
794
+ const __m256d* __restrict pSrc = (const __m256d*)x;
795
+ for(i=0; i<vecLen; i+=4) {
796
+ const ae_int_t iSrc = i>>2;
797
+ _mm256_storeu_pd(y+i, pSrc[iSrc]);
798
+ }
799
+ switch(n-vecLen) {
800
+ case 1:
801
+ y[i] = x[i];
802
+ break;
803
+ case 2: {
804
+ const ae_int_t iSrc = i>>2;
805
+ _mm_storeu_pd(y+i, *(const __m128d*)(pSrc+iSrc));
806
+ break;
807
+ }
808
+ case 3: {
809
+ const ae_int_t iSrc = i>>2;
810
+ const __m256d t = pSrc[iSrc];
811
+ _mm_storeu_pd(y+i, _mm256_extractf128_pd(t, 0));
812
+ y[i+2] = *(((const double*)&t)+2);
813
+ break;
814
+ }
815
+ }
816
+ }
817
+
818
+ void rcopyvx_avx2(const ae_int_t n, const double* __restrict x,
819
+ double* __restrict y, ae_state *_state)
820
+ {
821
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
822
+ if( n<=4 )
823
+ {
824
+ ae_int_t j;
825
+ for(j=0; j<n; j++)
826
+ y[j] = x[j];
827
+ return;
828
+ }
829
+ switch(unal) {
830
+ case 0:
831
+ rcopyvx_avx2_xaligned(n, x, y, _state);
832
+ return;
833
+ case 8:
834
+ y[2] = x[2];
835
+ case 16:
836
+ y[1] = x[1];
837
+ case 24: {
838
+ y[0] = x[0];
839
+ const ptrdiff_t nDone = 4-(unal>>3);
840
+ rcopyvx_avx2_xaligned(n-nDone, x+nDone, y+nDone, _state);
841
+ return;
842
+ }
843
+ }
844
+ }
845
+
846
+ static void icopyvx_avx2_xaligned(const ae_int_t n, const ae_int_t* __restrict x,
847
+ ae_int_t* __restrict y, ae_state* __restrict _state)
848
+ {
849
+ const ae_int_t tail = (n*sizeof(ae_int_t)) & 31;
850
+ const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
851
+ const __m256i* __restrict pSrc = (const __m256i*)x;
852
+ const ae_int_t nVec = even>>5;
853
+ const ae_int_t shift_by = 3-sizeof(ae_int_t)/8;
854
+ ae_int_t i;
855
+ for(i=0; i<nVec; i++) {
856
+ const ae_int_t j = i<<shift_by;
857
+ _mm256_storeu_si256((__m256i*)(y+j), pSrc[i]);
858
+ }
859
+ i = even/sizeof(ae_int_t);
860
+ if(tail & 16) {
861
+ _mm_storeu_si128((__m128i*)(y+i), *(const __m128i*)(x+i));
862
+ i += 16/sizeof(ae_int_t);
863
+ }
864
+ if(tail & 8) {
865
+ *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
866
+ i += 8/sizeof(ae_int_t);
867
+ }
868
+ if(tail & 4) {
869
+ *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
870
+ }
871
+ }
872
+
873
+ void icopyvx_avx2(const ae_int_t n, const ae_int_t* __restrict x,
874
+ ae_int_t* __restrict y, ae_state* __restrict _state)
875
+ {
876
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
877
+ if( n<=8 )
878
+ {
879
+ ae_int_t j;
880
+ for(j=0; j<=n-1; j++)
881
+ y[j] = x[j];
882
+ return;
883
+ }
884
+ if(unal == 0)
885
+ {
886
+ icopyvx_avx2_xaligned(n, x, y, _state);
887
+ return;
888
+ }
889
+ const ae_int_t offset = 32-unal;
890
+ memmove(y, x, offset);
891
+ const ae_int_t nDone = offset / sizeof(ae_int_t);
892
+ icopyvx_avx2_xaligned(n-nDone, x+nDone, y+nDone, _state);
893
+ }
894
+
895
+ void rgemv_straight_avx2(const ae_int_t m, const ae_int_t n,
896
+ const double alpha, const ae_matrix* __restrict a,
897
+ const double* __restrict x, double* __restrict y, ae_state* _state)
898
+ {
899
+ ae_int_t i;
900
+ ae_int_t j;
901
+ const __m256d* __restrict pX = (const __m256d*)x;
902
+ const ae_int_t nVec = n >> 2;
903
+ const ae_int_t nUnroll = nVec >> 3;
904
+ __m256d sum = _mm256_setzero_pd();
905
+ for(i=0; i<m; i++) {
906
+ const __m256d* __restrict pRow = (const __m256d*)a->ptr.pp_double[i];
907
+ if(nUnroll >= 1) {
908
+ __m256d u0 = _mm256_mul_pd(pRow[0], pX[0]);
909
+ __m256d u1 = _mm256_mul_pd(pRow[1], pX[1]);
910
+ __m256d u2 = _mm256_mul_pd(pRow[2], pX[2]);
911
+ __m256d u3 = _mm256_mul_pd(pRow[3], pX[3]);
912
+ __m256d u4 = _mm256_mul_pd(pRow[4], pX[4]);
913
+ __m256d u5 = _mm256_mul_pd(pRow[5], pX[5]);
914
+ __m256d u6 = _mm256_mul_pd(pRow[6], pX[6]);
915
+ __m256d u7 = _mm256_mul_pd(pRow[7], pX[7]);
916
+ for(j=1; j<nUnroll; j++) {
917
+ const ae_int_t at = j<<3;
918
+ u0 = _mm256_add_pd(u0, _mm256_mul_pd(pRow[at], pX[at]));
919
+ u1 = _mm256_add_pd(u1, _mm256_mul_pd(pRow[at+1], pX[at+1]));
920
+ u2 = _mm256_add_pd(u2, _mm256_mul_pd(pRow[at+2], pX[at+2]));
921
+ u3 = _mm256_add_pd(u3, _mm256_mul_pd(pRow[at+3], pX[at+3]));
922
+ u4 = _mm256_add_pd(u4, _mm256_mul_pd(pRow[at+4], pX[at+4]));
923
+ u5 = _mm256_add_pd(u5, _mm256_mul_pd(pRow[at+5], pX[at+5]));
924
+ u6 = _mm256_add_pd(u6, _mm256_mul_pd(pRow[at+6], pX[at+6]));
925
+ u7 = _mm256_add_pd(u7, _mm256_mul_pd(pRow[at+7], pX[at+7]));
926
+ }
927
+ const ae_int_t at = j<<3;
928
+ switch(nVec-at) {
929
+ case 7:
930
+ u6 = _mm256_add_pd(_mm256_mul_pd(pX[at+6], pRow[at+6]), u6);
931
+ case 6:
932
+ u5 = _mm256_add_pd(_mm256_mul_pd(pX[at+5], pRow[at+5]), u5);
933
+ case 5:
934
+ u4 = _mm256_add_pd(_mm256_mul_pd(pX[at+4], pRow[at+4]), u4);
935
+ case 4:
936
+ u3 = _mm256_add_pd(_mm256_mul_pd(pX[at+3], pRow[at+3]), u3);
937
+ case 3:
938
+ u2 = _mm256_add_pd(_mm256_mul_pd(pX[at+2], pRow[at+2]), u2);
939
+ case 2:
940
+ u1 = _mm256_add_pd(_mm256_mul_pd(pX[at+1], pRow[at+1]), u1);
941
+ case 1:
942
+ u0 = _mm256_add_pd(_mm256_mul_pd(pX[at+0], pRow[at+0]), u0);
943
+ }
944
+ sum = _mm256_add_pd(
945
+ _mm256_add_pd(_mm256_add_pd(u0, u1), _mm256_add_pd(u2, u3)),
946
+ _mm256_add_pd(_mm256_add_pd(u4, u5), _mm256_add_pd(u6, u7)));
947
+ }
948
+ else {
949
+ switch(nVec) {
950
+ case 0:
951
+ sum = _mm256_setzero_pd();
952
+ break;
953
+ case 1:
954
+ sum = _mm256_mul_pd(pX[0], pRow[0]);
955
+ break;
956
+ case 2:
957
+ sum = _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1]));
958
+ break;
959
+ case 3:
960
+ sum = _mm256_add_pd(
961
+ _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
962
+ _mm256_mul_pd(pX[2], pRow[2]));
963
+ break;
964
+ case 4:
965
+ sum = _mm256_add_pd(
966
+ _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
967
+ _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3])));
968
+ break;
969
+ case 5:
970
+ sum = _mm256_add_pd(
971
+ _mm256_add_pd(
972
+ _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
973
+ _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3]))),
974
+ _mm256_mul_pd(pX[4], pRow[4]));
975
+ break;
976
+ case 6:
977
+ sum = _mm256_add_pd(
978
+ _mm256_add_pd(
979
+ _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
980
+ _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3]))),
981
+ _mm256_add_pd(_mm256_mul_pd(pX[4], pRow[4]), _mm256_mul_pd(pX[5], pRow[5])));
982
+ break;
983
+ case 7:
984
+ sum = _mm256_add_pd(
985
+ _mm256_add_pd(
986
+ _mm256_add_pd(_mm256_mul_pd(pX[0], pRow[0]), _mm256_mul_pd(pX[1], pRow[1])),
987
+ _mm256_add_pd(_mm256_mul_pd(pX[2], pRow[2]), _mm256_mul_pd(pX[3], pRow[3]))),
988
+ _mm256_add_pd(
989
+ _mm256_add_pd(_mm256_mul_pd(pX[4], pRow[4]), _mm256_mul_pd(pX[5], pRow[5])),
990
+ _mm256_mul_pd(pX[6], pRow[6])));
991
+ break;
992
+ }
993
+ }
994
+ const __m128d t = _mm_add_pd(_mm256_extractf128_pd(sum, 0), _mm256_extractf128_pd(sum, 1));
995
+ const double* pComps = (const double*)&t;
996
+ double ans = pComps[0] + pComps[1];
997
+ const ae_int_t tail = nVec<<2;
998
+ for(j=tail; j<n; j++) {
999
+ ans += a->ptr.pp_double[i][j] * x[j];
1000
+ }
1001
+ y[i] += alpha*ans;
1002
+ }
1003
+ }
1004
+
1005
+ void rgemv_transposed_avx2(const ae_int_t m, const ae_int_t n,
1006
+ const double alpha, const ae_matrix* __restrict a,
1007
+ const double* __restrict x, double* __restrict y, ae_state* _state)
1008
+ {
1009
+ ae_int_t i;
1010
+ ae_int_t j;
1011
+ __m256d* __restrict pY = (__m256d*)y;
1012
+ const ae_int_t nVec = m >> 2;
1013
+
1014
+ for(i=0; i<=n-1; i++)
1015
+ {
1016
+ const __m256d* __restrict pRow = (const __m256d*)a->ptr.pp_double[i];
1017
+ const double v = alpha*x[i];
1018
+ const __m256d vV = _mm256_set1_pd(v);
1019
+ for(j=0; j<nVec; j++)
1020
+ {
1021
+ pY[j] = _mm256_add_pd(_mm256_mul_pd(vV, pRow[j]), pY[j]);
1022
+ }
1023
+ const ae_int_t tail = nVec<<2;
1024
+ for(j=tail; j<m; j++) {
1025
+ y[j] += v*a->ptr.pp_double[i][j];
1026
+ }
1027
+ }
1028
+ }
1029
+
1030
+ void rgemvx_straight_avx2_xaligned(const ae_int_t m, const ae_int_t n,
1031
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
1032
+ const ae_int_t ja, const double* __restrict x,
1033
+ double* __restrict y, ae_state* _state)
1034
+ {
1035
+ ae_int_t i;
1036
+ ae_int_t j;
1037
+ const __m256d* __restrict pX = (const __m256d*)x;
1038
+ const ae_int_t nVec = n >> 2;
1039
+ const ae_int_t nUnroll = nVec >> 3;
1040
+ __m256d sum = _mm256_setzero_pd();
1041
+ for(i=0; i<m; i++) {
1042
+ const __m256d* __restrict pRow = (const __m256d*)(a->ptr.pp_double[i+ia]+ja);
1043
+ if(nUnroll >= 1) {
1044
+ __m256d u0 = _mm256_mul_pd(ULOAD256PD(pRow[0]), pX[0]);
1045
+ __m256d u1 = _mm256_mul_pd(ULOAD256PD(pRow[1]), pX[1]);
1046
+ __m256d u2 = _mm256_mul_pd(ULOAD256PD(pRow[2]), pX[2]);
1047
+ __m256d u3 = _mm256_mul_pd(ULOAD256PD(pRow[3]), pX[3]);
1048
+ __m256d u4 = _mm256_mul_pd(ULOAD256PD(pRow[4]), pX[4]);
1049
+ __m256d u5 = _mm256_mul_pd(ULOAD256PD(pRow[5]), pX[5]);
1050
+ __m256d u6 = _mm256_mul_pd(ULOAD256PD(pRow[6]), pX[6]);
1051
+ __m256d u7 = _mm256_mul_pd(ULOAD256PD(pRow[7]), pX[7]);
1052
+ for(j=1; j<nUnroll; j++) {
1053
+ const ae_int_t at = j<<3;
1054
+ u0 = _mm256_add_pd(u0, _mm256_mul_pd(ULOAD256PD(pRow[at]), pX[at]));
1055
+ u1 = _mm256_add_pd(u1, _mm256_mul_pd(ULOAD256PD(pRow[at+1]), pX[at+1]));
1056
+ u2 = _mm256_add_pd(u2, _mm256_mul_pd(ULOAD256PD(pRow[at+2]), pX[at+2]));
1057
+ u3 = _mm256_add_pd(u3, _mm256_mul_pd(ULOAD256PD(pRow[at+3]), pX[at+3]));
1058
+ u4 = _mm256_add_pd(u4, _mm256_mul_pd(ULOAD256PD(pRow[at+4]), pX[at+4]));
1059
+ u5 = _mm256_add_pd(u5, _mm256_mul_pd(ULOAD256PD(pRow[at+5]), pX[at+5]));
1060
+ u6 = _mm256_add_pd(u6, _mm256_mul_pd(ULOAD256PD(pRow[at+6]), pX[at+6]));
1061
+ u7 = _mm256_add_pd(u7, _mm256_mul_pd(ULOAD256PD(pRow[at+7]), pX[at+7]));
1062
+ }
1063
+ const ae_int_t at = j<<3;
1064
+ switch(nVec-at) {
1065
+ case 7:
1066
+ u6 = _mm256_add_pd(_mm256_mul_pd(pX[at+6], ULOAD256PD(pRow[at+6])), u6);
1067
+ case 6:
1068
+ u5 = _mm256_add_pd(_mm256_mul_pd(pX[at+5], ULOAD256PD(pRow[at+5])), u5);
1069
+ case 5:
1070
+ u4 = _mm256_add_pd(_mm256_mul_pd(pX[at+4], ULOAD256PD(pRow[at+4])), u4);
1071
+ case 4:
1072
+ u3 = _mm256_add_pd(_mm256_mul_pd(pX[at+3], ULOAD256PD(pRow[at+3])), u3);
1073
+ case 3:
1074
+ u2 = _mm256_add_pd(_mm256_mul_pd(pX[at+2], ULOAD256PD(pRow[at+2])), u2);
1075
+ case 2:
1076
+ u1 = _mm256_add_pd(_mm256_mul_pd(pX[at+1], ULOAD256PD(pRow[at+1])), u1);
1077
+ case 1:
1078
+ u0 = _mm256_add_pd(_mm256_mul_pd(pX[at+0], ULOAD256PD(pRow[at+0])), u0);
1079
+ }
1080
+ sum = _mm256_add_pd(
1081
+ _mm256_add_pd(_mm256_add_pd(u0, u1), _mm256_add_pd(u2, u3)),
1082
+ _mm256_add_pd(_mm256_add_pd(u4, u5), _mm256_add_pd(u6, u7)));
1083
+ }
1084
+ else {
1085
+ switch(nVec) {
1086
+ case 0:
1087
+ sum = _mm256_setzero_pd();
1088
+ break;
1089
+ case 1:
1090
+ sum = _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0]));
1091
+ break;
1092
+ case 2:
1093
+ sum = _mm256_add_pd(
1094
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1095
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1])));
1096
+ break;
1097
+ case 3:
1098
+ sum = _mm256_add_pd(
1099
+ _mm256_add_pd(
1100
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1101
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
1102
+ _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])));
1103
+ break;
1104
+ case 4:
1105
+ sum = _mm256_add_pd(
1106
+ _mm256_add_pd(
1107
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1108
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
1109
+ _mm256_add_pd(
1110
+ _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
1111
+ _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3]))));
1112
+ break;
1113
+ case 5:
1114
+ sum = _mm256_add_pd(
1115
+ _mm256_add_pd(
1116
+ _mm256_add_pd(
1117
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1118
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
1119
+ _mm256_add_pd(
1120
+ _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
1121
+ _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
1122
+ _mm256_mul_pd(pX[4], ULOAD256PD(pRow[4])));
1123
+ break;
1124
+ case 6:
1125
+ sum = _mm256_add_pd(
1126
+ _mm256_add_pd(
1127
+ _mm256_add_pd(
1128
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1129
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
1130
+ _mm256_add_pd(
1131
+ _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
1132
+ _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
1133
+ _mm256_add_pd(
1134
+ _mm256_mul_pd(pX[4], ULOAD256PD(pRow[4])),
1135
+ _mm256_mul_pd(pX[5], ULOAD256PD(pRow[5]))));
1136
+ break;
1137
+ case 7:
1138
+ sum = _mm256_add_pd(
1139
+ _mm256_add_pd(
1140
+ _mm256_add_pd(
1141
+ _mm256_mul_pd(pX[0], ULOAD256PD(pRow[0])),
1142
+ _mm256_mul_pd(pX[1], ULOAD256PD(pRow[1]))),
1143
+ _mm256_add_pd(
1144
+ _mm256_mul_pd(pX[2], ULOAD256PD(pRow[2])),
1145
+ _mm256_mul_pd(pX[3], ULOAD256PD(pRow[3])))),
1146
+ _mm256_add_pd(
1147
+ _mm256_add_pd(
1148
+ _mm256_mul_pd(pX[4], ULOAD256PD(pRow[4])),
1149
+ _mm256_mul_pd(pX[5], ULOAD256PD(pRow[5]))),
1150
+ _mm256_mul_pd(pX[6], ULOAD256PD(pRow[6]))));
1151
+ break;
1152
+ }
1153
+ }
1154
+ const __m128d t = _mm_add_pd(_mm256_extractf128_pd(sum, 0), _mm256_extractf128_pd(sum, 1));
1155
+ const double* pComps = (const double*)&t;
1156
+ double ans = pComps[0] + pComps[1];
1157
+ const ae_int_t tail = nVec<<2;
1158
+ for(j=tail; j<n; j++) {
1159
+ ans += a->ptr.pp_double[i+ia][j+ja] * x[j];
1160
+ }
1161
+ y[i] += alpha*ans;
1162
+ }
1163
+ }
1164
+
1165
+ void rgemvx_straight_avx2(const ae_int_t m, const ae_int_t n,
1166
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
1167
+ const ae_int_t ja, const double* __restrict x,
1168
+ double* __restrict y, ae_state* _state)
1169
+ {
1170
+ ae_int_t i;
1171
+ ae_int_t j;
1172
+ if( n<=3 ) {
1173
+ for(i=0; i<m; i++) {
1174
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
1175
+ double v = 0.0;
1176
+ for(j=0; j<n; j++) {
1177
+ v += p_a[j] * x[j];
1178
+ }
1179
+ y[i] += alpha*v;
1180
+ }
1181
+ return;
1182
+ }
1183
+
1184
+ const ptrdiff_t unal = ((ptrdiff_t)x) & 31;
1185
+ if(unal == 0)
1186
+ {
1187
+ rgemvx_straight_avx2_xaligned(m, n, alpha, a, ia, ja, x, y, _state);
1188
+ return;
1189
+ }
1190
+ const ptrdiff_t shift = 4-(unal>>3);
1191
+ for(i=0; i<m; i++) {
1192
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
1193
+ double v = 0.0;
1194
+ for(j=0; j<shift; j++) {
1195
+ v += p_a[j] * x[j];
1196
+ }
1197
+ y[i] += alpha*v;
1198
+ }
1199
+ rgemvx_straight_avx2_xaligned(m, n-shift, alpha, a, ia, ja+shift, x+shift, y, _state);
1200
+ }
1201
+
1202
+ void rgemvx_transposed_avx2_yaligned(const ae_int_t m, const ae_int_t n,
1203
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
1204
+ const ae_int_t ja, const double* __restrict x, double* __restrict y,
1205
+ ae_state* _state)
1206
+ {
1207
+ ae_int_t i;
1208
+ ae_int_t j;
1209
+ __m256d* __restrict pY = (__m256d*)y;
1210
+ const ae_int_t nVec = m >> 2;
1211
+
1212
+ for(i=0; i<=n-1; i++)
1213
+ {
1214
+ const __m256d* __restrict pRow = (const __m256d*)(a->ptr.pp_double[i+ia]+ja);
1215
+ const double v = alpha*x[i];
1216
+ const __m256d vV = _mm256_set1_pd(v);
1217
+ for(j=0; j<nVec; j++)
1218
+ {
1219
+ pY[j] = _mm256_add_pd(_mm256_mul_pd(vV, ULOAD256PD(pRow[j])), pY[j]);
1220
+ }
1221
+ const ae_int_t tail = nVec<<2;
1222
+ for(j=tail; j<m; j++) {
1223
+ y[j] += v*a->ptr.pp_double[i+ia][j+ja];
1224
+ }
1225
+ }
1226
+ }
1227
+
1228
+ void rgemvx_transposed_avx2(const ae_int_t m, const ae_int_t n,
1229
+ const double alpha, const ae_matrix* __restrict a, const ae_int_t ia,
1230
+ const ae_int_t ja, const double* __restrict x, double* __restrict y,
1231
+ ae_state* _state)
1232
+ {
1233
+ ae_int_t i;
1234
+ ae_int_t j;
1235
+ if( m<=3 ) {
1236
+ for(i=0; i<n; i++) {
1237
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
1238
+ const double v = alpha*x[i];
1239
+ for(j=0; j<m; j++) {
1240
+ y[j] += v*p_a[j];
1241
+ }
1242
+ }
1243
+ return;
1244
+ }
1245
+
1246
+ const ptrdiff_t unal = ((ptrdiff_t)y) & 31;
1247
+ if(unal == 0)
1248
+ {
1249
+ rgemvx_transposed_avx2_yaligned(m, n, alpha, a, ia, ja, x, y, _state);
1250
+ return;
1251
+ }
1252
+ const ptrdiff_t shift = 4-(unal>>3);
1253
+ for(i=0; i<n; i++) {
1254
+ const double *p_a = a->ptr.pp_double[ia+i]+ja;
1255
+ const double v = alpha*x[i];
1256
+ for(j=0; j<shift; j++) {
1257
+ y[j] += v*p_a[j];
1258
+ }
1259
+ }
1260
+ rgemvx_transposed_avx2_yaligned(m-shift, n, alpha, a, ia, ja+shift, x, y+shift, _state);
1261
+ }
1262
+
1263
+ /*************************************************************************
1264
+ Block packing function for fast rGEMM. Loads a long WIDTH*LENGTH submatrix
1265
+ with LENGTH<=BLOCK_SIZE and WIDTH<=MICRO_SIZE into contiguous MICRO_SIZE*
1266
+ BLOCK_SIZE row-wise 'horizontal' storage (hence H in the function name).
1267
+
1268
+ The matrix occupies first ROUND_LENGTH cols of the storage (with LENGTH
1269
+ being rounded up to nearest SIMD granularity). ROUND_LENGTH is returned
1270
+ as result. It is guaranteed that ROUND_LENGTH depends only on LENGTH, and
1271
+ that it will be same for all function calls.
1272
+
1273
+ Unused rows and columns in [LENGTH,ROUND_LENGTH) range are filled by zeros;
1274
+ unused cols in [ROUND_LENGTH,BLOCK_SIZE) range are ignored.
1275
+
1276
+ * op=0 means that the source is a WIDTH*LENGTH matrix stored with src_stride
1277
+ stride. The matrix is NOT transposed on load.
1278
+ * op=1 means that the source is a LENGTH*WIDTH matrix stored with src_stride
1279
+ that is loaded with transposition
1280
+ * present version of the function supports only MICRO_SIZE=2, the behavior
1281
+ is undefined for other micro sizes.
1282
+ * the target is properly aligned; the source can be unaligned.
1283
+
1284
+ Requires AVX2, does NOT check its presence.
1285
+
1286
+ The function is present in two versions, one with variable opsrc_length
1287
+ and another one with opsrc_length==block_size==32.
1288
+
1289
+ -- ALGLIB routine --
1290
+ 19.07.2021
1291
+ Bochkanov Sergey
1292
+ *************************************************************************/
1293
+ ae_int_t ablasf_packblkh_avx2(
1294
+ const double *src,
1295
+ ae_int_t src_stride,
1296
+ ae_int_t op,
1297
+ ae_int_t opsrc_length,
1298
+ ae_int_t opsrc_width,
1299
+ double *dst,
1300
+ ae_int_t block_size,
1301
+ ae_int_t micro_size)
1302
+ {
1303
+ ae_int_t i;
1304
+
1305
+ /*
1306
+ * Write to the storage
1307
+ */
1308
+ if( op==0 )
1309
+ {
1310
+ /*
1311
+ * Copy without transposition
1312
+ */
1313
+ const ae_int_t len8=(opsrc_length>>3)<<3;
1314
+ const double *src1 = src+src_stride;
1315
+ double *dst1 = dst+block_size;
1316
+ if( opsrc_width==2 )
1317
+ {
1318
+ /*
1319
+ * Width=2
1320
+ */
1321
+ for(i=0; i<len8; i+=8)
1322
+ {
1323
+ _mm256_store_pd(dst+i, _mm256_loadu_pd(src+i));
1324
+ _mm256_store_pd(dst+i+4, _mm256_loadu_pd(src+i+4));
1325
+ _mm256_store_pd(dst1+i, _mm256_loadu_pd(src1+i));
1326
+ _mm256_store_pd(dst1+i+4, _mm256_loadu_pd(src1+i+4));
1327
+ }
1328
+ for(i=len8; i<opsrc_length; i++)
1329
+ {
1330
+ dst[i] = src[i];
1331
+ dst1[i] = src1[i];
1332
+ }
1333
+ }
1334
+ else
1335
+ {
1336
+ /*
1337
+ * Width=1, pad by zeros
1338
+ */
1339
+ __m256d vz = _mm256_setzero_pd();
1340
+ for(i=0; i<len8; i+=8)
1341
+ {
1342
+ _mm256_store_pd(dst+i, _mm256_loadu_pd(src+i));
1343
+ _mm256_store_pd(dst+i+4, _mm256_loadu_pd(src+i+4));
1344
+ _mm256_store_pd(dst1+i, vz);
1345
+ _mm256_store_pd(dst1+i+4, vz);
1346
+ }
1347
+ for(i=len8; i<opsrc_length; i++)
1348
+ {
1349
+ dst[i] = src[i];
1350
+ dst1[i] = 0.0;
1351
+ }
1352
+ }
1353
+ }
1354
+ else
1355
+ {
1356
+ /*
1357
+ * Copy with transposition
1358
+ */
1359
+ const ae_int_t stride2 = src_stride<<1;
1360
+ const ae_int_t stride3 = src_stride+stride2;
1361
+ const ae_int_t stride4 = src_stride<<2;
1362
+ const ae_int_t len4=(opsrc_length>>2)<<2;
1363
+ const double *srci = src;
1364
+ double *dst1 = dst+block_size;
1365
+ if( opsrc_width==2 )
1366
+ {
1367
+ /*
1368
+ * Width=2
1369
+ */
1370
+ for(i=0; i<len4; i+=4)
1371
+ {
1372
+ __m128d s0 = _mm_loadu_pd(srci), s1 = _mm_loadu_pd(srci+src_stride);
1373
+ __m128d s2 = _mm_loadu_pd(srci+stride2), s3 = _mm_loadu_pd(srci+stride3);
1374
+ _mm_store_pd(dst+i, _mm_unpacklo_pd(s0,s1));
1375
+ _mm_store_pd(dst1+i, _mm_unpackhi_pd(s0,s1));
1376
+ _mm_store_pd(dst+i+2, _mm_unpacklo_pd(s2,s3));
1377
+ _mm_store_pd(dst1+i+2, _mm_unpackhi_pd(s2,s3));
1378
+ srci += stride4;
1379
+ }
1380
+ for(i=len4; i<opsrc_length; i++)
1381
+ {
1382
+ dst[i] = srci[0];
1383
+ dst1[i] = srci[1];
1384
+ srci += src_stride;
1385
+ }
1386
+ }
1387
+ else
1388
+ {
1389
+ /*
1390
+ * Width=1, pad by zeros
1391
+ */
1392
+ __m128d vz = _mm_setzero_pd();
1393
+ for(i=0; i<len4; i+=4)
1394
+ {
1395
+ __m128d s0 = _mm_load_sd(srci), s1 = _mm_load_sd(srci+src_stride);
1396
+ __m128d s2 = _mm_load_sd(srci+stride2), s3 = _mm_load_sd(srci+stride3);
1397
+ _mm_store_pd(dst+i, _mm_unpacklo_pd(s0,s1));
1398
+ _mm_store_pd(dst+i+2, _mm_unpacklo_pd(s2,s3));
1399
+ _mm_store_pd(dst1+i, vz);
1400
+ _mm_store_pd(dst1+i+2, vz);
1401
+ srci += stride4;
1402
+ }
1403
+ for(i=len4; i<opsrc_length; i++)
1404
+ {
1405
+ dst[i] = srci[0];
1406
+ dst1[i] = 0.0;
1407
+ srci += src_stride;
1408
+ }
1409
+ }
1410
+ }
1411
+
1412
+ /*
1413
+ * Pad by zeros, if needed
1414
+ */
1415
+ ae_int_t round_length = ((opsrc_length+3)>>2)<<2;
1416
+ for(i=opsrc_length; i<round_length; i++)
1417
+ {
1418
+ dst[i] = 0;
1419
+ dst[i+block_size] = 0;
1420
+ }
1421
+ return round_length;
1422
+ }
1423
+
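For readers unfamiliar with the packed layout, the storage produced by ablasf_packblkh_avx2() can be described without SIMD as a plain double loop. The sketch below is an illustrative scalar reference only (it is not taken from the packaged sources) and assumes MICRO_SIZE=2, the only micro size the kernel supports:

/* Illustrative scalar reference for the packing layout (assumes micro_size==2).
 * dst must provide room for 2*block_size doubles; the packed block occupies
 * the first round_length columns of each of its two storage rows. */
static long pack_blkh_scalar_reference(const double *src, long src_stride, int op,
                                       long length, long width,
                                       double *dst, long block_size)
{
    long i, j;
    long round_length = ((length+3)>>2)<<2;        /* LENGTH rounded up to SIMD granularity */
    for(i=0; i<2; i++)                             /* micro_size==2 storage rows            */
        for(j=0; j<round_length; j++)
        {
            double v = 0.0;
            if( i<width && j<length )
                v = (op==0) ? src[i*src_stride+j]  /* op=0: copy without transposition      */
                            : src[j*src_stride+i]; /* op=1: load with transposition         */
            dst[i*block_size+j] = v;               /* row-wise 'horizontal' storage         */
        }
    return round_length;
}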
1424
+ ae_int_t ablasf_packblkh32_avx2(
1425
+ const double *src,
1426
+ ae_int_t src_stride,
1427
+ ae_int_t op,
1428
+ ae_int_t ignore_opsrc_length,
1429
+ ae_int_t opsrc_width,
1430
+ double *dst,
1431
+ ae_int_t ignore_block_size,
1432
+ ae_int_t micro_size)
1433
+ {
1434
+ ae_int_t i;
1435
+
1436
+ /*
1437
+ * Write to the storage
1438
+ */
1439
+ if( op==0 )
1440
+ {
1441
+ /*
1442
+ * Copy without transposition
1443
+ */
1444
+ const double *src1 = src+src_stride;
1445
+ double *dst1 = dst+32;
1446
+ if( opsrc_width==2 )
1447
+ {
1448
+ /*
1449
+ * Width=2
1450
+ */
1451
+ for(i=0; i<32; i+=8)
1452
+ {
1453
+ _mm256_store_pd(dst+i, _mm256_loadu_pd(src+i));
1454
+ _mm256_store_pd(dst+i+4, _mm256_loadu_pd(src+i+4));
1455
+ _mm256_store_pd(dst1+i, _mm256_loadu_pd(src1+i));
1456
+ _mm256_store_pd(dst1+i+4, _mm256_loadu_pd(src1+i+4));
1457
+ }
1458
+ }
1459
+ else
1460
+ {
1461
+ /*
1462
+ * Width=1, pad by zeros
1463
+ */
1464
+ __m256d vz = _mm256_setzero_pd();
1465
+ for(i=0; i<32; i+=8)
1466
+ {
1467
+ _mm256_store_pd(dst+i, _mm256_loadu_pd(src+i));
1468
+ _mm256_store_pd(dst+i+4, _mm256_loadu_pd(src+i+4));
1469
+ _mm256_store_pd(dst1+i, vz);
1470
+ _mm256_store_pd(dst1+i+4, vz);
1471
+ }
1472
+ }
1473
+ }
1474
+ else
1475
+ {
1476
+ /*
1477
+ * Copy with transposition
1478
+ */
1479
+ const ae_int_t stride2 = src_stride<<1;
1480
+ const ae_int_t stride3 = src_stride+stride2;
1481
+ const ae_int_t stride4 = src_stride<<2;
1482
+ const double *srci = src;
1483
+ double *dst1 = dst+32;
1484
+ if( opsrc_width==2 )
1485
+ {
1486
+ /*
1487
+ * Width=2
1488
+ */
1489
+ for(i=0; i<32; i+=4)
1490
+ {
1491
+ __m128d s0 = _mm_loadu_pd(srci), s1 = _mm_loadu_pd(srci+src_stride);
1492
+ __m128d s2 = _mm_loadu_pd(srci+stride2), s3 = _mm_loadu_pd(srci+stride3);
1493
+ _mm_store_pd(dst+i, _mm_unpacklo_pd(s0,s1));
1494
+ _mm_store_pd(dst1+i, _mm_unpackhi_pd(s0,s1));
1495
+ _mm_store_pd(dst+i+2, _mm_unpacklo_pd(s2,s3));
1496
+ _mm_store_pd(dst1+i+2, _mm_unpackhi_pd(s2,s3));
1497
+ srci += stride4;
1498
+ }
1499
+ }
1500
+ else
1501
+ {
1502
+ /*
1503
+ * Width=1, pad by zeros
1504
+ */
1505
+ __m128d vz = _mm_setzero_pd();
1506
+ for(i=0; i<32; i+=4)
1507
+ {
1508
+ __m128d s0 = _mm_load_sd(srci), s1 = _mm_load_sd(srci+src_stride);
1509
+ __m128d s2 = _mm_load_sd(srci+stride2), s3 = _mm_load_sd(srci+stride3);
1510
+ _mm_store_pd(dst+i, _mm_unpacklo_pd(s0,s1));
1511
+ _mm_store_pd(dst+i+2, _mm_unpacklo_pd(s2,s3));
1512
+ _mm_store_pd(dst1+i, vz);
1513
+ _mm_store_pd(dst1+i+2, vz);
1514
+ srci += stride4;
1515
+ }
1516
+ }
1517
+ }
1518
+ return 32;
1519
+ }
1520
+
1521
+ /*************************************************************************
1522
+ Computes the product A*transpose(B) of two MICRO_SIZE*ROUND_LENGTH rowwise
1523
+ 'horizontal' matrices, stored with stride=block_size, and writes it to the
1524
+ row-wise matrix C.
1525
+
1526
+ ROUND_LENGTH is expected to be properly SIMD-rounded length, as returned
1527
+ by ablasf_packblkh_avx2().
1528
+
1529
+ The present version of the function supports only MICRO_SIZE=2; the behavior
1530
+ is undefined for other micro sizes.
1531
+
1532
+ Requires AVX2, does NOT check its presence.
1533
+
1534
+ -- ALGLIB routine --
1535
+ 19.07.2021
1536
+ Bochkanov Sergey
1537
+ *************************************************************************/
1538
+ void ablasf_dotblkh_avx2(
1539
+ const double *src_a,
1540
+ const double *src_b,
1541
+ ae_int_t round_length,
1542
+ ae_int_t block_size,
1543
+ ae_int_t micro_size,
1544
+ double *dst,
1545
+ ae_int_t dst_stride)
1546
+ {
1547
+ ae_int_t z;
1548
+ __m256d r00 = _mm256_setzero_pd(), r01 = _mm256_setzero_pd(), r10 = _mm256_setzero_pd(), r11 = _mm256_setzero_pd();
1549
+ if( round_length&0x7 )
1550
+ {
1551
+ /*
1552
+ * round_length is multiple of 4, but not multiple of 8
1553
+ */
1554
+ for(z=0; z<round_length; z+=4, src_a+=4, src_b+=4)
1555
+ {
1556
+ __m256d a0 = _mm256_load_pd(src_a);
1557
+ __m256d a1 = _mm256_load_pd(src_a+block_size);
1558
+ __m256d b0 = _mm256_load_pd(src_b);
1559
+ __m256d b1 = _mm256_load_pd(src_b+block_size);
1560
+ r00 = _mm256_add_pd(_mm256_mul_pd(a0, b0), r00);
1561
+ r01 = _mm256_add_pd(_mm256_mul_pd(a0, b1), r01);
1562
+ r10 = _mm256_add_pd(_mm256_mul_pd(a1, b0), r10);
1563
+ r11 = _mm256_add_pd(_mm256_mul_pd(a1, b1), r11);
1564
+ }
1565
+ }
1566
+ else
1567
+ {
1568
+ /*
1569
+ * round_length is multiple of 8
1570
+ */
1571
+ for(z=0; z<round_length; z+=8, src_a+=8, src_b+=8)
1572
+ {
1573
+ __m256d a0 = _mm256_load_pd(src_a);
1574
+ __m256d a1 = _mm256_load_pd(src_a+block_size);
1575
+ __m256d b0 = _mm256_load_pd(src_b);
1576
+ __m256d b1 = _mm256_load_pd(src_b+block_size);
1577
+ __m256d c0 = _mm256_load_pd(src_a+4);
1578
+ __m256d c1 = _mm256_load_pd(src_a+block_size+4);
1579
+ __m256d d0 = _mm256_load_pd(src_b+4);
1580
+ __m256d d1 = _mm256_load_pd(src_b+block_size+4);
1581
+ r00 = _mm256_add_pd(_mm256_add_pd(r00, _mm256_mul_pd(a0, b0)), _mm256_mul_pd(c0, d0));
1582
+ r01 = _mm256_add_pd(_mm256_add_pd(r01, _mm256_mul_pd(a0, b1)), _mm256_mul_pd(c0, d1));
1583
+ r10 = _mm256_add_pd(_mm256_add_pd(r10, _mm256_mul_pd(a1, b0)), _mm256_mul_pd(c1, d0));
1584
+ r11 = _mm256_add_pd(_mm256_add_pd(r11, _mm256_mul_pd(a1, b1)), _mm256_mul_pd(c1, d1));
1585
+ }
1586
+ }
1587
+ __m256d sum0 = _mm256_hadd_pd(r00,r01);
1588
+ __m256d sum1 = _mm256_hadd_pd(r10,r11);
1589
+ _mm_store_pd(dst, _mm_add_pd(_mm256_castpd256_pd128(sum0), _mm256_extractf128_pd(sum0,1)));
1590
+ _mm_store_pd(dst+dst_stride, _mm_add_pd(_mm256_castpd256_pd128(sum1), _mm256_extractf128_pd(sum1,1)));
1591
+ }
1592
+
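In scalar terms, the kernel above produces the 2x2 block C = A*transpose(B), where A and B are the two-row 'horizontal' blocks prepared by the packing routines. A minimal scalar sketch, illustrative only and assuming micro_size==2 as stated in the comment:

/* Illustrative scalar reference: dst[i*dst_stride+j] is the dot product of row i
 * of A and row j of B, both rows stored with stride block_size and already
 * zero-padded up to round_length. */
static void dot_blkh_scalar_reference(const double *src_a, const double *src_b,
                                      long round_length, long block_size,
                                      double *dst, long dst_stride)
{
    long i, j, k;
    for(i=0; i<2; i++)
        for(j=0; j<2; j++)
        {
            double s = 0.0;
            for(k=0; k<round_length; k++)
                s += src_a[i*block_size+k]*src_b[j*block_size+k];
            dst[i*dst_stride+j] = s;
        }
}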
1593
+ /*************************************************************************
1594
+ Y := alpha*X + beta*Y
1595
+
1596
+ Requires AVX2, does NOT check its presence.
1597
+
1598
+ -- ALGLIB routine --
1599
+ 19.07.2021
1600
+ Bochkanov Sergey
1601
+ *************************************************************************/
1602
+ void ablasf_daxpby_avx2(
1603
+ ae_int_t n,
1604
+ double alpha,
1605
+ const double *src,
1606
+ double beta,
1607
+ double *dst)
1608
+ {
1609
+ if( beta==1.0 )
1610
+ {
1611
+ /*
1612
+ * The most optimized case: DST := alpha*SRC + DST
1613
+ *
1614
+ * First, we process leading elements with generic C code until DST is aligned.
1615
+ * Then, we process central part, assuming that DST is properly aligned.
1616
+ * Finally, we process tail.
1617
+ */
1618
+ ae_int_t i, n4;
1619
+ __m256d avx_alpha = _mm256_set1_pd(alpha);
1620
+ while( n>0 && (((ptrdiff_t)dst)&31) )
1621
+ {
1622
+ *dst += alpha*(*src);
1623
+ n--;
1624
+ dst++;
1625
+ src++;
1626
+ }
1627
+ n4=(n>>2)<<2;
1628
+ for(i=0; i<n4; i+=4)
1629
+ {
1630
+ __m256d r = _mm256_add_pd(_mm256_mul_pd(avx_alpha, _mm256_loadu_pd(src+i)), _mm256_load_pd(dst+i));
1631
+ _mm256_store_pd(dst+i, r);
1632
+ }
1633
+ for(i=n4; i<n; i++)
1634
+ dst[i] = alpha*src[i]+dst[i];
1635
+ }
1636
+ else if( beta!=0.0 )
1637
+ {
1638
+ /*
1639
+ * Well optimized: DST := alpha*SRC + beta*DST
1640
+ */
1641
+ ae_int_t i, n4;
1642
+ __m256d avx_alpha = _mm256_set1_pd(alpha);
1643
+ __m256d avx_beta = _mm256_set1_pd(beta);
1644
+ while( n>0 && (((ptrdiff_t)dst)&31) )
1645
+ {
1646
+ *dst = alpha*(*src) + beta*(*dst);
1647
+ n--;
1648
+ dst++;
1649
+ src++;
1650
+ }
1651
+ n4=(n>>2)<<2;
1652
+ for(i=0; i<n4; i+=4)
1653
+ {
1654
+ __m256d r = _mm256_add_pd(
1655
+ _mm256_mul_pd(avx_alpha, _mm256_loadu_pd(src+i)),
1656
+ _mm256_mul_pd(avx_beta,_mm256_load_pd(dst+i)));
1657
+ _mm256_store_pd(dst+i, r);
1658
+ }
1659
+ for(i=n4; i<n; i++)
1660
+ dst[i] = alpha*src[i]+beta*dst[i];
1661
+ }
1662
+ else
1663
+ {
1664
+ /*
1665
+ * Easy case: DST := alpha*SRC
1666
+ */
1667
+ ae_int_t i;
1668
+ for(i=0; i<n; i++)
1669
+ dst[i] = alpha*src[i];
1670
+ }
1671
+ }
1672
+
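The three branches above all compute DST := alpha*SRC + beta*DST; they are split so that the beta==1 path avoids the extra multiply and the beta==0 path never reads DST. A scalar equivalent, shown only for illustration:

/* Illustrative scalar equivalent of ablasf_daxpby_avx2(): dst := alpha*src + beta*dst.
 * As in the kernel above, beta==0 overwrites dst without reading it. */
static void daxpby_scalar_reference(long n, double alpha, const double *src,
                                    double beta, double *dst)
{
    long i;
    if( beta==0.0 )
        for(i=0; i<n; i++)
            dst[i] = alpha*src[i];
    else
        for(i=0; i<n; i++)
            dst[i] = alpha*src[i] + beta*dst[i];
}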
1673
+ ae_bool spchol_updatekernelabc4_avx2(double* rowstorage,
1674
+ ae_int_t offss,
1675
+ ae_int_t twidth,
1676
+ ae_int_t offsu,
1677
+ ae_int_t uheight,
1678
+ ae_int_t urank,
1679
+ ae_int_t urowstride,
1680
+ ae_int_t uwidth,
1681
+ const double* diagd,
1682
+ ae_int_t offsd,
1683
+ const ae_int_t* raw2smap,
1684
+ const ae_int_t* superrowidx,
1685
+ ae_int_t urbase,
1686
+ ae_state *_state)
1687
+ {
1688
+ ae_int_t k;
1689
+ ae_int_t targetrow;
1690
+ ae_int_t targetcol;
1691
+
1692
+ /*
1693
+ * Filter out unsupported combinations (ones that are too sparse for the non-SIMD code)
1694
+ */
1695
+ if( twidth<3||twidth>4 )
1696
+ {
1697
+ return ae_false;
1698
+ }
1699
+ if( uwidth<1||uwidth>4 )
1700
+ {
1701
+ return ae_false;
1702
+ }
1703
+ if( urank>4 )
1704
+ {
1705
+ return ae_false;
1706
+ }
1707
+
1708
+ /*
1709
+ * Shift input arrays to the beginning of the working area.
1710
+ * Prepare SIMD masks
1711
+ */
1712
+ __m256i v_rankmask = _mm256_cmpgt_epi64(_mm256_set_epi64x(urank, urank, urank, urank), _mm256_set_epi64x(3, 2, 1, 0));
1713
+ double *update_storage = rowstorage+offsu;
1714
+ double *target_storage = rowstorage+offss;
1715
+ superrowidx += urbase;
1716
+
1717
+ /*
1718
+ * Load head of the update matrix
1719
+ */
1720
+ __m256d v_d0123 = _mm256_maskload_pd(diagd+offsd, v_rankmask);
1721
+ __m256d u_0_0123 = _mm256_setzero_pd();
1722
+ __m256d u_1_0123 = _mm256_setzero_pd();
1723
+ __m256d u_2_0123 = _mm256_setzero_pd();
1724
+ __m256d u_3_0123 = _mm256_setzero_pd();
1725
+ for(k=0; k<=uwidth-1; k++)
1726
+ {
1727
+ targetcol = raw2smap[superrowidx[k]];
1728
+ if( targetcol==0 )
1729
+ u_0_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
1730
+ if( targetcol==1 )
1731
+ u_1_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
1732
+ if( targetcol==2 )
1733
+ u_2_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
1734
+ if( targetcol==3 )
1735
+ u_3_0123 = _mm256_mul_pd(v_d0123, _mm256_maskload_pd(update_storage+k*urowstride, v_rankmask));
1736
+ }
1737
+
1738
+ /*
1739
+ * Transpose head
1740
+ */
1741
+ __m256d u01_lo = _mm256_unpacklo_pd(u_0_0123,u_1_0123);
1742
+ __m256d u01_hi = _mm256_unpackhi_pd(u_0_0123,u_1_0123);
1743
+ __m256d u23_lo = _mm256_unpacklo_pd(u_2_0123,u_3_0123);
1744
+ __m256d u23_hi = _mm256_unpackhi_pd(u_2_0123,u_3_0123);
1745
+ __m256d u_0123_0 = _mm256_permute2f128_pd(u01_lo, u23_lo, 0x20);
1746
+ __m256d u_0123_1 = _mm256_permute2f128_pd(u01_hi, u23_hi, 0x20);
1747
+ __m256d u_0123_2 = _mm256_permute2f128_pd(u23_lo, u01_lo, 0x13);
1748
+ __m256d u_0123_3 = _mm256_permute2f128_pd(u23_hi, u01_hi, 0x13);
1749
+
1750
+ /*
1751
+ * Run update
1752
+ */
1753
+ if( urank==1 )
1754
+ {
1755
+ for(k=0; k<=uheight-1; k++)
1756
+ {
1757
+ targetrow = raw2smap[superrowidx[k]]*4;
1758
+ double *update_row = rowstorage+offsu+k*urowstride;
1759
+ _mm256_store_pd(target_storage+targetrow,
1760
+ _mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
1761
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
1762
+ }
1763
+ }
1764
+ if( urank==2 )
1765
+ {
1766
+ for(k=0; k<=uheight-1; k++)
1767
+ {
1768
+ targetrow = raw2smap[superrowidx[k]]*4;
1769
+ double *update_row = rowstorage+offsu+k*urowstride;
1770
+ _mm256_store_pd(target_storage+targetrow,
1771
+ _mm256_sub_pd(_mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
1772
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+1), u_0123_1)),
1773
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
1774
+ }
1775
+ }
1776
+ if( urank==3 )
1777
+ {
1778
+ for(k=0; k<=uheight-1; k++)
1779
+ {
1780
+ targetrow = raw2smap[superrowidx[k]]*4;
1781
+ double *update_row = rowstorage+offsu+k*urowstride;
1782
+ _mm256_store_pd(target_storage+targetrow,
1783
+ _mm256_sub_pd(_mm256_sub_pd(_mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
1784
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+2), u_0123_2)),
1785
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+1), u_0123_1)),
1786
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
1787
+ }
1788
+ }
1789
+ if( urank==4 )
1790
+ {
1791
+ for(k=0; k<=uheight-1; k++)
1792
+ {
1793
+ targetrow = raw2smap[superrowidx[k]]*4;
1794
+ double *update_row = rowstorage+offsu+k*urowstride;
1795
+ _mm256_store_pd(target_storage+targetrow,
1796
+ _mm256_sub_pd(_mm256_sub_pd(_mm256_sub_pd(_mm256_sub_pd(_mm256_load_pd(target_storage+targetrow),
1797
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+3), u_0123_3)),
1798
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+2), u_0123_2)),
1799
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+1), u_0123_1)),
1800
+ _mm256_mul_pd(_mm256_broadcast_sd(update_row+0), u_0123_0)));
1801
+ }
1802
+ }
1803
+ return ae_true;
1804
+ }
1805
+
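/*
 * Editor's sketch (hypothetical, not part of the package sources): the
 * unpacklo/unpackhi + permute2f128 pattern used above to transpose a 4x4
 * block of doubles held in four __m256d registers. Standalone, assumes AVX
 * (-mavx); the matrix values are illustration data.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    double m[4][4] = { { 0.0,  1.0,  2.0,  3.0},
                       {10.0, 11.0, 12.0, 13.0},
                       {20.0, 21.0, 22.0, 23.0},
                       {30.0, 31.0, 32.0, 33.0} };
    __m256d r0 = _mm256_loadu_pd(m[0]);
    __m256d r1 = _mm256_loadu_pd(m[1]);
    __m256d r2 = _mm256_loadu_pd(m[2]);
    __m256d r3 = _mm256_loadu_pd(m[3]);

    /* interleave rows pairwise within 128-bit lanes */
    __m256d lo01 = _mm256_unpacklo_pd(r0, r1);   /* [m00 m10 | m02 m12] */
    __m256d hi01 = _mm256_unpackhi_pd(r0, r1);   /* [m01 m11 | m03 m13] */
    __m256d lo23 = _mm256_unpacklo_pd(r2, r3);   /* [m20 m30 | m22 m32] */
    __m256d hi23 = _mm256_unpackhi_pd(r2, r3);   /* [m21 m31 | m23 m33] */

    /* stitch the proper 128-bit halves together to obtain columns */
    __m256d c0 = _mm256_permute2f128_pd(lo01, lo23, 0x20);
    __m256d c1 = _mm256_permute2f128_pd(hi01, hi23, 0x20);
    __m256d c2 = _mm256_permute2f128_pd(lo23, lo01, 0x13);
    __m256d c3 = _mm256_permute2f128_pd(hi23, hi01, 0x13);

    double out[4][4];
    _mm256_storeu_pd(out[0], c0);
    _mm256_storeu_pd(out[1], c1);
    _mm256_storeu_pd(out[2], c2);
    _mm256_storeu_pd(out[3], c3);
    for(int i=0; i<4; i++)
        printf("%g %g %g %g\n", out[i][0], out[i][1], out[i][2], out[i][3]);
    /* row i of the output is column i of m */
    return 0;
}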
1806
+ ae_bool spchol_updatekernel4444_avx2(
1807
+ double* rowstorage,
1808
+ ae_int_t offss,
1809
+ ae_int_t sheight,
1810
+ ae_int_t offsu,
1811
+ ae_int_t uheight,
1812
+ const double* diagd,
1813
+ ae_int_t offsd,
1814
+ const ae_int_t* raw2smap,
1815
+ const ae_int_t* superrowidx,
1816
+ ae_int_t urbase,
1817
+ ae_state *_state)
1818
+ {
1819
+ ae_int_t k;
1820
+ ae_int_t targetrow;
1821
+ ae_int_t offsk;
1822
+ __m256d v_negd_u0, v_negd_u1, v_negd_u2, v_negd_u3, v_negd;
1823
+ __m256d v_w0, v_w1, v_w2, v_w3, u01_lo, u01_hi, u23_lo, u23_hi;
1824
+
1825
+ /*
1826
+ * Compute W = -D*transpose(U[0:3])
1827
+ */
1828
+ v_negd = _mm256_mul_pd(_mm256_loadu_pd(diagd+offsd),_mm256_set1_pd(-1.0));
1829
+ v_negd_u0 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+0*4),v_negd);
1830
+ v_negd_u1 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+1*4),v_negd);
1831
+ v_negd_u2 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+2*4),v_negd);
1832
+ v_negd_u3 = _mm256_mul_pd(_mm256_load_pd(rowstorage+offsu+3*4),v_negd);
1833
+ u01_lo = _mm256_unpacklo_pd(v_negd_u0,v_negd_u1);
1834
+ u01_hi = _mm256_unpackhi_pd(v_negd_u0,v_negd_u1);
1835
+ u23_lo = _mm256_unpacklo_pd(v_negd_u2,v_negd_u3);
1836
+ u23_hi = _mm256_unpackhi_pd(v_negd_u2,v_negd_u3);
1837
+ v_w0 = _mm256_permute2f128_pd(u01_lo, u23_lo, 0x20);
1838
+ v_w1 = _mm256_permute2f128_pd(u01_hi, u23_hi, 0x20);
1839
+ v_w2 = _mm256_permute2f128_pd(u23_lo, u01_lo, 0x13);
1840
+ v_w3 = _mm256_permute2f128_pd(u23_hi, u01_hi, 0x13);
1841
+
1842
+ /*
1843
+ * Compute update S := S + row_scatter(U*W)
1844
+ */
1845
+ if( sheight==uheight )
1846
+ {
1847
+ /*
1848
+ * No row scatter, the most efficient code
1849
+ */
1850
+ for(k=0; k<=uheight-1; k++)
1851
+ {
1852
+ __m256d target;
1853
+
1854
+ targetrow = offss+k*4;
1855
+ offsk = offsu+k*4;
1856
+
1857
+ target = _mm256_load_pd(rowstorage+targetrow);
1858
+ target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+0),v_w0),target);
1859
+ target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+1),v_w1),target);
1860
+ target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+2),v_w2),target);
1861
+ target = _mm256_add_pd(_mm256_mul_pd(_mm256_broadcast_sd(rowstorage+offsk+3),v_w3),target);
1862
+ _mm256_store_pd(rowstorage+targetrow, target);
1863
+ }
1864
+ }
1865
+ else
1866
+ {
1867
+ /*
1868
+ * Row scatter is performed: a less efficient code path that uses a double mapping (superrowidx, then raw2smap) to determine the target row index
1869
+ */
1870
+ for(k=0; k<=uheight-1; k++)
1871
+ {
1872
+ __m256d v_uk0, v_uk1, v_uk2, v_uk3, target;
1873
+
1874
+ targetrow = offss+raw2smap[superrowidx[urbase+k]]*4;
1875
+ offsk = offsu+k*4;
1876
+
1877
+ target = _mm256_load_pd(rowstorage+targetrow);
1878
+ v_uk0 = _mm256_broadcast_sd(rowstorage+offsk+0);
1879
+ v_uk1 = _mm256_broadcast_sd(rowstorage+offsk+1);
1880
+ v_uk2 = _mm256_broadcast_sd(rowstorage+offsk+2);
1881
+ v_uk3 = _mm256_broadcast_sd(rowstorage+offsk+3);
1882
+ target = _mm256_add_pd(_mm256_mul_pd(v_uk0,v_w0),target);
1883
+ target = _mm256_add_pd(_mm256_mul_pd(v_uk1,v_w1),target);
1884
+ target = _mm256_add_pd(_mm256_mul_pd(v_uk2,v_w2),target);
1885
+ target = _mm256_add_pd(_mm256_mul_pd(v_uk3,v_w3),target);
1886
+ _mm256_store_pd(rowstorage+targetrow, target);
1887
+ }
1888
+ }
1889
+ return ae_true;
1890
+ }
1891
+
1892
+
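/*
 * Editor's illustration (hypothetical, not part of the package sources): a
 * scalar reference for the update performed by spchol_updatekernel4444_avx2
 * above. Per element it computes S[t(k)][c] -= sum_j U[k][j]*D[j]*U[c][j],
 * where U[0..3][*] is the 4x4 "head" of the update matrix and t(k) is either
 * k (no row scatter) or raw2smap[superrowidx[urbase+k]]. Names and the
 * flattened layout are illustrative; the floating-point summation order
 * differs from the SIMD code.
 */
static void updatekernel4444_reference(double *s,       /* target rows, 4 doubles each */
                                       const double *u, /* update rows, 4 doubles each */
                                       const double *d, /* 4 diagonal entries          */
                                       const int *trow, /* target row index for each k */
                                       int uheight)
{
    int k, c, j;
    for(k=0; k<uheight; k++)
        for(c=0; c<4; c++)
        {
            double acc = 0.0;
            for(j=0; j<4; j++)
                acc += u[k*4+j]*d[j]*u[c*4+j];
            s[trow[k]*4+c] -= acc;
        }
}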
1893
+ /*************************************************************************
1894
+ Fast kernel for biharmonic panel with NY=1
1895
+
1896
+ INPUT PARAMETERS:
1897
+ D0, D1, D2 - evaluation point minus (Panel.C0,Panel.C1,Panel.C2)
1898
+
1899
+ OUTPUT PARAMETERS:
1900
+ F - model value
1901
+ InvPowRPPlus1 - 1/(R^(P+1))
1902
+
1903
+ -- ALGLIB --
1904
+ Copyright 26.08.2022 by Sergey Bochkanov
1905
+ *************************************************************************/
1906
+ ae_bool rbfv3farfields_bhpaneleval1fastkernel16_avx2(double d0,
1907
+ double d1,
1908
+ double d2,
1909
+ const double* pnma,
1910
+ const double* pnmb,
1911
+ const double* pmmcdiag,
1912
+ const double* ynma,
1913
+ const double* tblrmodmn,
1914
+ double* f,
1915
+ double* invpowrpplus1,
1916
+ ae_state *_state)
1917
+ {
1918
+ ae_int_t n;
1919
+ double r, r2, r01, invr;
1920
+ double sintheta, costheta;
1921
+ ae_complex expiphi, expiphi2, expiphi3, expiphi4;
1922
+ ae_int_t jj;
1923
+ ae_bool result;
1924
+
1925
+ *f = 0.0;
1926
+ *invpowrpplus1 = 0.0;
1927
+ result = ae_true;
1928
+
1929
+ /*
1930
+ * Convert to spherical polar coordinates.
1931
+ *
1932
+ * NOTE: we make sure that R is non-zero by adding an extremely small perturbation
1933
+ */
1934
+ r2 = d0*d0+d1*d1+d2*d2+ae_minrealnumber;
1935
+ r = ae_sqrt(r2, _state);
1936
+ r01 = ae_sqrt(d0*d0+d1*d1+ae_minrealnumber, _state);
1937
+ costheta = d2/r;
1938
+ sintheta = r01/r;
1939
+ expiphi.x = d0/r01;
1940
+ expiphi.y = d1/r01;
1941
+ invr = (double)1/r;
1942
+
1943
+ /*
1944
+ * prepare precomputed quantities
1945
+ */
1946
+ double powsintheta2 = sintheta*sintheta;
1947
+ double powsintheta3 = powsintheta2*sintheta;
1948
+ double powsintheta4 = powsintheta2*powsintheta2;
1949
+ expiphi2.x = expiphi.x*expiphi.x-expiphi.y*expiphi.y;
1950
+ expiphi2.y = 2*expiphi.x*expiphi.y;
1951
+ expiphi3.x = expiphi2.x*expiphi.x-expiphi2.y*expiphi.y;
1952
+ expiphi3.y = expiphi2.x*expiphi.y+expiphi.x*expiphi2.y;
1953
+ expiphi4.x = expiphi2.x*expiphi2.x-expiphi2.y*expiphi2.y;
1954
+ expiphi4.y = 2*expiphi2.x*expiphi2.y;
1955
+
1956
+ /*
1957
+ * Compute far field expansion for a cluster of basis functions f=r
1958
+ *
1959
+ * NOTE: the original paper by Beatson et al. uses f=r as the basis function,
1960
+ * whilst ALGLIB uses f=-r due to the conditional positive definiteness requirement.
1961
+ * We will perform conversion later.
1962
+ */
1963
+ __m256d v_costheta = _mm256_set1_pd(costheta);
1964
+ __m256d v_r2 = _mm256_set1_pd(r2);
1965
+ __m256d v_f = _mm256_setzero_pd();
1966
+ __m256d v_invr = _mm256_set1_pd(invr);
1967
+ __m256d v_powsinthetaj = _mm256_set_pd(powsintheta3, powsintheta2, sintheta, 1.0);
1968
+ __m256d v_powsintheta4 = _mm256_set1_pd(powsintheta4);
1969
+ __m256d v_expijphix = _mm256_set_pd(expiphi3.x, expiphi2.x, expiphi.x, 1.0);
1970
+ __m256d v_expijphiy = _mm256_set_pd(expiphi3.y, expiphi2.y, expiphi.y, 0.0);
1971
+ __m256d v_expi4phix = _mm256_set1_pd(expiphi4.x);
1972
+ __m256d v_expi4phiy = _mm256_set1_pd(expiphi4.y);
1973
+ *f = (double)(0);
1974
+ for(jj=0; jj<4; jj++)
1975
+ {
1976
+ __m256d pnm_cur = _mm256_setzero_pd(), pnm_prev = _mm256_setzero_pd(), pnm_new;
1977
+ __m256d v_powrminusj1 = _mm256_set1_pd(invr);
1978
+ for(n=0; n<jj*4; n++)
1979
+ v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
1980
+ for(n=jj*4; n<16; n++)
1981
+ {
1982
+ ae_int_t j0=jj*4;
1983
+
1984
+
1985
+ pnm_new = _mm256_mul_pd(v_powsinthetaj, _mm256_load_pd(pmmcdiag+n*16+j0));
1986
+ pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(v_costheta,_mm256_mul_pd(pnm_cur,_mm256_load_pd(pnma+n*16+j0))));
1987
+ pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(pnm_prev,_mm256_load_pd(pnmb+n*16+j0)));
1988
+ pnm_prev = pnm_cur;
1989
+ pnm_cur = pnm_new;
1990
+
1991
+ __m256d v_tmp = _mm256_mul_pd(pnm_cur, _mm256_load_pd(ynma+n*16+j0));
1992
+ __m256d v_sphericalx = _mm256_mul_pd(v_tmp, v_expijphix);
1993
+ __m256d v_sphericaly = _mm256_mul_pd(v_tmp, v_expijphiy);
1994
+
1995
+ __m256d v_summnx = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(tblrmodmn+n*64+j0+32)),_mm256_load_pd(tblrmodmn+n*64+j0));
1996
+ __m256d v_summny = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(tblrmodmn+n*64+j0+48)),_mm256_load_pd(tblrmodmn+n*64+j0+16));
1997
+
1998
+ __m256d v_z = _mm256_sub_pd(_mm256_mul_pd(v_sphericalx,v_summnx),_mm256_mul_pd(v_sphericaly,v_summny));
1999
+
2000
+ v_f = _mm256_add_pd(v_f, _mm256_mul_pd(v_powrminusj1, v_z));
2001
+ v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
2002
+ }
2003
+ __m256d v_expijphix_new = _mm256_sub_pd(_mm256_mul_pd(v_expijphix,v_expi4phix),_mm256_mul_pd(v_expijphiy,v_expi4phiy));
2004
+ __m256d v_expijphiy_new = _mm256_add_pd(_mm256_mul_pd(v_expijphix,v_expi4phiy),_mm256_mul_pd(v_expijphiy,v_expi4phix));
2005
+ v_powsinthetaj = _mm256_mul_pd(v_powsinthetaj, v_powsintheta4);
2006
+ v_expijphix = v_expijphix_new;
2007
+ v_expijphiy = v_expijphiy_new;
2008
+ }
2009
+
2010
+ double ttt[4];
2011
+ _mm256_storeu_pd(ttt, v_f);
2012
+ for(int k=0; k<4; k++)
2013
+ *f += ttt[k];
2014
+
2015
+ double r4 = r2*r2;
2016
+ double r8 = r4*r4;
2017
+ double r16 = r8*r8;
2018
+ *invpowrpplus1 = 1/r16;
2019
+
2020
+ return result;
2021
+ }
2022
+
2023
+
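/*
 * Editor's sketch (hypothetical, not part of the package sources): the
 * coordinate setup used by the panel kernel above (the general-NY kernel
 * below reuses it). No trigonometric calls are needed: the kernels only use
 * cos(theta), sin(theta) and integer powers of exp(i*phi), the latter
 * obtained by repeated complex multiplication. The constant 'tiny' stands in
 * for ae_minrealnumber; the evaluation point is illustration data.
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
    const double tiny = 1.0e-300;
    double d0 = 0.3, d1 = -0.4, d2 = 1.2;       /* evaluation point minus panel center */

    double r2  = d0*d0+d1*d1+d2*d2+tiny;        /* perturbation keeps r and r01 nonzero */
    double r   = sqrt(r2);
    double r01 = sqrt(d0*d0+d1*d1+tiny);
    double costheta = d2/r;
    double sintheta = r01/r;

    /* exp(i*phi) = (d0+i*d1)/r01; higher powers via complex multiplication */
    double e1x = d0/r01,          e1y = d1/r01;
    double e2x = e1x*e1x-e1y*e1y, e2y = 2*e1x*e1y;
    double e3x = e2x*e1x-e2y*e1y, e3y = e2x*e1y+e1x*e2y;

    printf("cos(theta)=%f sin(theta)=%f\n", costheta, sintheta);
    printf("exp(i*phi)=(%f,%f) exp(3i*phi)=(%f,%f)\n", e1x, e1y, e3x, e3y);
    return 0;
}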
2024
+ /*************************************************************************
2025
+ Fast kernel for biharmonic panel with general NY
2026
+
2027
+ INPUT PARAMETERS:
2028
+ D0, D1, D2 - evaluation point minus (Panel.C0,Panel.C1,Panel.C2)
2029
+
2030
+ OUTPUT PARAMETERS:
2031
+ F - array[NY], model value
2032
+ InvPowRPPlus1 - 1/(R^(P+1))
2033
+
2034
+ -- ALGLIB --
2035
+ Copyright 26.08.2022 by Sergey Bochkanov
2036
+ *************************************************************************/
2037
+ ae_bool rbfv3farfields_bhpanelevalfastkernel16_avx2(double d0,
2038
+ double d1,
2039
+ double d2,
2040
+ ae_int_t ny,
2041
+ const double* pnma,
2042
+ const double* pnmb,
2043
+ const double* pmmcdiag,
2044
+ const double* ynma,
2045
+ const double* tblrmodmn,
2046
+ double* f,
2047
+ double* invpowrpplus1,
2048
+ ae_state *_state)
2049
+ {
2050
+ ae_int_t n;
2051
+ double r, r2, r01, invr;
2052
+ double sintheta, costheta;
2053
+ ae_complex expiphi, expiphi2, expiphi3, expiphi4;
2054
+ ae_int_t jj;
2055
+
2056
+ /*
2057
+ * Local accumulator buffer, large enough for NY up to 16
2058
+ */
2059
+ __m256d v_f[16];
2060
+ if( ny>16 )
2061
+ return ae_false;
2062
+ for(int k=0; k<ny; k++)
2063
+ {
2064
+ v_f[k] = _mm256_setzero_pd();
2065
+ f[k] = 0.0;
2066
+ }
2067
+
2068
+ /*
2069
+ * Convert to spherical polar coordinates.
2070
+ *
2071
+ * NOTE: we make sure that R is non-zero by adding an extremely small perturbation
2072
+ */
2073
+ r2 = d0*d0+d1*d1+d2*d2+ae_minrealnumber;
2074
+ r = ae_sqrt(r2, _state);
2075
+ r01 = ae_sqrt(d0*d0+d1*d1+ae_minrealnumber, _state);
2076
+ costheta = d2/r;
2077
+ sintheta = r01/r;
2078
+ expiphi.x = d0/r01;
2079
+ expiphi.y = d1/r01;
2080
+ invr = (double)1/r;
2081
+
2082
+ /*
2083
+ * prepare precomputed quantities
2084
+ */
2085
+ double powsintheta2 = sintheta*sintheta;
2086
+ double powsintheta3 = powsintheta2*sintheta;
2087
+ double powsintheta4 = powsintheta2*powsintheta2;
2088
+ expiphi2.x = expiphi.x*expiphi.x-expiphi.y*expiphi.y;
2089
+ expiphi2.y = 2*expiphi.x*expiphi.y;
2090
+ expiphi3.x = expiphi2.x*expiphi.x-expiphi2.y*expiphi.y;
2091
+ expiphi3.y = expiphi2.x*expiphi.y+expiphi.x*expiphi2.y;
2092
+ expiphi4.x = expiphi2.x*expiphi2.x-expiphi2.y*expiphi2.y;
2093
+ expiphi4.y = 2*expiphi2.x*expiphi2.y;
2094
+
2095
+ /*
2096
+ * Compute far field expansion for a cluster of basis functions f=r
2097
+ *
2098
+ * NOTE: the original paper by Beatson et al. uses f=r as the basis function,
2099
+ * whilst ALGLIB uses f=-r due to the conditional positive definiteness requirement.
2100
+ * We will perform conversion later.
2101
+ */
2102
+ __m256d v_costheta = _mm256_set1_pd(costheta);
2103
+ __m256d v_r2 = _mm256_set1_pd(r2);
2104
+ __m256d v_invr = _mm256_set1_pd(invr);
2105
+ __m256d v_powsinthetaj = _mm256_set_pd(powsintheta3, powsintheta2, sintheta, 1.0);
2106
+ __m256d v_powsintheta4 = _mm256_set1_pd(powsintheta4);
2107
+ __m256d v_expijphix = _mm256_set_pd(expiphi3.x, expiphi2.x, expiphi.x, 1.0);
2108
+ __m256d v_expijphiy = _mm256_set_pd(expiphi3.y, expiphi2.y, expiphi.y, 0.0);
2109
+ __m256d v_expi4phix = _mm256_set1_pd(expiphi4.x);
2110
+ __m256d v_expi4phiy = _mm256_set1_pd(expiphi4.y);
2111
2112
+ for(jj=0; jj<4; jj++)
2113
+ {
2114
+ __m256d pnm_cur = _mm256_setzero_pd(), pnm_prev = _mm256_setzero_pd(), pnm_new;
2115
+ __m256d v_powrminusj1 = _mm256_set1_pd(invr);
2116
+ for(n=0; n<jj*4; n++)
2117
+ v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
2118
+ for(n=jj*4; n<16; n++)
2119
+ {
2120
+ ae_int_t j0=jj*4;
2121
+
2122
+ pnm_new = _mm256_mul_pd(v_powsinthetaj, _mm256_load_pd(pmmcdiag+n*16+j0));
2123
+ pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(v_costheta,_mm256_mul_pd(pnm_cur,_mm256_load_pd(pnma+n*16+j0))));
2124
+ pnm_new = _mm256_add_pd(pnm_new, _mm256_mul_pd(pnm_prev,_mm256_load_pd(pnmb+n*16+j0)));
2125
+ pnm_prev = pnm_cur;
2126
+ pnm_cur = pnm_new;
2127
+
2128
+ __m256d v_tmp = _mm256_mul_pd(pnm_cur, _mm256_load_pd(ynma+n*16+j0));
2129
+ __m256d v_sphericalx = _mm256_mul_pd(v_tmp, v_expijphix);
2130
+ __m256d v_sphericaly = _mm256_mul_pd(v_tmp, v_expijphiy);
2131
+
2132
+ const double *p_rmodmn = tblrmodmn+n*64+j0;
2133
+ for(int k=0; k<ny; k++)
2134
+ {
2135
+ __m256d v_summnx = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(p_rmodmn+32)),_mm256_load_pd(p_rmodmn));
2136
+ __m256d v_summny = _mm256_add_pd(_mm256_mul_pd(v_r2,_mm256_load_pd(p_rmodmn+48)),_mm256_load_pd(p_rmodmn+16));
2137
+ __m256d v_z = _mm256_sub_pd(_mm256_mul_pd(v_sphericalx,v_summnx),_mm256_mul_pd(v_sphericaly,v_summny));
2138
+ v_f[k] = _mm256_add_pd(v_f[k], _mm256_mul_pd(v_powrminusj1, v_z));
2139
+ p_rmodmn += 1024;
2140
+ }
2141
+ v_powrminusj1 = _mm256_mul_pd(v_powrminusj1, v_invr);
2142
+ }
2143
+ __m256d v_expijphix_new = _mm256_sub_pd(_mm256_mul_pd(v_expijphix,v_expi4phix),_mm256_mul_pd(v_expijphiy,v_expi4phiy));
2144
+ __m256d v_expijphiy_new = _mm256_add_pd(_mm256_mul_pd(v_expijphix,v_expi4phiy),_mm256_mul_pd(v_expijphiy,v_expi4phix));
2145
+ v_powsinthetaj = _mm256_mul_pd(v_powsinthetaj, v_powsintheta4);
2146
+ v_expijphix = v_expijphix_new;
2147
+ v_expijphiy = v_expijphiy_new;
2148
+ }
2149
+
2150
+ for(int t=0; t<ny; t++)
2151
+ {
2152
+ double ttt[4];
2153
+ _mm256_storeu_pd(ttt, v_f[t]);
2154
+ for(int k=0; k<4; k++)
2155
+ f[t] += ttt[k];
2156
+ }
2157
+
2158
+ double r4 = r2*r2;
2159
+ double r8 = r4*r4;
2160
+ double r16 = r8*r8;
2161
+ *invpowrpplus1 = 1/r16;
2162
+
2163
+ return ae_true;
2164
+ }
2165
+
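/*
 * Editor's note (hypothetical, not part of the package sources): both panel
 * kernels above report InvPowRPPlus1 = 1/R^(P+1) with P+1 = 16, computing
 * R^16 from R^2 by three squarings instead of calling pow(). A tiny
 * standalone check of that shortcut:
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double r  = 1.7;
    double r2 = r*r;
    double r4 = r2*r2, r8 = r4*r4, r16 = r8*r8;       /* three squarings of r^2 */
    printf("%.15e\n%.15e\n", 1.0/r16, pow(r, -16.0)); /* agree to rounding error */
    return 0;
}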
2166
+ /* ALGLIB_NO_FAST_KERNELS, _ALGLIB_HAS_AVX2_INTRINSICS */
2167
+ #endif
2168
+
2169
+
2170
+ }
2171
+