alglib4 0.0.0

Files changed (46)
  1. checksums.yaml +7 -0
  2. data/README.md +47 -0
  3. data/ext/alglib/alglib.cpp +537 -0
  4. data/ext/alglib/alglib_array_converters.cpp +86 -0
  5. data/ext/alglib/alglib_array_converters.h +15 -0
  6. data/ext/alglib/alglib_utils.cpp +10 -0
  7. data/ext/alglib/alglib_utils.h +6 -0
  8. data/ext/alglib/alglibinternal.cpp +21749 -0
  9. data/ext/alglib/alglibinternal.h +2168 -0
  10. data/ext/alglib/alglibmisc.cpp +9106 -0
  11. data/ext/alglib/alglibmisc.h +2114 -0
  12. data/ext/alglib/ap.cpp +20094 -0
  13. data/ext/alglib/ap.h +7244 -0
  14. data/ext/alglib/dataanalysis.cpp +52588 -0
  15. data/ext/alglib/dataanalysis.h +10601 -0
  16. data/ext/alglib/diffequations.cpp +1342 -0
  17. data/ext/alglib/diffequations.h +282 -0
  18. data/ext/alglib/extconf.rb +5 -0
  19. data/ext/alglib/fasttransforms.cpp +4696 -0
  20. data/ext/alglib/fasttransforms.h +1018 -0
  21. data/ext/alglib/integration.cpp +4249 -0
  22. data/ext/alglib/integration.h +869 -0
  23. data/ext/alglib/interpolation.cpp +74502 -0
  24. data/ext/alglib/interpolation.h +12264 -0
  25. data/ext/alglib/kernels_avx2.cpp +2171 -0
  26. data/ext/alglib/kernels_avx2.h +201 -0
  27. data/ext/alglib/kernels_fma.cpp +1065 -0
  28. data/ext/alglib/kernels_fma.h +137 -0
  29. data/ext/alglib/kernels_sse2.cpp +735 -0
  30. data/ext/alglib/kernels_sse2.h +100 -0
  31. data/ext/alglib/linalg.cpp +65182 -0
  32. data/ext/alglib/linalg.h +9927 -0
  33. data/ext/alglib/optimization.cpp +135331 -0
  34. data/ext/alglib/optimization.h +19235 -0
  35. data/ext/alglib/solvers.cpp +20488 -0
  36. data/ext/alglib/solvers.h +4781 -0
  37. data/ext/alglib/specialfunctions.cpp +10672 -0
  38. data/ext/alglib/specialfunctions.h +2305 -0
  39. data/ext/alglib/statistics.cpp +19791 -0
  40. data/ext/alglib/statistics.h +1359 -0
  41. data/ext/alglib/stdafx.h +2 -0
  42. data/gpl2.txt +339 -0
  43. data/gpl3.txt +674 -0
  44. data/lib/alglib/version.rb +3 -0
  45. data/lib/alglib.rb +4 -0
  46. metadata +101 -0
data/ext/alglib/kernels_sse2.cpp (the +735-line file above)
@@ -0,0 +1,735 @@
+ /*************************************************************************
+ ALGLIB 4.04.0 (source code generated 2024-12-21)
+ Copyright (c) Sergey Bochkanov (ALGLIB project).
+ 
+ >>> SOURCE LICENSE >>>
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation (www.fsf.org); either version 2 of the
+ License, or (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ 
+ A copy of the GNU General Public License is available at
+ http://www.fsf.org/licensing/licenses
+ >>> END OF LICENSE >>>
+ *************************************************************************/
+ #ifdef _MSC_VER
+ #define _CRT_SECURE_NO_WARNINGS
+ #endif
+ #include "stdafx.h"
+ 
+ //
+ // Must be defined before we include kernel header
+ //
+ #define _ALGLIB_IMPL_DEFINES
+ #define _ALGLIB_INTEGRITY_CHECKS_ONCE
+ 
+ #include "kernels_sse2.h"
+ 
+ // disable some irrelevant warnings
+ #if (AE_COMPILER==AE_MSVC) && !defined(AE_ALL_WARNINGS)
+ #pragma warning(disable:4100)
+ #pragma warning(disable:4127)
+ #pragma warning(disable:4611)
+ #pragma warning(disable:4702)
+ #pragma warning(disable:4996)
+ #endif
+ 
+ namespace alglib_impl
+ {
+ 
+ 
+ 
+ #if !defined(ALGLIB_NO_FAST_KERNELS) && defined(_ALGLIB_HAS_SSE2_INTRINSICS)
+ 
+ double rdotv_sse2(ae_int_t n,
+     /* Real */ const double* x,
+     /* Real */ const double* y,
+     ae_state *_state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t sse2len = n>>1;
+     const ae_int_t unrollLen = (sse2len>>3)<<3;
+     const __m128d* __restrict pX = (const __m128d*)(x);
+     const __m128d* __restrict pY = (const __m128d*)(y);
+     __m128d ans;
+     if(unrollLen >= 8) {
+         __m128d unroll0 = _mm_mul_pd(pX[0], pY[0]);
+         __m128d unroll1 = _mm_mul_pd(pX[1], pY[1]);
+         __m128d unroll2 = _mm_mul_pd(pX[2], pY[2]);
+         __m128d unroll3 = _mm_mul_pd(pX[3], pY[3]);
+         __m128d unroll4 = _mm_mul_pd(pX[4], pY[4]);
+         __m128d unroll5 = _mm_mul_pd(pX[5], pY[5]);
+         __m128d unroll6 = _mm_mul_pd(pX[6], pY[6]);
+         __m128d unroll7 = _mm_mul_pd(pX[7], pY[7]);
+         for(i=8; i<unrollLen; i+=8) {
+             unroll0 = _mm_add_pd(_mm_mul_pd(pX[i], pY[i]), unroll0);
+             unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pY[i+1]), unroll1);
+             unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pY[i+2]), unroll2);
+             unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pY[i+3]), unroll3);
+             unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pY[i+4]), unroll4);
+             unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pY[i+5]), unroll5);
+             unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pY[i+6]), unroll6);
+             unroll7 = _mm_add_pd(_mm_mul_pd(pX[i+7], pY[i+7]), unroll7);
+         }
+         switch(sse2len-unrollLen) {
+         case 7:
+             unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pY[i+6]), unroll6);
+         case 6:
+             unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pY[i+5]), unroll5);
+         case 5:
+             unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pY[i+4]), unroll4);
+         case 4:
+             unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pY[i+3]), unroll3);
+         case 3:
+             unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pY[i+2]), unroll2);
+         case 2:
+             unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pY[i+1]), unroll1);
+         case 1:
+             unroll0 = _mm_add_pd(_mm_mul_pd(pX[i+0], pY[i+0]), unroll0);
+         }
+         ans = _mm_add_pd(
+             _mm_add_pd(_mm_add_pd(unroll0, unroll1), _mm_add_pd(unroll2, unroll3)),
+             _mm_add_pd(_mm_add_pd(unroll4, unroll5), _mm_add_pd(unroll6, unroll7)));
+     }
+     else {
+         switch(sse2len) {
+         case 0:
+             if(n == 0) {
+                 return 0;
+             } else {
+                 return x[0]*y[0];
+             }
+         case 1:
+             ans = _mm_mul_pd(pX[0], pY[0]);
+             break;
+         case 2:
+             ans = _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1]));
+             break;
+         case 3:
+             ans = _mm_add_pd(
+                 _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
+                 _mm_mul_pd(pX[2], pY[2]));
+             break;
+         case 4:
+             ans = _mm_add_pd(
+                 _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
+                 _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3])));
+             break;
+         case 5:
+             ans = _mm_add_pd(
+                 _mm_add_pd(
+                     _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
+                     _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3]))),
+                 _mm_mul_pd(pX[4], pY[4]));
+             break;
+         case 6:
+             ans = _mm_add_pd(
+                 _mm_add_pd(
+                     _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
+                     _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3]))),
+                 _mm_add_pd(_mm_mul_pd(pX[4], pY[4]), _mm_mul_pd(pX[5], pY[5])));
+             break;
+         case 7:
+             ans = _mm_add_pd(
+                 _mm_add_pd(
+                     _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
+                     _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3]))),
+                 _mm_add_pd(
+                     _mm_add_pd(_mm_mul_pd(pX[4], pY[4]), _mm_mul_pd(pX[5], pY[5])),
+                     _mm_mul_pd(pX[6], pY[6])));
+             break;
+         }
+     }
+ 
+     const double *pComps = (const double*)&ans;
+     double scalar = pComps[0] + pComps[1];
+     const ae_int_t tail = sse2len<<1;
+     if(n-tail) {
+         return scalar + x[tail]*y[tail];
+     } else {
+         return scalar;
+     }
+ }
+ 
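Note on the kernel above: rdotv_sse2 is an SSE2 dot product. It processes the two 64-bit lanes of a __m128d at a time, keeps eight independent accumulators so the additions can overlap, and finishes with one scalar element when n is odd. A minimal scalar equivalent, shown only to make the semantics explicit (it is not part of the gem; ae_int_t comes from ap.h), is:

double rdotv_scalar_reference(ae_int_t n, const double *x, const double *y)
{
    /* same sum the SSE2 kernel computes; the vector version merely reassociates it,
       so the last bits of the result may differ */
    double s = 0.0;
    ae_int_t i;
    for(i=0; i<n; i++)
        s += x[i]*y[i];
    return s;
}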
+ double rdotv2_sse2(ae_int_t n,
+     /* Real */ const double* x,
+     ae_state *_state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t sse2len = n>>1;
+     const ae_int_t unrollLen = (sse2len>>3)<<3;
+     const __m128d* __restrict pX = (const __m128d*)(x);
+     __m128d ans;
+     if(unrollLen >= 8) {
+         __m128d unroll0 = _mm_mul_pd(pX[0], pX[0]);
+         __m128d unroll1 = _mm_mul_pd(pX[1], pX[1]);
+         __m128d unroll2 = _mm_mul_pd(pX[2], pX[2]);
+         __m128d unroll3 = _mm_mul_pd(pX[3], pX[3]);
+         __m128d unroll4 = _mm_mul_pd(pX[4], pX[4]);
+         __m128d unroll5 = _mm_mul_pd(pX[5], pX[5]);
+         __m128d unroll6 = _mm_mul_pd(pX[6], pX[6]);
+         __m128d unroll7 = _mm_mul_pd(pX[7], pX[7]);
+         for(i=8; i<unrollLen; i+=8) {
+             unroll0 = _mm_add_pd(_mm_mul_pd(pX[i], pX[i]), unroll0);
+             unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pX[i+1]), unroll1);
+             unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pX[i+2]), unroll2);
+             unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pX[i+3]), unroll3);
+             unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pX[i+4]), unroll4);
+             unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pX[i+5]), unroll5);
+             unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pX[i+6]), unroll6);
+             unroll7 = _mm_add_pd(_mm_mul_pd(pX[i+7], pX[i+7]), unroll7);
+         }
+         switch(sse2len-unrollLen) {
+         case 7:
+             unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pX[i+6]), unroll6);
+         case 6:
+             unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pX[i+5]), unroll5);
+         case 5:
+             unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pX[i+4]), unroll4);
+         case 4:
+             unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pX[i+3]), unroll3);
+         case 3:
+             unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pX[i+2]), unroll2);
+         case 2:
+             unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pX[i+1]), unroll1);
+         case 1:
+             unroll0 = _mm_add_pd(_mm_mul_pd(pX[i+0], pX[i+0]), unroll0);
+         }
+         ans = _mm_add_pd(
+             _mm_add_pd(_mm_add_pd(unroll0, unroll1), _mm_add_pd(unroll2, unroll3)),
+             _mm_add_pd(_mm_add_pd(unroll4, unroll5), _mm_add_pd(unroll6, unroll7)));
+     }
+     else {
+         switch(sse2len) {
+         case 0:
+             if(n == 0) {
+                 return 0;
+             } else {
+                 return x[0]*x[0];
+             }
+         case 1:
+             ans = _mm_mul_pd(pX[0], pX[0]);
+             break;
+         case 2:
+             ans = _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1]));
+             break;
+         case 3:
+             ans = _mm_add_pd(
+                 _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
+                 _mm_mul_pd(pX[2], pX[2]));
+             break;
+         case 4:
+             ans = _mm_add_pd(
+                 _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
+                 _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3])));
+             break;
+         case 5:
+             ans = _mm_add_pd(
+                 _mm_add_pd(
+                     _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
+                     _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3]))),
+                 _mm_mul_pd(pX[4], pX[4]));
+             break;
+         case 6:
+             ans = _mm_add_pd(
+                 _mm_add_pd(
+                     _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
+                     _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3]))),
+                 _mm_add_pd(_mm_mul_pd(pX[4], pX[4]), _mm_mul_pd(pX[5], pX[5])));
+             break;
+         case 7:
+             ans = _mm_add_pd(
+                 _mm_add_pd(
+                     _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
+                     _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3]))),
+                 _mm_add_pd(
+                     _mm_add_pd(_mm_mul_pd(pX[4], pX[4]), _mm_mul_pd(pX[5], pX[5])),
+                     _mm_mul_pd(pX[6], pX[6])));
+             break;
+         }
+     }
+ 
+     const double *pComps = (const double*)&ans;
+     double scalar = pComps[0] + pComps[1];
+     const ae_int_t tail = sse2len<<1;
+     if(n-tail) {
+         return scalar + x[tail]*x[tail];
+     } else {
+         return scalar;
+     }
+ }
+ 
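Both dot-product kernels reduce the two lanes of the __m128d accumulator by reinterpreting its address as const double* and adding pComps[0] + pComps[1]. The same reduction can be written with SSE2 intrinsics alone; a small sketch (illustrative, not taken from ALGLIB) is:

static inline double hsum_pd_sse2(__m128d v)
{
    __m128d hi = _mm_unpackhi_pd(v, v);        /* copy the upper lane into the lower position */
    return _mm_cvtsd_f64(_mm_add_sd(v, hi));   /* lower lane + upper lane, extracted as a scalar */
}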
+ void rcopyv_sse2(const ae_int_t n,
+     /* Real */ const double* __restrict x,
+     /* Real */ double* __restrict y,
+     ae_state* __restrict _state)
+ {
+     ae_int_t i;
+     const ae_int_t sse2len = n>>1;
+     const ae_int_t tail = sse2len<<1;
+     const __m128d* __restrict pSrc = (const __m128d*)(x);
+     __m128d* __restrict pDest = (__m128d*)(y);
+ 
+     for(i=0; i<sse2len; i++)
+         pDest[i] = pSrc[i];
+     if( n-tail )
+         *(double*)(pDest+i) = *(const double*)(pSrc+i);
+ }
+ 
+ void rcopymulv_sse2(const ae_int_t n,
+     const double v,
+     /* Real */ const double* __restrict x,
+     /* Real */ double* __restrict y,
+     const ae_state* __restrict _state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t sse2len = n>>1;
+     const __m128d* __restrict pSrc = (const __m128d*)(x);
+     __m128d* __restrict pDest = (__m128d*)(y);
+     const __m128d sse2v = _mm_set1_pd(v);
+     const ae_int_t tail = sse2len<<1;
+     for(i=0; i<sse2len; i++) {
+         pDest[i] = _mm_mul_pd(sse2v, pSrc[i]);
+     }
+     if(n-tail) {
+         *(double*)(pDest+i) = v * (*(const double*)(pSrc+i));
+     }
+ }
+ 
+ void icopyv_sse2(const ae_int_t n, const ae_int_t* __restrict x,
+     ae_int_t* __restrict y, ae_state* __restrict _state)
+ {
+     const ae_int_t tail = (n*sizeof(ae_int_t)) & 15;
+     const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
+     __m128i *__restrict pDest = (__m128i*)y;
+     const __m128i* __restrict pSrc = (const __m128i*)x;
+     const ae_int_t nVec = even>>4;
+     ae_int_t i;
+     for(i=0; i<nVec; i++) {
+         pDest[i] = pSrc[i];
+     }
+     i = even/sizeof(ae_int_t);
+     if(tail & 8) {
+         *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
+         i += 8/sizeof(ae_int_t);
+     }
+     if(tail & 4) {
+         *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
+     }
+ }
+ 
+ void bcopyv_sse2(const ae_int_t n, const ae_bool* __restrict x,
+     ae_bool* __restrict y, ae_state* __restrict _state)
+ {
+     const ae_int_t tail = n & 15;
+     const ae_int_t even = n-tail;
+     __m128i *__restrict pDest = (__m128i*)y;
+     const __m128i* __restrict pSrc = (const __m128i*)x;
+     const ae_int_t nVec = even>>4;
+     ae_int_t i;
+     for(i=0; i<nVec; i++) {
+         pDest[i] = pSrc[i];
+     }
+     i = even;
+     if(tail & 8) {
+         *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
+         i += 8;
+     }
+     if(tail & 4) {
+         *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
+         i += 4;
+     }
+     if(tail & 2)
+     {
+         *(y+i+0) = *(x+i+0);
+         *(y+i+1) = *(x+i+1);
+         i += 2;
+     }
+     if(tail & 1) {
+         *(y+i) = *(x+i);
+     }
+ }
+ 
+ void rsetv_sse2(const ae_int_t n,
+     const double v,
+     /* Real */ double* __restrict x,
+     const ae_state* __restrict _state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t sse2len = n>>1;
+     __m128d* __restrict pDest = (__m128d*)(x);
+     const __m128d sse2v = _mm_set1_pd(v);
+     for(i=0; i<sse2len; i++) {
+         pDest[i] = sse2v;
+     }
+     const ae_int_t tail = sse2len<<1;
+     if(n-tail) {
+         *(double*)(pDest+i) = v;
+     }
+ }
+ 
+ void rsetvx_sse2(const ae_int_t n,
+     const double v,
+     /* Real */ double* __restrict x,
+     const ae_state* __restrict _state)
+ {
+     if( n<=4 )
+     {
+         ae_int_t j;
+         for(j=0; j<=n-1; j++)
+             x[j] = v;
+         return;
+     }
+     if((((ptrdiff_t)x) & 15) == 0)
+     {
+         rsetv_sse2(n, v, x, _state);
+         return;
+     }
+     x[0] = v;
+     rsetv_sse2(n-1, v, x+1, _state);
+ }
+ 
+ void isetv_sse2(const ae_int_t n, const ae_int_t v,
+     ae_int_t* __restrict x, ae_state* __restrict _state)
+ {
+     const ae_int_t tail = (n*sizeof(ae_int_t)) & 15;
+     const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
+     __m128i *__restrict pDest = (__m128i*)x;
+     const ae_int_t v2[2] = {v, v};
+     const __m128i sse2v = ((sizeof(v) == 4) ? _mm_set1_epi32((ae_int32_t)v) : _mm_loadu_si128((const __m128i*)(&v2[0])));
+     const ae_int_t nVec = even>>4;
+     ae_int_t i;
+     for(i=0; i<nVec; i++) {
+         pDest[i] = sse2v;
+     }
+     memmove(pDest+i, &sse2v, tail);
+ }
+ 
+ void bsetv_sse2(const ae_int_t n, const ae_bool v, ae_bool* __restrict x,
+     ae_state* __restrict _state)
+ {
+     const ae_int_t tail = n & 15;
+     const ae_int_t even = n-tail;
+     __m128i *__restrict pDest = (__m128i*)x;
+     const __m128i sse2v = _mm_set1_epi8(v);
+     const ae_int_t nVec = even>>4;
+     ae_int_t i;
+     for(i=0; i<nVec; i++) {
+         pDest[i] = sse2v;
+     }
+     /* _mm_storel_epi64() has a too high latency and too low throughput on the recent (Skylake+) processors */
+     memset(x+even, v, tail);
+ }
+ 
+ void rmulv_sse2(const ae_int_t n, const double v, double* __restrict x,
+     const ae_state* __restrict _state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t sse2len = n>>1;
+     __m128d* __restrict pDest = (__m128d*)(x);
+     const __m128d sse2v = _mm_set1_pd(v);
+     for(i=0; i<sse2len; i++) {
+         pDest[i] = _mm_mul_pd(sse2v, pDest[i]);
+     }
+     const ae_int_t tail = sse2len<<1;
+     if(n-tail) {
+         *(double*)(pDest+i) = v * (*(const double*)(pDest+i));
+     }
+ }
+ 
+ void rmulvx_sse2(const ae_int_t n, const double v, double* __restrict x,
+     const ae_state* __restrict _state)
+ {
+     if( n<=4 )
+     {
+         ae_int_t i;
+         for(i=0; i<=n-1; i++)
+             x[i] *= v;
+         return;
+     }
+     if((((ptrdiff_t)x) & 15) == 0)
+     {
+         rmulv_sse2(n, v, x, _state);
+         return;
+     }
+     x[0] = v * x[0];
+     rmulv_sse2(n-1, v, x+1, _state);
+ }
+ 
+ void raddv_sse2(const ae_int_t n,
+     const double alpha,
+     /* Real */ const double* __restrict y,
+     /* Real */ double* __restrict x,
+     const ae_state* __restrict _state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t sse2len = n>>1;
+     const __m128d* __restrict pSrc = (const __m128d*)(y);
+     __m128d* __restrict pDest = (__m128d*)(x);
+     const __m128d sse2alpha = _mm_set1_pd(alpha);
+     for(i=0; i<sse2len; i++) {
+         pDest[i] = _mm_add_pd(_mm_mul_pd(sse2alpha, pSrc[i]), pDest[i]);
+     }
+     const ae_int_t tail = sse2len<<1;
+     if(n-tail) {
+         *(double*)(pDest+i) = alpha * (*(const double*)(pSrc+i))
+             + (*(const double*)(pDest+i));
+     }
+ }
+ 
+ void raddvx_sse2_xaligned(const ae_int_t n, const double alpha,
+     const double* __restrict y, double* __restrict x, ae_state *_state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t vecLen = (n>>1)<<1;
+     const __m128d sse2alpha = _mm_set1_pd(alpha);
+     __m128d * __restrict pDest = (__m128d*)x;
+     for(i=0; i<vecLen; i+=2)
+     {
+         const ae_int_t iDest = i>>1;
+         pDest[iDest] = _mm_add_pd(_mm_mul_pd(sse2alpha, _mm_loadu_pd(y+i)), pDest[iDest]);
+     }
+     if(n-vecLen)
+         x[i] += alpha*y[i];
+ }
+ 
+ void raddvx_sse2(const ae_int_t n, const double alpha,
+     const double* __restrict y, double* __restrict x, ae_state *_state)
+ {
+     if( n<=4 )
+     {
+         ae_int_t i;
+         for(i=0; i<=n-1; i++)
+             x[i] += alpha*y[i];
+         return;
+     }
+     if((((ptrdiff_t)x) & 15) == 0)
+     {
+         raddvx_sse2_xaligned(n, alpha, y, x, _state);
+         return;
+     }
+     x[0] += alpha*y[0];
+     raddvx_sse2_xaligned(n-1, alpha, y+1, x+1, _state);
+ }
+ 
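rsetvx_sse2, rmulvx_sse2 and raddvx_sse2 above all follow one pattern for a possibly unaligned x: fall back to a scalar loop for very short vectors, call the aligned kernel directly when x already sits on a 16-byte boundary, and otherwise peel exactly one element so that x+1 is 16-byte aligned (a double* is at least 8-byte aligned, so one peel always suffices). A generic sketch of the idea, with hypothetical names, is:

void axpy_with_alignment_peel(ae_int_t n, double alpha, const double *y, double *x)
{
    if( ((((ptrdiff_t)x) & 15) != 0) && n > 0 )
    {
        x[0] += alpha*y[0];        /* one scalar step moves x to a 16-byte boundary */
        x++; y++; n--;
    }
    /* ...aligned SSE2 loop over the remaining n elements, as in raddvx_sse2_xaligned... */
}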
+ void rmergemulv_sse2(const ae_int_t n,
+     /* Real */ const double* __restrict y,
+     /* Real */ double* __restrict x,
+     const ae_state* __restrict _state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t sse2len = n>>1;
+     const __m128d* __restrict pSrc = (const __m128d*)(y);
+     __m128d* __restrict pDest = (__m128d*)(x);
+     for(i=0; i<sse2len; i++) {
+         pDest[i] = _mm_mul_pd(pSrc[i], pDest[i]);
+     }
+     const ae_int_t tail = sse2len<<1;
+     if(n-tail) {
+         *(double*)(pDest+i) = (*(const double*)(pSrc+i))
+             * (*(const double*)(pDest+i));
+     }
+ }
+ 
+ void rmergemaxv_sse2(const ae_int_t n,
+     /* Real */ const double* __restrict y,
+     /* Real */ double* __restrict x,
+     ae_state* __restrict _state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t sse2len = n>>1;
+     const __m128d* __restrict pSrc = (const __m128d*)(y);
+     __m128d* __restrict pDest = (__m128d*)(x);
+     for(i=0; i<sse2len; i++) {
+         pDest[i] = _mm_max_pd(pSrc[i], pDest[i]);
+     }
+     const ae_int_t tail = sse2len<<1;
+     if(n-tail) {
+         *(double*)(pDest+i) = ae_maxreal(*(const double*)(pSrc+i),
+             *(const double*)(pDest+i), _state);
+     }
+ }
+ 
+ void rmergeminv_sse2(const ae_int_t n,
+     /* Real */ const double* __restrict y,
+     /* Real */ double* __restrict x,
+     ae_state* __restrict _state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t sse2len = n>>1;
+     const __m128d* __restrict pSrc = (const __m128d*)(y);
+     __m128d* __restrict pDest = (__m128d*)(x);
+     for(i=0; i<sse2len; i++) {
+         pDest[i] = _mm_min_pd(pSrc[i], pDest[i]);
+     }
+     const ae_int_t tail = sse2len<<1;
+     if(n-tail) {
+         *(double*)(pDest+i) = ae_minreal(*(const double*)(pSrc+i),
+             *(const double*)(pDest+i), _state);
+     }
+ }
+ 
+ double rmaxv_sse2(ae_int_t n, /* Real */ const double* __restrict x, ae_state* __restrict _state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t sse2len = n>>1;
+     const __m128d* __restrict pSrc = (const __m128d*)(x);
+     if( n<=4 )
+     {
+         double result;
+         if(n == 0)
+             return 0.0;
+         result = x[0];
+         for(i=1; i<=n-1; i++)
+         {
+             double v = x[i];
+             if( v>result )
+                 result = v;
+         }
+         return result;
+     }
+     __m128d curMax = pSrc[0];
+     for(i=1; i<sse2len; i++) {
+         curMax = _mm_max_pd(curMax, pSrc[i]);
+     }
+     const double *pComps = (const double *)&curMax;
+     const double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
+     const ae_int_t tail = sse2len<<1;
+     if(n-tail) {
+         const double candidate = *(const double*)(pSrc+i);
+         return (candidate > dMax) ? candidate : dMax;
+     }
+     else {
+         return dMax;
+     }
+ }
+ 
+ double rmaxabsv_sse2(ae_int_t n, /* Real */ const double* __restrict x, ae_state* __restrict _state)
+ {
+     const __m128d signMask = _mm_set1_pd(-0.); // -0. = 1 << 63
+     const ae_int_t sse2len = n>>1;
+     const __m128d* __restrict pSrc = (const __m128d*)(x);
+     if( n<=4 )
+     {
+         double result;
+         ae_int_t i;
+         result = 0;
+         for(i=0; i<=n-1; i++)
+         {
+             double v = fabs(x[i]);
+             if( v>result )
+                 result = v;
+         }
+         return result;
+     }
+     __m128d curMax = _mm_andnot_pd(signMask, pSrc[0]); // abs
+     ae_int_t i;
+     for(i=1; i<sse2len; i++)
+         curMax = _mm_max_pd(curMax, _mm_andnot_pd(signMask, pSrc[i])); // abs
+     const double *pComps = (const double *)&curMax;
+     const double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
+     const ae_int_t tail = sse2len<<1;
+     if(n-tail) {
+         const double candidate = ae_fabs(*(const double*)(pSrc+i), _state);
+         return (candidate > dMax) ? candidate : dMax;
+     }
+     else {
+         return dMax;
+     }
+ }
+ 
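The sign-mask trick in rmaxabsv_sse2: -0.0 is the double whose only set bit is the sign bit, and _mm_andnot_pd(mask, v) computes (~mask) & v, so AND-NOT against _mm_set1_pd(-0.) clears the sign bit of both lanes, giving a branch-free two-lane fabs. Spelled out on its own (illustration only):

static inline __m128d abs_pd_sse2(__m128d v)
{
    const __m128d sign_mask = _mm_set1_pd(-0.0);   /* 0x8000000000000000 in each lane */
    return _mm_andnot_pd(sign_mask, v);            /* (~sign_mask) & v clears the sign bits */
}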
+ static void rcopyvx_sse2_xaligned(const ae_int_t n, const double* __restrict x,
+     double* __restrict y, ae_state *_state)
+ {
+     ae_int_t i;
+ 
+     const ae_int_t vecLen = (n>>1)<<1;
+     const __m128d * __restrict pSrc = (const __m128d*)x;
+     for(i=0; i<vecLen; i+=2) {
+         const ae_int_t iSrc = i>>1;
+         _mm_storeu_pd(y+i, pSrc[iSrc]);
+     }
+     if(n-vecLen) {
+         y[i] = x[i];
+     }
+ }
+ 
+ void rcopyvx_sse2(const ae_int_t n, const double* __restrict x,
+     double* __restrict y, ae_state *_state)
+ {
+     if((((ptrdiff_t)x) & 15) == 0)
+     {
+         rcopyvx_sse2_xaligned(n, x, y, _state);
+         return;
+     }
+     y[0] = x[0];
+     rcopyvx_sse2_xaligned(n-1, x+1, y+1, _state);
+ }
+ 
+ static void icopyvx_sse2_xaligned(const ae_int_t n, const ae_int_t* __restrict x,
+     ae_int_t* __restrict y, ae_state* __restrict _state)
+ {
+     const ae_int_t tail = (n*sizeof(ae_int_t)) & 15;
+     const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
+     const __m128i* __restrict pSrc = (const __m128i*)x;
+     const ae_int_t nVec = even>>4;
+     const ae_int_t shift_by = 2-sizeof(ae_int_t)/8;
+     ae_int_t i;
+     for(i=0; i<nVec; i++) {
+         const ae_int_t j = i<<shift_by;
+         _mm_storeu_si128((__m128i*)(y+j), pSrc[i]);
+     }
+     i = even/sizeof(ae_int_t);
+     if(tail & 8) {
+         *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
+         i += 8/sizeof(ae_int_t);
+     }
+     if(tail & 4) {
+         *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
+     }
+ }
+ 
+ void icopyvx_sse2(const ae_int_t n, const ae_int_t* __restrict x,
+     ae_int_t* __restrict y, ae_state* __restrict _state)
+ {
+     const ptrdiff_t unal = ((ptrdiff_t)x) & 15;
+     if( n<=8 )
+     {
+         ae_int_t j;
+         for(j=0; j<=n-1; j++)
+             y[j] = x[j];
+         return;
+     }
+     if(unal == 0)
+     {
+         icopyvx_sse2_xaligned(n, x, y, _state);
+         return;
+     }
+     const ae_int_t offset = 16-unal;
+     memmove(y, x, offset);
+     const ae_int_t nDone = offset / sizeof(ae_int_t);
+     icopyvx_sse2_xaligned(n-nDone, x+nDone, y+nDone, _state);
+ }
+ 
+ /* ALGLIB_NO_FAST_KERNELS, _ALGLIB_HAS_SSE2_INTRINSICS */
+ #endif
+ 
+ 
+ }
+ 