alglib4 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/README.md +47 -0
  3. data/ext/alglib/alglib.cpp +537 -0
  4. data/ext/alglib/alglib_array_converters.cpp +86 -0
  5. data/ext/alglib/alglib_array_converters.h +15 -0
  6. data/ext/alglib/alglib_utils.cpp +10 -0
  7. data/ext/alglib/alglib_utils.h +6 -0
  8. data/ext/alglib/alglibinternal.cpp +21749 -0
  9. data/ext/alglib/alglibinternal.h +2168 -0
  10. data/ext/alglib/alglibmisc.cpp +9106 -0
  11. data/ext/alglib/alglibmisc.h +2114 -0
  12. data/ext/alglib/ap.cpp +20094 -0
  13. data/ext/alglib/ap.h +7244 -0
  14. data/ext/alglib/dataanalysis.cpp +52588 -0
  15. data/ext/alglib/dataanalysis.h +10601 -0
  16. data/ext/alglib/diffequations.cpp +1342 -0
  17. data/ext/alglib/diffequations.h +282 -0
  18. data/ext/alglib/extconf.rb +5 -0
  19. data/ext/alglib/fasttransforms.cpp +4696 -0
  20. data/ext/alglib/fasttransforms.h +1018 -0
  21. data/ext/alglib/integration.cpp +4249 -0
  22. data/ext/alglib/integration.h +869 -0
  23. data/ext/alglib/interpolation.cpp +74502 -0
  24. data/ext/alglib/interpolation.h +12264 -0
  25. data/ext/alglib/kernels_avx2.cpp +2171 -0
  26. data/ext/alglib/kernels_avx2.h +201 -0
  27. data/ext/alglib/kernels_fma.cpp +1065 -0
  28. data/ext/alglib/kernels_fma.h +137 -0
  29. data/ext/alglib/kernels_sse2.cpp +735 -0
  30. data/ext/alglib/kernels_sse2.h +100 -0
  31. data/ext/alglib/linalg.cpp +65182 -0
  32. data/ext/alglib/linalg.h +9927 -0
  33. data/ext/alglib/optimization.cpp +135331 -0
  34. data/ext/alglib/optimization.h +19235 -0
  35. data/ext/alglib/solvers.cpp +20488 -0
  36. data/ext/alglib/solvers.h +4781 -0
  37. data/ext/alglib/specialfunctions.cpp +10672 -0
  38. data/ext/alglib/specialfunctions.h +2305 -0
  39. data/ext/alglib/statistics.cpp +19791 -0
  40. data/ext/alglib/statistics.h +1359 -0
  41. data/ext/alglib/stdafx.h +2 -0
  42. data/gpl2.txt +339 -0
  43. data/gpl3.txt +674 -0
  44. data/lib/alglib/version.rb +3 -0
  45. data/lib/alglib.rb +4 -0
  46. metadata +101 -0
data/ext/alglib/kernels_sse2.cpp
@@ -0,0 +1,735 @@
+/*************************************************************************
+ALGLIB 4.04.0 (source code generated 2024-12-21)
+Copyright (c) Sergey Bochkanov (ALGLIB project).
+
+>>> SOURCE LICENSE >>>
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation (www.fsf.org); either version 2 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+A copy of the GNU General Public License is available at
+http://www.fsf.org/licensing/licenses
+>>> END OF LICENSE >>>
+*************************************************************************/
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+#include "stdafx.h"
+
+//
+// Must be defined before we include kernel header
+//
+#define _ALGLIB_IMPL_DEFINES
+#define _ALGLIB_INTEGRITY_CHECKS_ONCE
+
+#include "kernels_sse2.h"
+
+// disable some irrelevant warnings
+#if (AE_COMPILER==AE_MSVC) && !defined(AE_ALL_WARNINGS)
+#pragma warning(disable:4100)
+#pragma warning(disable:4127)
+#pragma warning(disable:4611)
+#pragma warning(disable:4702)
+#pragma warning(disable:4996)
+#endif
+
+namespace alglib_impl
+{
+
+
+
+#if !defined(ALGLIB_NO_FAST_KERNELS) && defined(_ALGLIB_HAS_SSE2_INTRINSICS)
+
+double rdotv_sse2(ae_int_t n,
+    /* Real */ const double* x,
+    /* Real */ const double* y,
+    ae_state *_state)
+{
+    ae_int_t i;
+
+    const ae_int_t sse2len = n>>1;
+    const ae_int_t unrollLen = (sse2len>>3)<<3;
+    const __m128d* __restrict pX = (const __m128d*)(x);
+    const __m128d* __restrict pY = (const __m128d*)(y);
+    __m128d ans;
+    if(unrollLen >= 8) {
+        __m128d unroll0 = _mm_mul_pd(pX[0], pY[0]);
+        __m128d unroll1 = _mm_mul_pd(pX[1], pY[1]);
+        __m128d unroll2 = _mm_mul_pd(pX[2], pY[2]);
+        __m128d unroll3 = _mm_mul_pd(pX[3], pY[3]);
+        __m128d unroll4 = _mm_mul_pd(pX[4], pY[4]);
+        __m128d unroll5 = _mm_mul_pd(pX[5], pY[5]);
+        __m128d unroll6 = _mm_mul_pd(pX[6], pY[6]);
+        __m128d unroll7 = _mm_mul_pd(pX[7], pY[7]);
+        for(i=8; i<unrollLen; i+=8) {
+            unroll0 = _mm_add_pd(_mm_mul_pd(pX[i], pY[i]), unroll0);
+            unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pY[i+1]), unroll1);
+            unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pY[i+2]), unroll2);
+            unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pY[i+3]), unroll3);
+            unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pY[i+4]), unroll4);
+            unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pY[i+5]), unroll5);
+            unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pY[i+6]), unroll6);
+            unroll7 = _mm_add_pd(_mm_mul_pd(pX[i+7], pY[i+7]), unroll7);
+        }
+        switch(sse2len-unrollLen) {
+        case 7:
+            unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pY[i+6]), unroll6);
+        case 6:
+            unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pY[i+5]), unroll5);
+        case 5:
+            unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pY[i+4]), unroll4);
+        case 4:
+            unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pY[i+3]), unroll3);
+        case 3:
+            unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pY[i+2]), unroll2);
+        case 2:
+            unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pY[i+1]), unroll1);
+        case 1:
+            unroll0 = _mm_add_pd(_mm_mul_pd(pX[i+0], pY[i+0]), unroll0);
+        }
+        ans = _mm_add_pd(
+            _mm_add_pd(_mm_add_pd(unroll0, unroll1), _mm_add_pd(unroll2, unroll3)),
+            _mm_add_pd(_mm_add_pd(unroll4, unroll5), _mm_add_pd(unroll6, unroll7)));
+    }
+    else {
+        switch(sse2len) {
+        case 0:
+            if(n == 0) {
+                return 0;
+            } else {
+                return x[0]*y[0];
+            }
+        case 1:
+            ans = _mm_mul_pd(pX[0], pY[0]);
+            break;
+        case 2:
+            ans = _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1]));
+            break;
+        case 3:
+            ans = _mm_add_pd(
+                _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
+                _mm_mul_pd(pX[2], pY[2]));
+            break;
+        case 4:
+            ans = _mm_add_pd(
+                _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
+                _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3])));
+            break;
+        case 5:
+            ans = _mm_add_pd(
+                _mm_add_pd(
+                    _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
+                    _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3]))),
+                _mm_mul_pd(pX[4], pY[4]));
+            break;
+        case 6:
+            ans = _mm_add_pd(
+                _mm_add_pd(
+                    _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
+                    _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3]))),
+                _mm_add_pd(_mm_mul_pd(pX[4], pY[4]), _mm_mul_pd(pX[5], pY[5])));
+            break;
+        case 7:
+            ans = _mm_add_pd(
+                _mm_add_pd(
+                    _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
+                    _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3]))),
+                _mm_add_pd(
+                    _mm_add_pd(_mm_mul_pd(pX[4], pY[4]), _mm_mul_pd(pX[5], pY[5])),
+                    _mm_mul_pd(pX[6], pY[6])));
+            break;
+        }
+    }
+
+    const double *pComps = (const double*)&ans;
+    double scalar = pComps[0] + pComps[1];
+    const ae_int_t tail = sse2len<<1;
+    if(n-tail) {
+        return scalar + x[tail]*y[tail];
+    } else {
+        return scalar;
+    }
+}
+
+double rdotv2_sse2(ae_int_t n,
+    /* Real */ const double* x,
+    ae_state *_state)
+{
+    ae_int_t i;
+
+    const ae_int_t sse2len = n>>1;
+    const ae_int_t unrollLen = (sse2len>>3)<<3;
+    const __m128d* __restrict pX = (const __m128d*)(x);
+    __m128d ans;
+    if(unrollLen >= 8) {
+        __m128d unroll0 = _mm_mul_pd(pX[0], pX[0]);
+        __m128d unroll1 = _mm_mul_pd(pX[1], pX[1]);
+        __m128d unroll2 = _mm_mul_pd(pX[2], pX[2]);
+        __m128d unroll3 = _mm_mul_pd(pX[3], pX[3]);
+        __m128d unroll4 = _mm_mul_pd(pX[4], pX[4]);
+        __m128d unroll5 = _mm_mul_pd(pX[5], pX[5]);
+        __m128d unroll6 = _mm_mul_pd(pX[6], pX[6]);
+        __m128d unroll7 = _mm_mul_pd(pX[7], pX[7]);
+        for(i=8; i<unrollLen; i+=8) {
+            unroll0 = _mm_add_pd(_mm_mul_pd(pX[i], pX[i]), unroll0);
+            unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pX[i+1]), unroll1);
+            unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pX[i+2]), unroll2);
+            unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pX[i+3]), unroll3);
+            unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pX[i+4]), unroll4);
+            unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pX[i+5]), unroll5);
+            unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pX[i+6]), unroll6);
+            unroll7 = _mm_add_pd(_mm_mul_pd(pX[i+7], pX[i+7]), unroll7);
+        }
+        switch(sse2len-unrollLen) {
+        case 7:
+            unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pX[i+6]), unroll6);
+        case 6:
+            unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pX[i+5]), unroll5);
+        case 5:
+            unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pX[i+4]), unroll4);
+        case 4:
+            unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pX[i+3]), unroll3);
+        case 3:
+            unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pX[i+2]), unroll2);
+        case 2:
+            unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pX[i+1]), unroll1);
+        case 1:
+            unroll0 = _mm_add_pd(_mm_mul_pd(pX[i+0], pX[i+0]), unroll0);
+        }
+        ans = _mm_add_pd(
+            _mm_add_pd(_mm_add_pd(unroll0, unroll1), _mm_add_pd(unroll2, unroll3)),
+            _mm_add_pd(_mm_add_pd(unroll4, unroll5), _mm_add_pd(unroll6, unroll7)));
+    }
+    else {
+        switch(sse2len) {
+        case 0:
+            if(n == 0) {
+                return 0;
+            } else {
+                return x[0]*x[0];
+            }
+        case 1:
+            ans = _mm_mul_pd(pX[0], pX[0]);
+            break;
+        case 2:
+            ans = _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1]));
+            break;
+        case 3:
+            ans = _mm_add_pd(
+                _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
+                _mm_mul_pd(pX[2], pX[2]));
+            break;
+        case 4:
+            ans = _mm_add_pd(
+                _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
+                _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3])));
+            break;
+        case 5:
+            ans = _mm_add_pd(
+                _mm_add_pd(
+                    _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
+                    _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3]))),
+                _mm_mul_pd(pX[4], pX[4]));
+            break;
+        case 6:
+            ans = _mm_add_pd(
+                _mm_add_pd(
+                    _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
+                    _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3]))),
+                _mm_add_pd(_mm_mul_pd(pX[4], pX[4]), _mm_mul_pd(pX[5], pX[5])));
+            break;
+        case 7:
+            ans = _mm_add_pd(
+                _mm_add_pd(
+                    _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
+                    _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3]))),
+                _mm_add_pd(
+                    _mm_add_pd(_mm_mul_pd(pX[4], pX[4]), _mm_mul_pd(pX[5], pX[5])),
+                    _mm_mul_pd(pX[6], pX[6])));
+            break;
+        }
+    }
+
+    const double *pComps = (const double*)&ans;
+    double scalar = pComps[0] + pComps[1];
+    const ae_int_t tail = sse2len<<1;
+    if(n-tail) {
+        return scalar + x[tail]*x[tail];
+    } else {
+        return scalar;
+    }
+}
+
+void rcopyv_sse2(const ae_int_t n,
+    /* Real */ const double* __restrict x,
+    /* Real */ double* __restrict y,
+    ae_state* __restrict _state)
+{
+    ae_int_t i;
+    const ae_int_t sse2len = n>>1;
+    const ae_int_t tail = sse2len<<1;
+    const __m128d* __restrict pSrc = (const __m128d*)(x);
+    __m128d* __restrict pDest = (__m128d*)(y);
+
+    for(i=0; i<sse2len; i++)
+        pDest[i] = pSrc[i];
+    if( n-tail )
+        *(double*)(pDest+i) = *(const double*)(pSrc+i);
+}
+
+void rcopymulv_sse2(const ae_int_t n,
+    const double v,
+    /* Real */ const double* __restrict x,
+    /* Real */ double* __restrict y,
+    const ae_state* __restrict _state)
+{
+    ae_int_t i;
+
+    const ae_int_t sse2len = n>>1;
+    const __m128d* __restrict pSrc = (const __m128d*)(x);
+    __m128d* __restrict pDest = (__m128d*)(y);
+    const __m128d sse2v = _mm_set1_pd(v);
+    const ae_int_t tail = sse2len<<1;
+    for(i=0; i<sse2len; i++) {
+        pDest[i] = _mm_mul_pd(sse2v, pSrc[i]);
+    }
+    if(n-tail) {
+        *(double*)(pDest+i) = v * (*(const double*)(pSrc+i));
+    }
+}
+
+void icopyv_sse2(const ae_int_t n, const ae_int_t* __restrict x,
+    ae_int_t* __restrict y, ae_state* __restrict _state)
+{
+    const ae_int_t tail = (n*sizeof(ae_int_t)) & 15;
+    const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
+    __m128i *__restrict pDest = (__m128i*)y;
+    const __m128i* __restrict pSrc = (const __m128i*)x;
+    const ae_int_t nVec = even>>4;
+    ae_int_t i;
+    for(i=0; i<nVec; i++) {
+        pDest[i] = pSrc[i];
+    }
+    i = even/sizeof(ae_int_t);
+    if(tail & 8) {
+        *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
+        i += 8/sizeof(ae_int_t);
+    }
+    if(tail & 4) {
+        *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
+    }
+}
+
+void bcopyv_sse2(const ae_int_t n, const ae_bool* __restrict x,
+    ae_bool* __restrict y, ae_state* __restrict _state)
+{
+    const ae_int_t tail = n & 15;
+    const ae_int_t even = n-tail;
+    __m128i *__restrict pDest = (__m128i*)y;
+    const __m128i* __restrict pSrc = (const __m128i*)x;
+    const ae_int_t nVec = even>>4;
+    ae_int_t i;
+    for(i=0; i<nVec; i++) {
+        pDest[i] = pSrc[i];
+    }
+    i = even;
+    if(tail & 8) {
+        *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
+        i += 8;
+    }
+    if(tail & 4) {
+        *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
+        i += 4;
+    }
+    if(tail & 2)
+    {
+        *(y+i+0) = *(x+i+0);
+        *(y+i+1) = *(x+i+1);
+        i += 2;
+    }
+    if(tail & 1) {
+        *(y+i) = *(x+i);
+    }
+}
+
+void rsetv_sse2(const ae_int_t n,
+    const double v,
+    /* Real */ double* __restrict x,
+    const ae_state* __restrict _state)
+{
+    ae_int_t i;
+
+    const ae_int_t sse2len = n>>1;
+    __m128d* __restrict pDest = (__m128d*)(x);
+    const __m128d sse2v = _mm_set1_pd(v);
+    for(i=0; i<sse2len; i++) {
+        pDest[i] = sse2v;
+    }
+    const ae_int_t tail = sse2len<<1;
+    if(n-tail) {
+        *(double*)(pDest+i) = v;
+    }
+}
+
+void rsetvx_sse2(const ae_int_t n,
+    const double v,
+    /* Real */ double* __restrict x,
+    const ae_state* __restrict _state)
+{
+    if( n<=4 )
+    {
+        ae_int_t j;
+        for(j=0; j<=n-1; j++)
+            x[j] = v;
+        return;
+    }
+    if((((ptrdiff_t)x) & 15) == 0)
+    {
+        rsetv_sse2(n, v, x, _state);
+        return;
+    }
+    x[0] = v;
+    rsetv_sse2(n-1, v, x+1, _state);
+}
+
+void isetv_sse2(const ae_int_t n, const ae_int_t v,
+    ae_int_t* __restrict x, ae_state* __restrict _state)
+{
+    const ae_int_t tail = (n*sizeof(ae_int_t)) & 15;
+    const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
+    __m128i *__restrict pDest = (__m128i*)x;
+    const ae_int_t v2[2] = {v, v};
+    const __m128i sse2v = ((sizeof(v) == 4) ? _mm_set1_epi32((ae_int32_t)v) : _mm_loadu_si128((const __m128i*)(&v2[0])));
+    const ae_int_t nVec = even>>4;
+    ae_int_t i;
+    for(i=0; i<nVec; i++) {
+        pDest[i] = sse2v;
+    }
+    memmove(pDest+i, &sse2v, tail);
+}
+
+void bsetv_sse2(const ae_int_t n, const ae_bool v, ae_bool* __restrict x,
+    ae_state* __restrict _state)
+{
+    const ae_int_t tail = n & 15;
+    const ae_int_t even = n-tail;
+    __m128i *__restrict pDest = (__m128i*)x;
+    const __m128i sse2v = _mm_set1_epi8(v);
+    const ae_int_t nVec = even>>4;
+    ae_int_t i;
+    for(i=0; i<nVec; i++) {
+        pDest[i] = sse2v;
+    }
+    /* _mm_storel_epi64() has a too high latency and too low throughput on the recent (Skylake+) processors */
+    memset(x+even, v, tail);
+}
+
+void rmulv_sse2(const ae_int_t n, const double v, double* __restrict x,
+    const ae_state* __restrict _state)
+{
+    ae_int_t i;
+
+    const ae_int_t sse2len = n>>1;
+    __m128d* __restrict pDest = (__m128d*)(x);
+    const __m128d sse2v = _mm_set1_pd(v);
+    for(i=0; i<sse2len; i++) {
+        pDest[i] = _mm_mul_pd(sse2v, pDest[i]);
+    }
+    const ae_int_t tail = sse2len<<1;
+    if(n-tail) {
+        *(double*)(pDest+i) = v * (*(const double*)(pDest+i));
+    }
+}
+
+void rmulvx_sse2(const ae_int_t n, const double v, double* __restrict x,
+    const ae_state* __restrict _state)
+{
+    if( n<=4 )
+    {
+        ae_int_t i;
+        for(i=0; i<=n-1; i++)
+            x[i] *= v;
+        return;
+    }
+    if((((ptrdiff_t)x) & 15) == 0)
+    {
+        rmulv_sse2(n, v, x, _state);
+        return;
+    }
+    x[0] = v * x[0];
+    rmulv_sse2(n-1, v, x+1, _state);
+}
+
+void raddv_sse2(const ae_int_t n,
+    const double alpha,
+    /* Real */ const double* __restrict y,
+    /* Real */ double* __restrict x,
+    const ae_state* __restrict _state)
+{
+    ae_int_t i;
+
+    const ae_int_t sse2len = n>>1;
+    const __m128d* __restrict pSrc = (const __m128d*)(y);
+    __m128d* __restrict pDest = (__m128d*)(x);
+    const __m128d sse2alpha = _mm_set1_pd(alpha);
+    for(i=0; i<sse2len; i++) {
+        pDest[i] = _mm_add_pd(_mm_mul_pd(sse2alpha, pSrc[i]), pDest[i]);
+    }
+    const ae_int_t tail = sse2len<<1;
+    if(n-tail) {
+        *(double*)(pDest+i) = alpha * (*(const double*)(pSrc+i))
+            + (*(const double*)(pDest+i));
+    }
+}
+
+void raddvx_sse2_xaligned(const ae_int_t n, const double alpha,
+    const double* __restrict y, double* __restrict x, ae_state *_state)
+{
+    ae_int_t i;
+
+    const ae_int_t vecLen = (n>>1)<<1;
+    const __m128d sse2alpha = _mm_set1_pd(alpha);
+    __m128d * __restrict pDest = (__m128d*)x;
+    for(i=0; i<vecLen; i+=2)
+    {
+        const ae_int_t iDest = i>>1;
+        pDest[iDest] = _mm_add_pd(_mm_mul_pd(sse2alpha, _mm_loadu_pd(y+i)), pDest[iDest]);
+    }
+    if(n-vecLen)
+        x[i] += alpha*y[i];
+}
+
+void raddvx_sse2(const ae_int_t n, const double alpha,
+    const double* __restrict y, double* __restrict x, ae_state *_state)
+{
+    if( n<=4 )
+    {
+        ae_int_t i;
+        for(i=0; i<=n-1; i++)
+            x[i] += alpha*y[i];
+        return;
+    }
+    if((((ptrdiff_t)x) & 15) == 0)
+    {
+        raddvx_sse2_xaligned(n, alpha, y, x, _state);
+        return;
+    }
+    x[0] += alpha*y[0];
+    raddvx_sse2_xaligned(n-1, alpha, y+1, x+1, _state);
+}
+
+void rmergemulv_sse2(const ae_int_t n,
+    /* Real */ const double* __restrict y,
+    /* Real */ double* __restrict x,
+    const ae_state* __restrict _state)
+{
+    ae_int_t i;
+
+    const ae_int_t sse2len = n>>1;
+    const __m128d* __restrict pSrc = (const __m128d*)(y);
+    __m128d* __restrict pDest = (__m128d*)(x);
+    for(i=0; i<sse2len; i++) {
+        pDest[i] = _mm_mul_pd(pSrc[i], pDest[i]);
+    }
+    const ae_int_t tail = sse2len<<1;
+    if(n-tail) {
+        *(double*)(pDest+i) = (*(const double*)(pSrc+i))
+            * (*(const double*)(pDest+i));
+    }
+}
+
+void rmergemaxv_sse2(const ae_int_t n,
+    /* Real */ const double* __restrict y,
+    /* Real */ double* __restrict x,
+    ae_state* __restrict _state)
+{
+    ae_int_t i;
+
+    const ae_int_t sse2len = n>>1;
+    const __m128d* __restrict pSrc = (const __m128d*)(y);
+    __m128d* __restrict pDest = (__m128d*)(x);
+    for(i=0; i<sse2len; i++) {
+        pDest[i] = _mm_max_pd(pSrc[i], pDest[i]);
+    }
+    const ae_int_t tail = sse2len<<1;
+    if(n-tail) {
+        *(double*)(pDest+i) = ae_maxreal(*(const double*)(pSrc+i),
+            *(const double*)(pDest+i), _state);
+    }
+}
+
+void rmergeminv_sse2(const ae_int_t n,
+    /* Real */ const double* __restrict y,
+    /* Real */ double* __restrict x,
+    ae_state* __restrict _state)
+{
+    ae_int_t i;
+
+    const ae_int_t sse2len = n>>1;
+    const __m128d* __restrict pSrc = (const __m128d*)(y);
+    __m128d* __restrict pDest = (__m128d*)(x);
+    for(i=0; i<sse2len; i++) {
+        pDest[i] = _mm_min_pd(pSrc[i], pDest[i]);
+    }
+    const ae_int_t tail = sse2len<<1;
+    if(n-tail) {
+        *(double*)(pDest+i) = ae_minreal(*(const double*)(pSrc+i),
+            *(const double*)(pDest+i), _state);
+    }
+}
+
+double rmaxv_sse2(ae_int_t n, /* Real */ const double* __restrict x, ae_state* __restrict _state)
+{
+    ae_int_t i;
+
+    const ae_int_t sse2len = n>>1;
+    const __m128d* __restrict pSrc = (const __m128d*)(x);
+    if( n<=4 )
+    {
+        double result;
+        if(n == 0)
+            return 0.0;
+        result = x[0];
+        for(i=1; i<=n-1; i++)
+        {
+            double v = x[i];
+            if( v>result )
+                result = v;
+        }
+        return result;
+    }
+    __m128d curMax = pSrc[0];
+    for(i=1; i<sse2len; i++) {
+        curMax = _mm_max_pd(curMax, pSrc[i]);
+    }
+    const double *pComps = (const double *)&curMax;
+    const double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
+    const ae_int_t tail = sse2len<<1;
+    if(n-tail) {
+        const double candidate = *(const double*)(pSrc+i);
+        return (candidate > dMax) ? candidate : dMax;
+    }
+    else {
+        return dMax;
+    }
+}
+
+double rmaxabsv_sse2(ae_int_t n, /* Real */ const double* __restrict x, ae_state* __restrict _state)
+{
+    const __m128d signMask = _mm_set1_pd(-0.); // -0. = 1 << 63
+    const ae_int_t sse2len = n>>1;
+    const __m128d* __restrict pSrc = (const __m128d*)(x);
+    if( n<=4 )
+    {
+        double result;
+        ae_int_t i;
+        result = 0;
+        for(i=0; i<=n-1; i++)
+        {
+            double v = fabs(x[i]);
+            if( v>result )
+                result = v;
+        }
+        return result;
+    }
+    __m128d curMax = _mm_andnot_pd(signMask, pSrc[0]); // abs
+    ae_int_t i;
+    for(i=1; i<sse2len; i++)
+        curMax = _mm_max_pd(curMax, _mm_andnot_pd(signMask, pSrc[i])); // abs
+    const double *pComps = (const double *)&curMax;
+    const double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
+    const ae_int_t tail = sse2len<<1;
+    if(n-tail) {
+        const double candidate = ae_fabs(*(const double*)(pSrc+i), _state);
+        return (candidate > dMax) ? candidate : dMax;
+    }
+    else {
+        return dMax;
+    }
+}
+
+static void rcopyvx_sse2_xaligned(const ae_int_t n, const double* __restrict x,
+    double* __restrict y, ae_state *_state)
+{
+    ae_int_t i;
+
+    const ae_int_t vecLen = (n>>1)<<1;
+    const __m128d * __restrict pSrc = (const __m128d*)x;
+    for(i=0; i<vecLen; i+=2) {
+        const ae_int_t iSrc = i>>1;
+        _mm_storeu_pd(y+i, pSrc[iSrc]);
+    }
+    if(n-vecLen) {
+        y[i] = x[i];
+    }
+}
+
+void rcopyvx_sse2(const ae_int_t n, const double* __restrict x,
+    double* __restrict y, ae_state *_state)
+{
+    if((((ptrdiff_t)x) & 15) == 0)
+    {
+        rcopyvx_sse2_xaligned(n, x, y, _state);
+        return;
+    }
+    y[0] = x[0];
+    rcopyvx_sse2_xaligned(n-1, x+1, y+1, _state);
+}
+
+static void icopyvx_sse2_xaligned(const ae_int_t n, const ae_int_t* __restrict x,
+    ae_int_t* __restrict y, ae_state* __restrict _state)
+{
+    const ae_int_t tail = (n*sizeof(ae_int_t)) & 15;
+    const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
+    const __m128i* __restrict pSrc = (const __m128i*)x;
+    const ae_int_t nVec = even>>4;
+    const ae_int_t shift_by = 2-sizeof(ae_int_t)/8;
+    ae_int_t i;
+    for(i=0; i<nVec; i++) {
+        const ae_int_t j = i<<shift_by;
+        _mm_storeu_si128((__m128i*)(y+j), pSrc[i]);
+    }
+    i = even/sizeof(ae_int_t);
+    if(tail & 8) {
+        *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
+        i += 8/sizeof(ae_int_t);
+    }
+    if(tail & 4) {
+        *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
+    }
+}
+
+void icopyvx_sse2(const ae_int_t n, const ae_int_t* __restrict x,
+    ae_int_t* __restrict y, ae_state* __restrict _state)
+{
+    const ptrdiff_t unal = ((ptrdiff_t)x) & 15;
+    if( n<=8 )
+    {
+        ae_int_t j;
+        for(j=0; j<=n-1; j++)
+            y[j] = x[j];
+        return;
+    }
+    if(unal == 0)
+    {
+        icopyvx_sse2_xaligned(n, x, y, _state);
+        return;
+    }
+    const ae_int_t offset = 16-unal;
+    memmove(y, x, offset);
+    const ae_int_t nDone = offset / sizeof(ae_int_t);
+    icopyvx_sse2_xaligned(n-nDone, x+nDone, y+nDone, _state);
+}
+
+/* ALGLIB_NO_FAST_KERNELS, _ALGLIB_HAS_SSE2_INTRINSICS */
+#endif
+
+
+}
+
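For reference, every kernel in this file follows the same shape: process the vector two doubles at a time with packed SSE2 operations (__m128d, _mm_mul_pd, _mm_add_pd, ...), then finish a possible odd trailing element with scalar code. The sketch below is not part of the gem; it is a minimal standalone illustration of that pattern for the dot product, assuming unaligned inputs and omitting the 8-way unrolling and alignment handling that rdotv_sse2 adds on top.

// Standalone sketch (illustrative only, not shipped with the package):
// a two-wide SSE2 dot product with a scalar tail for odd n.
#include <emmintrin.h>
#include <cstdio>

static double dot_sse2_demo(int n, const double* x, const double* y)
{
    const int pairs = n >> 1;                 // number of 2-double chunks
    __m128d acc = _mm_setzero_pd();
    for(int i = 0; i < pairs; i++)
    {
        // unaligned loads keep the demo independent of buffer alignment;
        // the kernels above instead cast to __m128d* and rely on alignment
        __m128d vx = _mm_loadu_pd(x + 2*i);
        __m128d vy = _mm_loadu_pd(y + 2*i);
        acc = _mm_add_pd(_mm_mul_pd(vx, vy), acc);
    }
    double lanes[2];
    _mm_storeu_pd(lanes, acc);
    double result = lanes[0] + lanes[1];      // horizontal sum of both lanes
    if(n & 1)                                 // scalar tail, as in the kernels
        result += x[n-1]*y[n-1];
    return result;
}

int main()
{
    double a[5] = {1, 2, 3, 4, 5};
    double b[5] = {5, 4, 3, 2, 1};
    printf("%f\n", dot_sse2_demo(5, a, b));   // prints 35.000000
    return 0;
}

Built with any x86-64 compiler (SSE2 is always available there), this prints 35.000000, matching the plain scalar sum.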