alglib4 0.0.0
- checksums.yaml +7 -0
- data/README.md +47 -0
- data/ext/alglib/alglib.cpp +537 -0
- data/ext/alglib/alglib_array_converters.cpp +86 -0
- data/ext/alglib/alglib_array_converters.h +15 -0
- data/ext/alglib/alglib_utils.cpp +10 -0
- data/ext/alglib/alglib_utils.h +6 -0
- data/ext/alglib/alglibinternal.cpp +21749 -0
- data/ext/alglib/alglibinternal.h +2168 -0
- data/ext/alglib/alglibmisc.cpp +9106 -0
- data/ext/alglib/alglibmisc.h +2114 -0
- data/ext/alglib/ap.cpp +20094 -0
- data/ext/alglib/ap.h +7244 -0
- data/ext/alglib/dataanalysis.cpp +52588 -0
- data/ext/alglib/dataanalysis.h +10601 -0
- data/ext/alglib/diffequations.cpp +1342 -0
- data/ext/alglib/diffequations.h +282 -0
- data/ext/alglib/extconf.rb +5 -0
- data/ext/alglib/fasttransforms.cpp +4696 -0
- data/ext/alglib/fasttransforms.h +1018 -0
- data/ext/alglib/integration.cpp +4249 -0
- data/ext/alglib/integration.h +869 -0
- data/ext/alglib/interpolation.cpp +74502 -0
- data/ext/alglib/interpolation.h +12264 -0
- data/ext/alglib/kernels_avx2.cpp +2171 -0
- data/ext/alglib/kernels_avx2.h +201 -0
- data/ext/alglib/kernels_fma.cpp +1065 -0
- data/ext/alglib/kernels_fma.h +137 -0
- data/ext/alglib/kernels_sse2.cpp +735 -0
- data/ext/alglib/kernels_sse2.h +100 -0
- data/ext/alglib/linalg.cpp +65182 -0
- data/ext/alglib/linalg.h +9927 -0
- data/ext/alglib/optimization.cpp +135331 -0
- data/ext/alglib/optimization.h +19235 -0
- data/ext/alglib/solvers.cpp +20488 -0
- data/ext/alglib/solvers.h +4781 -0
- data/ext/alglib/specialfunctions.cpp +10672 -0
- data/ext/alglib/specialfunctions.h +2305 -0
- data/ext/alglib/statistics.cpp +19791 -0
- data/ext/alglib/statistics.h +1359 -0
- data/ext/alglib/stdafx.h +2 -0
- data/gpl2.txt +339 -0
- data/gpl3.txt +674 -0
- data/lib/alglib/version.rb +3 -0
- data/lib/alglib.rb +4 -0
- metadata +101 -0
data/ext/alglib/kernels_sse2.cpp
@@ -0,0 +1,735 @@
/*************************************************************************
ALGLIB 4.04.0 (source code generated 2024-12-21)
Copyright (c) Sergey Bochkanov (ALGLIB project).

>>> SOURCE LICENSE >>>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation (www.fsf.org); either version 2 of the
License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

A copy of the GNU General Public License is available at
http://www.fsf.org/licensing/licenses
>>> END OF LICENSE >>>
*************************************************************************/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#endif
#include "stdafx.h"

//
// Must be defined before we include kernel header
//
#define _ALGLIB_IMPL_DEFINES
#define _ALGLIB_INTEGRITY_CHECKS_ONCE

#include "kernels_sse2.h"

// disable some irrelevant warnings
#if (AE_COMPILER==AE_MSVC) && !defined(AE_ALL_WARNINGS)
#pragma warning(disable:4100)
#pragma warning(disable:4127)
#pragma warning(disable:4611)
#pragma warning(disable:4702)
#pragma warning(disable:4996)
#endif

namespace alglib_impl
{


#if !defined(ALGLIB_NO_FAST_KERNELS) && defined(_ALGLIB_HAS_SSE2_INTRINSICS)

double rdotv_sse2(ae_int_t n,
    /* Real */ const double* x,
    /* Real */ const double* y,
    ae_state *_state)
{
    ae_int_t i;

    const ae_int_t sse2len = n>>1;
    const ae_int_t unrollLen = (sse2len>>3)<<3;
    const __m128d* __restrict pX = (const __m128d*)(x);
    const __m128d* __restrict pY = (const __m128d*)(y);
    __m128d ans;
    if(unrollLen >= 8) {
        __m128d unroll0 = _mm_mul_pd(pX[0], pY[0]);
        __m128d unroll1 = _mm_mul_pd(pX[1], pY[1]);
        __m128d unroll2 = _mm_mul_pd(pX[2], pY[2]);
        __m128d unroll3 = _mm_mul_pd(pX[3], pY[3]);
        __m128d unroll4 = _mm_mul_pd(pX[4], pY[4]);
        __m128d unroll5 = _mm_mul_pd(pX[5], pY[5]);
        __m128d unroll6 = _mm_mul_pd(pX[6], pY[6]);
        __m128d unroll7 = _mm_mul_pd(pX[7], pY[7]);
        for(i=8; i<unrollLen; i+=8) {
            unroll0 = _mm_add_pd(_mm_mul_pd(pX[i], pY[i]), unroll0);
            unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pY[i+1]), unroll1);
            unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pY[i+2]), unroll2);
            unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pY[i+3]), unroll3);
            unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pY[i+4]), unroll4);
            unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pY[i+5]), unroll5);
            unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pY[i+6]), unroll6);
            unroll7 = _mm_add_pd(_mm_mul_pd(pX[i+7], pY[i+7]), unroll7);
        }
        switch(sse2len-unrollLen) {
        case 7:
            unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pY[i+6]), unroll6);
        case 6:
            unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pY[i+5]), unroll5);
        case 5:
            unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pY[i+4]), unroll4);
        case 4:
            unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pY[i+3]), unroll3);
        case 3:
            unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pY[i+2]), unroll2);
        case 2:
            unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pY[i+1]), unroll1);
        case 1:
            unroll0 = _mm_add_pd(_mm_mul_pd(pX[i+0], pY[i+0]), unroll0);
        }
        ans = _mm_add_pd(
            _mm_add_pd(_mm_add_pd(unroll0, unroll1), _mm_add_pd(unroll2, unroll3)),
            _mm_add_pd(_mm_add_pd(unroll4, unroll5), _mm_add_pd(unroll6, unroll7)));
    }
    else {
        switch(sse2len) {
        case 0:
            if(n == 0) {
                return 0;
            } else {
                return x[0]*y[0];
            }
        case 1:
            ans = _mm_mul_pd(pX[0], pY[0]);
            break;
        case 2:
            ans = _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1]));
            break;
        case 3:
            ans = _mm_add_pd(
                _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
                _mm_mul_pd(pX[2], pY[2]));
            break;
        case 4:
            ans = _mm_add_pd(
                _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
                _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3])));
            break;
        case 5:
            ans = _mm_add_pd(
                _mm_add_pd(
                    _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
                    _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3]))),
                _mm_mul_pd(pX[4], pY[4]));
            break;
        case 6:
            ans = _mm_add_pd(
                _mm_add_pd(
                    _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
                    _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3]))),
                _mm_add_pd(_mm_mul_pd(pX[4], pY[4]), _mm_mul_pd(pX[5], pY[5])));
            break;
        case 7:
            ans = _mm_add_pd(
                _mm_add_pd(
                    _mm_add_pd(_mm_mul_pd(pX[0], pY[0]), _mm_mul_pd(pX[1], pY[1])),
                    _mm_add_pd(_mm_mul_pd(pX[2], pY[2]), _mm_mul_pd(pX[3], pY[3]))),
                _mm_add_pd(
                    _mm_add_pd(_mm_mul_pd(pX[4], pY[4]), _mm_mul_pd(pX[5], pY[5])),
                    _mm_mul_pd(pX[6], pY[6])));
            break;
        }
    }

    const double *pComps = (const double*)&ans;
    double scalar = pComps[0] + pComps[1];
    const ae_int_t tail = sse2len<<1;
    if(n-tail) {
        return scalar + x[tail]*y[tail];
    } else {
        return scalar;
    }
}

double rdotv2_sse2(ae_int_t n,
    /* Real */ const double* x,
    ae_state *_state)
{
    ae_int_t i;

    const ae_int_t sse2len = n>>1;
    const ae_int_t unrollLen = (sse2len>>3)<<3;
    const __m128d* __restrict pX = (const __m128d*)(x);
    __m128d ans;
    if(unrollLen >= 8) {
        __m128d unroll0 = _mm_mul_pd(pX[0], pX[0]);
        __m128d unroll1 = _mm_mul_pd(pX[1], pX[1]);
        __m128d unroll2 = _mm_mul_pd(pX[2], pX[2]);
        __m128d unroll3 = _mm_mul_pd(pX[3], pX[3]);
        __m128d unroll4 = _mm_mul_pd(pX[4], pX[4]);
        __m128d unroll5 = _mm_mul_pd(pX[5], pX[5]);
        __m128d unroll6 = _mm_mul_pd(pX[6], pX[6]);
        __m128d unroll7 = _mm_mul_pd(pX[7], pX[7]);
        for(i=8; i<unrollLen; i+=8) {
            unroll0 = _mm_add_pd(_mm_mul_pd(pX[i], pX[i]), unroll0);
            unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pX[i+1]), unroll1);
            unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pX[i+2]), unroll2);
            unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pX[i+3]), unroll3);
            unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pX[i+4]), unroll4);
            unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pX[i+5]), unroll5);
            unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pX[i+6]), unroll6);
            unroll7 = _mm_add_pd(_mm_mul_pd(pX[i+7], pX[i+7]), unroll7);
        }
        switch(sse2len-unrollLen) {
        case 7:
            unroll6 = _mm_add_pd(_mm_mul_pd(pX[i+6], pX[i+6]), unroll6);
        case 6:
            unroll5 = _mm_add_pd(_mm_mul_pd(pX[i+5], pX[i+5]), unroll5);
        case 5:
            unroll4 = _mm_add_pd(_mm_mul_pd(pX[i+4], pX[i+4]), unroll4);
        case 4:
            unroll3 = _mm_add_pd(_mm_mul_pd(pX[i+3], pX[i+3]), unroll3);
        case 3:
            unroll2 = _mm_add_pd(_mm_mul_pd(pX[i+2], pX[i+2]), unroll2);
        case 2:
            unroll1 = _mm_add_pd(_mm_mul_pd(pX[i+1], pX[i+1]), unroll1);
        case 1:
            unroll0 = _mm_add_pd(_mm_mul_pd(pX[i+0], pX[i+0]), unroll0);
        }
        ans = _mm_add_pd(
            _mm_add_pd(_mm_add_pd(unroll0, unroll1), _mm_add_pd(unroll2, unroll3)),
            _mm_add_pd(_mm_add_pd(unroll4, unroll5), _mm_add_pd(unroll6, unroll7)));
    }
    else {
        switch(sse2len) {
        case 0:
            if(n == 0) {
                return 0;
            } else {
                return x[0]*x[0];
            }
        case 1:
            ans = _mm_mul_pd(pX[0], pX[0]);
            break;
        case 2:
            ans = _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1]));
            break;
        case 3:
            ans = _mm_add_pd(
                _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
                _mm_mul_pd(pX[2], pX[2]));
            break;
        case 4:
            ans = _mm_add_pd(
                _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
                _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3])));
            break;
        case 5:
            ans = _mm_add_pd(
                _mm_add_pd(
                    _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
                    _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3]))),
                _mm_mul_pd(pX[4], pX[4]));
            break;
        case 6:
            ans = _mm_add_pd(
                _mm_add_pd(
                    _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
                    _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3]))),
                _mm_add_pd(_mm_mul_pd(pX[4], pX[4]), _mm_mul_pd(pX[5], pX[5])));
            break;
        case 7:
            ans = _mm_add_pd(
                _mm_add_pd(
                    _mm_add_pd(_mm_mul_pd(pX[0], pX[0]), _mm_mul_pd(pX[1], pX[1])),
                    _mm_add_pd(_mm_mul_pd(pX[2], pX[2]), _mm_mul_pd(pX[3], pX[3]))),
                _mm_add_pd(
                    _mm_add_pd(_mm_mul_pd(pX[4], pX[4]), _mm_mul_pd(pX[5], pX[5])),
                    _mm_mul_pd(pX[6], pX[6])));
            break;
        }
    }

    const double *pComps = (const double*)&ans;
    double scalar = pComps[0] + pComps[1];
    const ae_int_t tail = sse2len<<1;
    if(n-tail) {
        return scalar + x[tail]*x[tail];
    } else {
        return scalar;
    }
}

void rcopyv_sse2(const ae_int_t n,
    /* Real */ const double* __restrict x,
    /* Real */ double* __restrict y,
    ae_state* __restrict _state)
{
    ae_int_t i;
    const ae_int_t sse2len = n>>1;
    const ae_int_t tail = sse2len<<1;
    const __m128d* __restrict pSrc = (const __m128d*)(x);
    __m128d* __restrict pDest = (__m128d*)(y);

    for(i=0; i<sse2len; i++)
        pDest[i] = pSrc[i];
    if( n-tail )
        *(double*)(pDest+i) = *(const double*)(pSrc+i);
}

void rcopymulv_sse2(const ae_int_t n,
    const double v,
    /* Real */ const double* __restrict x,
    /* Real */ double* __restrict y,
    const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t sse2len = n>>1;
    const __m128d* __restrict pSrc = (const __m128d*)(x);
    __m128d* __restrict pDest = (__m128d*)(y);
    const __m128d sse2v = _mm_set1_pd(v);
    const ae_int_t tail = sse2len<<1;
    for(i=0; i<sse2len; i++) {
        pDest[i] = _mm_mul_pd(sse2v, pSrc[i]);
    }
    if(n-tail) {
        *(double*)(pDest+i) = v * (*(const double*)(pSrc+i));
    }
}

void icopyv_sse2(const ae_int_t n, const ae_int_t* __restrict x,
    ae_int_t* __restrict y, ae_state* __restrict _state)
{
    const ae_int_t tail = (n*sizeof(ae_int_t)) & 15;
    const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
    __m128i *__restrict pDest = (__m128i*)y;
    const __m128i* __restrict pSrc = (const __m128i*)x;
    const ae_int_t nVec = even>>4;
    ae_int_t i;
    for(i=0; i<nVec; i++) {
        pDest[i] = pSrc[i];
    }
    i = even/sizeof(ae_int_t);
    if(tail & 8) {
        *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
        i += 8/sizeof(ae_int_t);
    }
    if(tail & 4) {
        *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
    }
}

void bcopyv_sse2(const ae_int_t n, const ae_bool* __restrict x,
    ae_bool* __restrict y, ae_state* __restrict _state)
{
    const ae_int_t tail = n & 15;
    const ae_int_t even = n-tail;
    __m128i *__restrict pDest = (__m128i*)y;
    const __m128i* __restrict pSrc = (const __m128i*)x;
    const ae_int_t nVec = even>>4;
    ae_int_t i;
    for(i=0; i<nVec; i++) {
        pDest[i] = pSrc[i];
    }
    i = even;
    if(tail & 8) {
        *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
        i += 8;
    }
    if(tail & 4) {
        *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
        i += 4;
    }
    if(tail & 2)
    {
        *(y+i+0) = *(x+i+0);
        *(y+i+1) = *(x+i+1);
        i += 2;
    }
    if(tail & 1) {
        *(y+i) = *(x+i);
    }
}

void rsetv_sse2(const ae_int_t n,
    const double v,
    /* Real */ double* __restrict x,
    const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t sse2len = n>>1;
    __m128d* __restrict pDest = (__m128d*)(x);
    const __m128d sse2v = _mm_set1_pd(v);
    for(i=0; i<sse2len; i++) {
        pDest[i] = sse2v;
    }
    const ae_int_t tail = sse2len<<1;
    if(n-tail) {
        *(double*)(pDest+i) = v;
    }
}

void rsetvx_sse2(const ae_int_t n,
    const double v,
    /* Real */ double* __restrict x,
    const ae_state* __restrict _state)
{
    if( n<=4 )
    {
        ae_int_t j;
        for(j=0; j<=n-1; j++)
            x[j] = v;
        return;
    }
    if((((ptrdiff_t)x) & 15) == 0)
    {
        rsetv_sse2(n, v, x, _state);
        return;
    }
    x[0] = v;
    rsetv_sse2(n-1, v, x+1, _state);
}

void isetv_sse2(const ae_int_t n, const ae_int_t v,
    ae_int_t* __restrict x, ae_state* __restrict _state)
{
    const ae_int_t tail = (n*sizeof(ae_int_t)) & 15;
    const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
    __m128i *__restrict pDest = (__m128i*)x;
    const ae_int_t v2[2] = {v, v};
    const __m128i sse2v = ((sizeof(v) == 4) ? _mm_set1_epi32((ae_int32_t)v) : _mm_loadu_si128((const __m128i*)(&v2[0])));
    const ae_int_t nVec = even>>4;
    ae_int_t i;
    for(i=0; i<nVec; i++) {
        pDest[i] = sse2v;
    }
    memmove(pDest+i, &sse2v, tail);
}

void bsetv_sse2(const ae_int_t n, const ae_bool v, ae_bool* __restrict x,
    ae_state* __restrict _state)
{
    const ae_int_t tail = n & 15;
    const ae_int_t even = n-tail;
    __m128i *__restrict pDest = (__m128i*)x;
    const __m128i sse2v = _mm_set1_epi8(v);
    const ae_int_t nVec = even>>4;
    ae_int_t i;
    for(i=0; i<nVec; i++) {
        pDest[i] = sse2v;
    }
    /* _mm_storel_epi64() has a too high latency and too low throughput on the recent (Skylake+) processors */
    memset(x+even, v, tail);
}

void rmulv_sse2(const ae_int_t n, const double v, double* __restrict x,
    const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t sse2len = n>>1;
    __m128d* __restrict pDest = (__m128d*)(x);
    const __m128d sse2v = _mm_set1_pd(v);
    for(i=0; i<sse2len; i++) {
        pDest[i] = _mm_mul_pd(sse2v, pDest[i]);
    }
    const ae_int_t tail = sse2len<<1;
    if(n-tail) {
        *(double*)(pDest+i) = v * (*(const double*)(pDest+i));
    }
}

void rmulvx_sse2(const ae_int_t n, const double v, double* __restrict x,
    const ae_state* __restrict _state)
{
    if( n<=4 )
    {
        ae_int_t i;
        for(i=0; i<=n-1; i++)
            x[i] *= v;
        return;
    }
    if((((ptrdiff_t)x) & 15) == 0)
    {
        rmulv_sse2(n, v, x, _state);
        return;
    }
    x[0] = v * x[0];
    rmulv_sse2(n-1, v, x+1, _state);
}

void raddv_sse2(const ae_int_t n,
    const double alpha,
    /* Real */ const double* __restrict y,
    /* Real */ double* __restrict x,
    const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t sse2len = n>>1;
    const __m128d* __restrict pSrc = (const __m128d*)(y);
    __m128d* __restrict pDest = (__m128d*)(x);
    const __m128d sse2alpha = _mm_set1_pd(alpha);
    for(i=0; i<sse2len; i++) {
        pDest[i] = _mm_add_pd(_mm_mul_pd(sse2alpha, pSrc[i]), pDest[i]);
    }
    const ae_int_t tail = sse2len<<1;
    if(n-tail) {
        *(double*)(pDest+i) = alpha * (*(const double*)(pSrc+i))
            + (*(const double*)(pDest+i));
    }
}

void raddvx_sse2_xaligned(const ae_int_t n, const double alpha,
    const double* __restrict y, double* __restrict x, ae_state *_state)
{
    ae_int_t i;

    const ae_int_t vecLen = (n>>1)<<1;
    const __m128d sse2alpha = _mm_set1_pd(alpha);
    __m128d * __restrict pDest = (__m128d*)x;
    for(i=0; i<vecLen; i+=2)
    {
        const ae_int_t iDest = i>>1;
        pDest[iDest] = _mm_add_pd(_mm_mul_pd(sse2alpha, _mm_loadu_pd(y+i)), pDest[iDest]);
    }
    if(n-vecLen)
        x[i] += alpha*y[i];
}

void raddvx_sse2(const ae_int_t n, const double alpha,
    const double* __restrict y, double* __restrict x, ae_state *_state)
{
    if( n<=4 )
    {
        ae_int_t i;
        for(i=0; i<=n-1; i++)
            x[i] += alpha*y[i];
        return;
    }
    if((((ptrdiff_t)x) & 15) == 0)
    {
        raddvx_sse2_xaligned(n, alpha, y, x, _state);
        return;
    }
    x[0] += alpha*y[0];
    raddvx_sse2_xaligned(n-1, alpha, y+1, x+1, _state);
}

void rmergemulv_sse2(const ae_int_t n,
    /* Real */ const double* __restrict y,
    /* Real */ double* __restrict x,
    const ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t sse2len = n>>1;
    const __m128d* __restrict pSrc = (const __m128d*)(y);
    __m128d* __restrict pDest = (__m128d*)(x);
    for(i=0; i<sse2len; i++) {
        pDest[i] = _mm_mul_pd(pSrc[i], pDest[i]);
    }
    const ae_int_t tail = sse2len<<1;
    if(n-tail) {
        *(double*)(pDest+i) = (*(const double*)(pSrc+i))
            * (*(const double*)(pDest+i));
    }
}

void rmergemaxv_sse2(const ae_int_t n,
    /* Real */ const double* __restrict y,
    /* Real */ double* __restrict x,
    ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t sse2len = n>>1;
    const __m128d* __restrict pSrc = (const __m128d*)(y);
    __m128d* __restrict pDest = (__m128d*)(x);
    for(i=0; i<sse2len; i++) {
        pDest[i] = _mm_max_pd(pSrc[i], pDest[i]);
    }
    const ae_int_t tail = sse2len<<1;
    if(n-tail) {
        *(double*)(pDest+i) = ae_maxreal(*(const double*)(pSrc+i),
            *(const double*)(pDest+i), _state);
    }
}

void rmergeminv_sse2(const ae_int_t n,
    /* Real */ const double* __restrict y,
    /* Real */ double* __restrict x,
    ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t sse2len = n>>1;
    const __m128d* __restrict pSrc = (const __m128d*)(y);
    __m128d* __restrict pDest = (__m128d*)(x);
    for(i=0; i<sse2len; i++) {
        pDest[i] = _mm_min_pd(pSrc[i], pDest[i]);
    }
    const ae_int_t tail = sse2len<<1;
    if(n-tail) {
        *(double*)(pDest+i) = ae_minreal(*(const double*)(pSrc+i),
            *(const double*)(pDest+i), _state);
    }
}

double rmaxv_sse2(ae_int_t n, /* Real */ const double* __restrict x, ae_state* __restrict _state)
{
    ae_int_t i;

    const ae_int_t sse2len = n>>1;
    const __m128d* __restrict pSrc = (const __m128d*)(x);
    if( n<=4 )
    {
        double result;
        if(n == 0)
            return 0.0;
        result = x[0];
        for(i=1; i<=n-1; i++)
        {
            double v = x[i];
            if( v>result )
                result = v;
        }
        return result;
    }
    __m128d curMax = pSrc[0];
    for(i=1; i<sse2len; i++) {
        curMax = _mm_max_pd(curMax, pSrc[i]);
    }
    const double *pComps = (const double *)&curMax;
    const double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
    const ae_int_t tail = sse2len<<1;
    if(n-tail) {
        const double candidate = *(const double*)(pSrc+i);
        return (candidate > dMax) ? candidate : dMax;
    }
    else {
        return dMax;
    }
}

double rmaxabsv_sse2(ae_int_t n, /* Real */ const double* __restrict x, ae_state* __restrict _state)
{
    const __m128d signMask = _mm_set1_pd(-0.); // -0. = 1 << 63
    const ae_int_t sse2len = n>>1;
    const __m128d* __restrict pSrc = (const __m128d*)(x);
    if( n<=4 )
    {
        double result;
        ae_int_t i;
        result = 0;
        for(i=0; i<=n-1; i++)
        {
            double v = fabs(x[i]);
            if( v>result )
                result = v;
        }
        return result;
    }
    __m128d curMax = _mm_andnot_pd(signMask, pSrc[0]); // abs
    ae_int_t i;
    for(i=1; i<sse2len; i++)
        curMax = _mm_max_pd(curMax, _mm_andnot_pd(signMask, pSrc[i])); // abs
    const double *pComps = (const double *)&curMax;
    const double dMax = (pComps[0] > pComps[1]) ? pComps[0] : pComps[1];
    const ae_int_t tail = sse2len<<1;
    if(n-tail) {
        const double candidate = ae_fabs(*(const double*)(pSrc+i), _state);
        return (candidate > dMax) ? candidate : dMax;
    }
    else {
        return dMax;
    }
}

static void rcopyvx_sse2_xaligned(const ae_int_t n, const double* __restrict x,
    double* __restrict y, ae_state *_state)
{
    ae_int_t i;

    const ae_int_t vecLen = (n>>1)<<1;
    const __m128d * __restrict pSrc = (const __m128d*)x;
    for(i=0; i<vecLen; i+=2) {
        const ae_int_t iSrc = i>>1;
        _mm_storeu_pd(y+i, pSrc[iSrc]);
    }
    if(n-vecLen) {
        y[i] = x[i];
    }
}

void rcopyvx_sse2(const ae_int_t n, const double* __restrict x,
    double* __restrict y, ae_state *_state)
{
    if((((ptrdiff_t)x) & 15) == 0)
    {
        rcopyvx_sse2_xaligned(n, x, y, _state);
        return;
    }
    y[0] = x[0];
    rcopyvx_sse2_xaligned(n-1, x+1, y+1, _state);
}

static void icopyvx_sse2_xaligned(const ae_int_t n, const ae_int_t* __restrict x,
    ae_int_t* __restrict y, ae_state* __restrict _state)
{
    const ae_int_t tail = (n*sizeof(ae_int_t)) & 15;
    const ae_int_t even = (n*sizeof(ae_int_t)) - tail;
    const __m128i* __restrict pSrc = (const __m128i*)x;
    const ae_int_t nVec = even>>4;
    const ae_int_t shift_by = 2-sizeof(ae_int_t)/8;
    ae_int_t i;
    for(i=0; i<nVec; i++) {
        const ae_int_t j = i<<shift_by;
        _mm_storeu_si128((__m128i*)(y+j), pSrc[i]);
    }
    i = even/sizeof(ae_int_t);
    if(tail & 8) {
        *(ae_int64_t*)(y+i) = *(const ae_int64_t*)(x+i);
        i += 8/sizeof(ae_int_t);
    }
    if(tail & 4) {
        *(ae_int32_t*)(y+i) = *(const ae_int32_t*)(x+i);
    }
}

void icopyvx_sse2(const ae_int_t n, const ae_int_t* __restrict x,
    ae_int_t* __restrict y, ae_state* __restrict _state)
{
    const ptrdiff_t unal = ((ptrdiff_t)x) & 15;
    if( n<=8 )
    {
        ae_int_t j;
        for(j=0; j<=n-1; j++)
            y[j] = x[j];
        return;
    }
    if(unal == 0)
    {
        icopyvx_sse2_xaligned(n, x, y, _state);
        return;
    }
    const ae_int_t offset = 16-unal;
    memmove(y, x, offset);
    const ae_int_t nDone = offset / sizeof(ae_int_t);
    icopyvx_sse2_xaligned(n-nDone, x+nDone, y+nDone, _state);
}

/* ALGLIB_NO_FAST_KERNELS, _ALGLIB_HAS_SSE2_INTRINSICS */
#endif


}
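Every kernel in this file follows the same decomposition: treat the n-element vector as n>>1 two-double SSE2 lanes, reduce the lanes, then handle at most one scalar leftover. A minimal standalone sketch of that pattern is shown below; it is illustrative only and not part of the gem, and it uses unaligned loads where the kernels above cast 16-byte-aligned ALGLIB buffers directly.

#include <emmintrin.h>
#include <cstddef>

// Illustrative dot product using the pair+tail decomposition from kernels_sse2.cpp.
double dot_pairs_sse2(std::size_t n, const double* x, const double* y)
{
    const std::size_t pairs = n >> 1;            // number of full two-double lanes
    __m128d acc = _mm_setzero_pd();
    for (std::size_t i = 0; i < pairs; i++) {
        // multiply-accumulate one pair of doubles per iteration
        acc = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(x + 2*i), _mm_loadu_pd(y + 2*i)), acc);
    }
    double lanes[2];
    _mm_storeu_pd(lanes, acc);                   // horizontal sum of the two lanes
    double s = lanes[0] + lanes[1];
    const std::size_t tail = pairs << 1;
    if (n - tail)                                // at most one leftover element
        s += x[tail] * y[tail];
    return s;
}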