tomoto 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/LICENSE.txt +1 -1
- data/ext/tomoto/ct.cpp +1 -1
- data/ext/tomoto/dmr.cpp +1 -1
- data/ext/tomoto/dt.cpp +1 -1
- data/ext/tomoto/extconf.rb +4 -8
- data/ext/tomoto/gdmr.cpp +1 -1
- data/ext/tomoto/hdp.cpp +1 -1
- data/ext/tomoto/hlda.cpp +1 -1
- data/ext/tomoto/hpa.cpp +1 -1
- data/ext/tomoto/lda.cpp +29 -3
- data/ext/tomoto/llda.cpp +1 -1
- data/ext/tomoto/mglda.cpp +1 -1
- data/ext/tomoto/pa.cpp +1 -1
- data/ext/tomoto/plda.cpp +1 -1
- data/ext/tomoto/slda.cpp +1 -1
- data/lib/tomoto/lda.rb +1 -0
- data/lib/tomoto/version.rb +1 -1
- data/vendor/EigenRand/EigenRand/Core.h +6 -4
- data/vendor/EigenRand/EigenRand/CwiseHeteroBinaryOp.h +265 -0
- data/vendor/EigenRand/EigenRand/Dists/Basic.h +345 -12
- data/vendor/EigenRand/EigenRand/Dists/Discrete.h +381 -7
- data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +4 -4
- data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +576 -4
- data/vendor/EigenRand/EigenRand/EigenRand +4 -4
- data/vendor/EigenRand/EigenRand/Macro.h +3 -3
- data/vendor/EigenRand/EigenRand/MorePacketMath.h +31 -30
- data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +41 -29
- data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +19 -7
- data/vendor/EigenRand/EigenRand/PacketFilter.h +8 -5
- data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +3 -3
- data/vendor/EigenRand/EigenRand/RandUtils.h +180 -5
- data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +42 -3
- data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +3 -3
- data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +3 -3
- data/vendor/EigenRand/EigenRand/arch/AVX512/MorePacketMath.h +312 -0
- data/vendor/EigenRand/EigenRand/arch/AVX512/PacketFilter.h +79 -0
- data/vendor/EigenRand/EigenRand/arch/AVX512/RandUtils.h +147 -0
- data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +118 -3
- data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +3 -3
- data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +21 -3
- data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +32 -4
- data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +3 -3
- data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +3 -3
- data/vendor/EigenRand/EigenRand/doc.h +108 -157
- data/vendor/EigenRand/README.md +60 -272
- data/vendor/tomotopy/README.kr.rst +27 -5
- data/vendor/tomotopy/README.rst +27 -5
- data/vendor/tomotopy/README_pypi.rst +583 -0
- data/vendor/tomotopy/licenses_bundled/EigenRand +21 -0
- data/vendor/tomotopy/src/TopicModel/CT.h +1 -1
- data/vendor/tomotopy/src/TopicModel/CTModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/DMR.h +1 -1
- data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/DT.h +1 -1
- data/vendor/tomotopy/src/TopicModel/DTModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/GDMR.h +1 -1
- data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/HDP.h +1 -1
- data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/HLDA.h +1 -1
- data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/HPA.h +1 -1
- data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/LDA.h +1 -1
- data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +6 -6
- data/vendor/tomotopy/src/TopicModel/LLDA.h +1 -1
- data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +1 -1
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/PA.h +1 -1
- data/vendor/tomotopy/src/TopicModel/PAModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/PLDA.h +1 -1
- data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/PT.h +1 -1
- data/vendor/tomotopy/src/TopicModel/PTModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/SLDA.h +1 -1
- data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +1 -1
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +6 -6
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +41 -0
- data/vendor/tomotopy/src/Utils/ThreadPool.hpp +6 -6
- data/vendor/tomotopy/src/Utils/Utils.hpp +3 -3
- data/vendor/tomotopy/src/Utils/avx512_gamma.h +46 -0
- data/vendor/tomotopy/src/Utils/avx512_mathfun.h +99 -0
- metadata +10 -9
- data/vendor/variant/LICENSE +0 -25
- data/vendor/variant/LICENSE_1_0.txt +0 -23
- data/vendor/variant/README.md +0 -102
- data/vendor/variant/include/mapbox/optional.hpp +0 -74
- data/vendor/variant/include/mapbox/recursive_wrapper.hpp +0 -122
- data/vendor/variant/include/mapbox/variant.hpp +0 -974
- data/vendor/variant/include/mapbox/variant_io.hpp +0 -45
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file MorePacketMath.h
|
|
3
|
+
* @author bab2min (bab2min@gmail.com)
|
|
4
|
+
* @brief
|
|
5
|
+
* @version 0.5.1
|
|
6
|
+
* @date 2024-09-08
|
|
7
|
+
*
|
|
8
|
+
* @copyright Copyright (c) 2020-2024
|
|
9
|
+
*
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
#ifndef EIGENRAND_MORE_PACKET_MATH_AVX512_H
|
|
13
|
+
#define EIGENRAND_MORE_PACKET_MATH_AVX512_H
|
|
14
|
+
|
|
15
|
+
#include <immintrin.h>
|
|
16
|
+
|
|
17
|
+
namespace Eigen
|
|
18
|
+
{
|
|
19
|
+
namespace internal
|
|
20
|
+
{
|
|
21
|
+
template<>
|
|
22
|
+
struct IsIntPacket<Packet16i> : std::true_type {};
|
|
23
|
+
|
|
24
|
+
template<>
|
|
25
|
+
struct HalfPacket<Packet16i>
|
|
26
|
+
{
|
|
27
|
+
using type = Packet8i;
|
|
28
|
+
};
|
|
29
|
+
|
|
30
|
+
template<>
|
|
31
|
+
struct HalfPacket<Packet16f>
|
|
32
|
+
{
|
|
33
|
+
using type = Packet8f;
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
template<>
|
|
37
|
+
struct IsFloatPacket<Packet16f> : std::true_type {};
|
|
38
|
+
|
|
39
|
+
template<>
|
|
40
|
+
struct IsDoublePacket<Packet8d> : std::true_type {};
|
|
41
|
+
|
|
42
|
+
template<>
|
|
43
|
+
struct reinterpreter<Packet16i>
|
|
44
|
+
{
|
|
45
|
+
EIGEN_STRONG_INLINE Packet16f to_float(const Packet16i& x)
|
|
46
|
+
{
|
|
47
|
+
return _mm512_castsi512_ps(x);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
EIGEN_STRONG_INLINE Packet8d to_double(const Packet16i& x)
|
|
51
|
+
{
|
|
52
|
+
return _mm512_castsi512_pd(x);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
EIGEN_STRONG_INLINE Packet16i to_int(const Packet16i& x)
|
|
56
|
+
{
|
|
57
|
+
return x;
|
|
58
|
+
}
|
|
59
|
+
};
|
|
60
|
+
|
|
61
|
+
template<>
|
|
62
|
+
struct reinterpreter<Packet16f>
|
|
63
|
+
{
|
|
64
|
+
EIGEN_STRONG_INLINE Packet16f to_float(const Packet16f& x)
|
|
65
|
+
{
|
|
66
|
+
return x;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
EIGEN_STRONG_INLINE Packet8d to_double(const Packet16f& x)
|
|
70
|
+
{
|
|
71
|
+
return _mm512_castps_pd(x);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
EIGEN_STRONG_INLINE Packet16i to_int(const Packet16f& x)
|
|
75
|
+
{
|
|
76
|
+
return _mm512_castps_si512(x);
|
|
77
|
+
}
|
|
78
|
+
};
|
|
79
|
+
|
|
80
|
+
template<>
|
|
81
|
+
struct reinterpreter<Packet8d>
|
|
82
|
+
{
|
|
83
|
+
EIGEN_STRONG_INLINE Packet16f to_float(const Packet8d& x)
|
|
84
|
+
{
|
|
85
|
+
return _mm512_castpd_ps(x);
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
EIGEN_STRONG_INLINE Packet8d to_double(const Packet8d& x)
|
|
89
|
+
{
|
|
90
|
+
return x;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
EIGEN_STRONG_INLINE Packet16i to_int(const Packet8d& x)
|
|
94
|
+
{
|
|
95
|
+
return _mm512_castpd_si512(x);
|
|
96
|
+
}
|
|
97
|
+
};
|
|
98
|
+
|
|
99
|
+
template<>
|
|
100
|
+
EIGEN_STRONG_INLINE Packet16i pseti64<Packet16i>(uint64_t a)
|
|
101
|
+
{
|
|
102
|
+
return _mm512_set1_epi64(a);
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
template<>
|
|
106
|
+
EIGEN_STRONG_INLINE Packet16i padd64<Packet16i>(const Packet16i& a, const Packet16i& b)
|
|
107
|
+
{
|
|
108
|
+
return _mm512_add_epi64(a, b);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
template<>
|
|
112
|
+
EIGEN_STRONG_INLINE Packet16i psub64<Packet16i>(const Packet16i& a, const Packet16i& b)
|
|
113
|
+
{
|
|
114
|
+
return _mm512_sub_epi64(a, b);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
template<>
|
|
118
|
+
EIGEN_STRONG_INLINE Packet16i pcmpeq<Packet16i>(const Packet16i& a, const Packet16i& b)
|
|
119
|
+
{
|
|
120
|
+
return pcmp_eq(a, b);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
template<>
|
|
124
|
+
EIGEN_STRONG_INLINE Packet16f pcmpeq<Packet16f>(const Packet16f& a, const Packet16f& b)
|
|
125
|
+
{
|
|
126
|
+
return pcmp_eq(a, b);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
template<>
|
|
130
|
+
EIGEN_STRONG_INLINE Packet16i pnegate<Packet16i>(const Packet16i& a)
|
|
131
|
+
{
|
|
132
|
+
return _mm512_sub_epi32(pset1<Packet16i>(0), a);
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
template<>
|
|
136
|
+
struct BitShifter<Packet16i>
|
|
137
|
+
{
|
|
138
|
+
template<int b>
|
|
139
|
+
EIGEN_STRONG_INLINE Packet16i sll(const Packet16i& a)
|
|
140
|
+
{
|
|
141
|
+
return _mm512_slli_epi32(a, b);
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
template<int b>
|
|
145
|
+
EIGEN_STRONG_INLINE Packet16i srl(const Packet16i& a, int _b = b)
|
|
146
|
+
{
|
|
147
|
+
if (b >= 0)
|
|
148
|
+
{
|
|
149
|
+
return _mm512_srli_epi32(a, b);
|
|
150
|
+
}
|
|
151
|
+
else
|
|
152
|
+
{
|
|
153
|
+
return _mm512_srli_epi32(a, _b);
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
template<int b>
|
|
158
|
+
EIGEN_STRONG_INLINE Packet16i sll64(const Packet16i& a)
|
|
159
|
+
{
|
|
160
|
+
return _mm512_slli_epi64(a, b);
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
template<int b>
|
|
164
|
+
EIGEN_STRONG_INLINE Packet16i srl64(const Packet16i& a)
|
|
165
|
+
{
|
|
166
|
+
return _mm512_srli_epi64(a, b);
|
|
167
|
+
}
|
|
168
|
+
};
|
|
169
|
+
|
|
170
|
+
/// True when the 16-bit mask extracted from the 32-bit lanes of `x`
/// (via _mm512_movepi32_mask) has every bit set.
template<> EIGEN_STRONG_INLINE bool predux_all(const Packet16i& x)
{
    const __mmask16 lane_bits = _mm512_movepi32_mask(x);
    return lane_bits == 0xFFFF;
}
|
|
174
|
+
|
|
175
|
+
/// Float-mask variant: reinterprets the bits as integers and defers to
/// the integer predux_all overload.
template<> EIGEN_STRONG_INLINE bool predux_all(const Packet16f& x)
{
    const __m512i as_int = _mm512_castps_si512(x);
    return predux_all(as_int);
}
|
|
179
|
+
|
|
180
|
+
template<>
|
|
181
|
+
EIGEN_STRONG_INLINE Packet16i pcmplt<Packet16i>(const Packet16i& a, const Packet16i& b)
|
|
182
|
+
{
|
|
183
|
+
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
|
|
184
|
+
return _mm512_movm_epi32(mask);
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
template<>
|
|
188
|
+
EIGEN_STRONG_INLINE Packet16f pcmplt<Packet16f>(const Packet16f& a, const Packet16f& b)
|
|
189
|
+
{
|
|
190
|
+
return pcmp_lt(a, b);
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
template<>
|
|
194
|
+
EIGEN_STRONG_INLINE Packet16f pcmple<Packet16f>(const Packet16f& a, const Packet16f& b)
|
|
195
|
+
{
|
|
196
|
+
return pcmp_le(a, b);
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
template<>
|
|
200
|
+
EIGEN_STRONG_INLINE Packet8d pcmplt<Packet8d>(const Packet8d& a, const Packet8d& b)
|
|
201
|
+
{
|
|
202
|
+
return pcmp_lt(a, b);
|
|
203
|
+
}
|
|
204
|
+
template<>
|
|
205
|
+
EIGEN_STRONG_INLINE Packet8d pcmple<Packet8d>(const Packet8d& a, const Packet8d& b)
|
|
206
|
+
{
|
|
207
|
+
return pcmp_le(a, b);
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
template<>
|
|
211
|
+
EIGEN_STRONG_INLINE Packet16f pblendv(const Packet16i& ifPacket, const Packet16f& thenPacket, const Packet16f& elsePacket)
|
|
212
|
+
{
|
|
213
|
+
__mmask16 mask = _mm512_movepi32_mask(ifPacket);
|
|
214
|
+
return _mm512_mask_blend_ps(mask, elsePacket, thenPacket);
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
template<>
|
|
218
|
+
EIGEN_STRONG_INLINE Packet16f pblendv(const Packet16f& ifPacket, const Packet16f& thenPacket, const Packet16f& elsePacket)
|
|
219
|
+
{
|
|
220
|
+
return pblendv(_mm512_castps_si512(ifPacket), thenPacket, elsePacket);
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
template<>
|
|
224
|
+
EIGEN_STRONG_INLINE Packet16i pblendv(const Packet16i& ifPacket, const Packet16i& thenPacket, const Packet16i& elsePacket)
|
|
225
|
+
{
|
|
226
|
+
__mmask16 mask = _mm512_movepi32_mask(ifPacket);
|
|
227
|
+
return _mm512_mask_blend_epi32(mask, elsePacket, thenPacket);
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
template<>
|
|
231
|
+
EIGEN_STRONG_INLINE Packet8d pblendv(const Packet16i& ifPacket, const Packet8d& thenPacket, const Packet8d& elsePacket)
|
|
232
|
+
{
|
|
233
|
+
__mmask8 mask = _mm512_movepi64_mask(ifPacket);
|
|
234
|
+
return _mm512_mask_blend_pd(mask, elsePacket, thenPacket);
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
template<>
|
|
238
|
+
EIGEN_STRONG_INLINE Packet8d pblendv(const Packet8d& ifPacket, const Packet8d& thenPacket, const Packet8d& elsePacket)
|
|
239
|
+
{
|
|
240
|
+
return pblendv(_mm512_castpd_si512(ifPacket), thenPacket, elsePacket);
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
template<>
|
|
244
|
+
EIGEN_STRONG_INLINE Packet16i pgather<Packet16i>(const int* addr, const Packet16i& index)
|
|
245
|
+
{
|
|
246
|
+
return _mm512_i32gather_epi32(index, addr, 4);
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
template<>
|
|
250
|
+
EIGEN_STRONG_INLINE Packet16f pgather<Packet16i>(const float* addr, const Packet16i& index)
|
|
251
|
+
{
|
|
252
|
+
return _mm512_i32gather_ps(index, addr, 4);
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
template<>
|
|
256
|
+
EIGEN_STRONG_INLINE Packet8d pgather<Packet16i>(const double* addr, const Packet16i& index, bool upperhalf)
|
|
257
|
+
{
|
|
258
|
+
return _mm512_i32gather_pd(_mm512_castsi512_si256(index), addr, 8);
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
template<>
|
|
262
|
+
EIGEN_STRONG_INLINE Packet16f ptruncate<Packet16f>(const Packet16f& a)
|
|
263
|
+
{
|
|
264
|
+
return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
template<>
|
|
268
|
+
EIGEN_STRONG_INLINE Packet8d ptruncate<Packet8d>(const Packet8d& a)
|
|
269
|
+
{
|
|
270
|
+
return _mm512_roundscale_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
template<>
|
|
274
|
+
EIGEN_STRONG_INLINE Packet16i pcmpeq64<Packet16i>(const Packet16i& a, const Packet16i& b)
|
|
275
|
+
{
|
|
276
|
+
__mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_EQ);
|
|
277
|
+
return _mm512_movm_epi64(mask);
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
/// Converts eight 64-bit integers to doubles using an add/subtract-bias trick.
/// The integer literal 0x0018000000000000 equals 2^52 + 2^51 and is turned
/// into that double value by _mm512_set1_pd; its bit pattern is added to
/// the integers and the value is then subtracted in floating point.
/// NOTE(review): this kind of trick is normally only exact for inputs of
/// limited magnitude (presumably |x| < 2^51) - confirm against callers.
EIGEN_STRONG_INLINE __m512d int64_to_double_avx512(__m512i x) {
    const __m512d magic = _mm512_set1_pd(0x0018000000000000);
    x = padd64(x, _mm512_castpd_si512(magic));
    return _mm512_sub_pd(_mm512_castsi512_pd(x), magic);
}
|
|
284
|
+
|
|
285
|
+
/// Converts eight doubles to 64-bit integers: each lane is floored, the
/// bias 2^52 + 2^51 (literal 0x0018000000000000 converted to double) is
/// added in floating point, and the bias's bit pattern is then subtracted
/// with 64-bit integer arithmetic.
/// NOTE(review): presumably only exact for inputs of limited magnitude
/// (|x| < 2^51) - confirm against callers.
EIGEN_STRONG_INLINE __m512i double_to_int64_avx512(__m512d x) {
    const __m512d magic = _mm512_set1_pd(0x0018000000000000);
    x = _mm512_add_pd(_mm512_floor_pd(x), magic);
    return psub64(_mm512_castpd_si512(x), _mm512_castpd_si512(magic));
}
|
|
292
|
+
template<>
|
|
293
|
+
EIGEN_STRONG_INLINE Packet16i pcast64<Packet8d, Packet16i>(const Packet8d& a)
|
|
294
|
+
{
|
|
295
|
+
return double_to_int64_avx512(a);
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
template<>
|
|
299
|
+
EIGEN_STRONG_INLINE Packet8d pcast64<Packet16i, Packet8d>(const Packet16i& a)
|
|
300
|
+
{
|
|
301
|
+
return int64_to_double_avx512(a);
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
/// Sine of a double packet; forwards to the generic _psin helper.
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet8d psin<Packet8d>(const Packet8d& x)
{
    const Packet8d sine = _psin(x);
    return sine;
}
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
#endif
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file PacketFilter.h
|
|
3
|
+
* @author bab2min (bab2min@gmail.com)
|
|
4
|
+
* @brief
|
|
5
|
+
* @version 0.5.1
|
|
6
|
+
* @date 2024-09-08
|
|
7
|
+
*
|
|
8
|
+
* @copyright Copyright (c) 2020-2024
|
|
9
|
+
*
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
#ifndef EIGENRAND_PACKET_FILTER_AVX512_H
|
|
13
|
+
#define EIGENRAND_PACKET_FILTER_AVX512_H
|
|
14
|
+
|
|
15
|
+
#include <immintrin.h>
|
|
16
|
+
|
|
17
|
+
namespace Eigen
|
|
18
|
+
{
|
|
19
|
+
namespace Rand
|
|
20
|
+
{
|
|
21
|
+
namespace detail
|
|
22
|
+
{
|
|
23
|
+
template<>
|
|
24
|
+
class CompressMask<64>
|
|
25
|
+
{
|
|
26
|
+
CompressMask() {}
|
|
27
|
+
|
|
28
|
+
public:
|
|
29
|
+
enum { full_size = 16 };
|
|
30
|
+
static const CompressMask& get_inst()
|
|
31
|
+
{
|
|
32
|
+
static CompressMask cm;
|
|
33
|
+
return cm;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
template<typename Packet>
|
|
37
|
+
EIGEN_STRONG_INLINE int compress_append(Packet& _value, const Packet& _mask,
|
|
38
|
+
Packet& _rest, int rest_cnt, bool& full) const
|
|
39
|
+
{
|
|
40
|
+
auto& value = reinterpret_cast<internal::Packet16f&>(_value);
|
|
41
|
+
auto& mask = reinterpret_cast<const internal::Packet16f&>(_mask);
|
|
42
|
+
auto& rest = reinterpret_cast<internal::Packet16f&>(_rest);
|
|
43
|
+
|
|
44
|
+
const __mmask16 m = _mm512_movepi32_mask(_mm512_castps_si512(mask));
|
|
45
|
+
|
|
46
|
+
if (m == 0xFFFF)
|
|
47
|
+
{
|
|
48
|
+
full = true;
|
|
49
|
+
return rest_cnt;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
const int cnt_m = _mm_popcnt_u32(m);
|
|
53
|
+
|
|
54
|
+
const __m512i counting = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
55
|
+
__m512i rotate = _mm512_sub_epi32(counting, _mm512_set1_epi32(cnt_m));
|
|
56
|
+
__m512 rot_rest = _mm512_permutexvar_ps(rotate, rest);
|
|
57
|
+
|
|
58
|
+
__m512 p1 = _mm512_mask_compress_ps(rot_rest, m, value);
|
|
59
|
+
|
|
60
|
+
auto new_cnt = rest_cnt + cnt_m;
|
|
61
|
+
if (new_cnt >= full_size)
|
|
62
|
+
{
|
|
63
|
+
rest = rot_rest;
|
|
64
|
+
value = p1;
|
|
65
|
+
full = true;
|
|
66
|
+
return new_cnt - full_size;
|
|
67
|
+
}
|
|
68
|
+
else
|
|
69
|
+
{
|
|
70
|
+
rest = p1;
|
|
71
|
+
full = false;
|
|
72
|
+
return new_cnt;
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
#endif
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file RandUtils.h
|
|
3
|
+
* @author bab2min (bab2min@gmail.com)
|
|
4
|
+
* @brief
|
|
5
|
+
* @version 0.5.1
|
|
6
|
+
* @date 2024-09-08
|
|
7
|
+
*
|
|
8
|
+
* @copyright Copyright (c) 2020-2024
|
|
9
|
+
*
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
#ifndef EIGENRAND_RAND_UTILS_AVX512_H
|
|
13
|
+
#define EIGENRAND_RAND_UTILS_AVX512_H
|
|
14
|
+
|
|
15
|
+
#include <immintrin.h>
|
|
16
|
+
|
|
17
|
+
namespace Eigen
|
|
18
|
+
{
|
|
19
|
+
namespace internal
|
|
20
|
+
{
|
|
21
|
+
template<typename Rng>
|
|
22
|
+
struct RawbitsMaker<Packet8i, Rng, Packet16i, Rand::RandomEngineType::packet>
|
|
23
|
+
{
|
|
24
|
+
EIGEN_STRONG_INLINE Packet8i rawbits(Rng& rng)
|
|
25
|
+
{
|
|
26
|
+
return rng.half();
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
EIGEN_STRONG_INLINE Packet8i rawbits_34(Rng& rng)
|
|
30
|
+
{
|
|
31
|
+
return rng.half();
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
EIGEN_STRONG_INLINE Packet8i rawbits_half(Rng& rng)
|
|
35
|
+
{
|
|
36
|
+
return rng.half();
|
|
37
|
+
}
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
template<typename Rng>
|
|
41
|
+
struct RawbitsMaker<Packet16i, Rng, Packet8i, Rand::RandomEngineType::packet>
|
|
42
|
+
{
|
|
43
|
+
EIGEN_STRONG_INLINE Packet16i rawbits(Rng& rng)
|
|
44
|
+
{
|
|
45
|
+
return _mm512_inserti64x4(_mm512_castsi256_si512(rng()), rng(), 1);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
EIGEN_STRONG_INLINE Packet16i rawbits_34(Rng& rng)
|
|
49
|
+
{
|
|
50
|
+
return _mm512_inserti64x4(_mm512_castsi256_si512(rng()), rng(), 1);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
EIGEN_STRONG_INLINE Packet8i rawbits_half(Rng& rng)
|
|
54
|
+
{
|
|
55
|
+
return rng();
|
|
56
|
+
}
|
|
57
|
+
};
|
|
58
|
+
|
|
59
|
+
template<typename Rng, typename RngResult>
|
|
60
|
+
struct RawbitsMaker<Packet16i, Rng, RngResult, Rand::RandomEngineType::scalar_fullbit>
|
|
61
|
+
{
|
|
62
|
+
EIGEN_STRONG_INLINE Packet16i rawbits(Rng& rng)
|
|
63
|
+
{
|
|
64
|
+
if (sizeof(decltype(rng())) == 8)
|
|
65
|
+
{
|
|
66
|
+
return _mm512_set_epi64(rng(), rng(), rng(), rng(),
|
|
67
|
+
rng(), rng(), rng(), rng());
|
|
68
|
+
}
|
|
69
|
+
else
|
|
70
|
+
{
|
|
71
|
+
return _mm512_set_epi32(rng(), rng(), rng(), rng(),
|
|
72
|
+
rng(), rng(), rng(), rng(),
|
|
73
|
+
rng(), rng(), rng(), rng(),
|
|
74
|
+
rng(), rng(), rng(), rng());
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
EIGEN_STRONG_INLINE Packet16i rawbits_34(Rng& rng)
|
|
79
|
+
{
|
|
80
|
+
return rawbits(rng);
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
EIGEN_STRONG_INLINE Packet8i rawbits_half(Rng& rng)
|
|
84
|
+
{
|
|
85
|
+
if (sizeof(decltype(rng())) == 8)
|
|
86
|
+
{
|
|
87
|
+
return _mm256_set_epi64x(rng(), rng(), rng(), rng());
|
|
88
|
+
}
|
|
89
|
+
else
|
|
90
|
+
{
|
|
91
|
+
return _mm256_set_epi32(rng(), rng(), rng(), rng(),
|
|
92
|
+
rng(), rng(), rng(), rng());
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
};
|
|
96
|
+
|
|
97
|
+
template<typename Rng>
|
|
98
|
+
struct RawbitsMaker<Packet16i, Rng, Packet16i, Rand::RandomEngineType::packet>
|
|
99
|
+
{
|
|
100
|
+
EIGEN_STRONG_INLINE Packet16i rawbits(Rng& rng)
|
|
101
|
+
{
|
|
102
|
+
return rng();
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
EIGEN_STRONG_INLINE Packet16i rawbits_34(Rng& rng)
|
|
106
|
+
{
|
|
107
|
+
return rng();
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
EIGEN_STRONG_INLINE Packet8i rawbits_half(Rng& rng)
|
|
111
|
+
{
|
|
112
|
+
return rng.half();
|
|
113
|
+
}
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
template<typename Rng>
|
|
117
|
+
struct UniformRealUtils<Packet16f, Rng> : public RawbitsMaker<Packet16i, Rng>
|
|
118
|
+
{
|
|
119
|
+
EIGEN_STRONG_INLINE Packet16f zero_to_one(Rng& rng)
|
|
120
|
+
{
|
|
121
|
+
return pdiv(_mm512_cvtepi32_ps(pand(this->rawbits(rng), pset1<Packet16i>(0x7FFFFFFF))),
|
|
122
|
+
pset1<Packet16f>(0x7FFFFFFF));
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
EIGEN_STRONG_INLINE Packet16f uniform_real(Rng& rng)
|
|
126
|
+
{
|
|
127
|
+
return bit_to_ur_float(this->rawbits_34(rng));
|
|
128
|
+
}
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
template<typename Rng>
|
|
132
|
+
struct UniformRealUtils<Packet8d, Rng> : public RawbitsMaker<Packet16i, Rng>
|
|
133
|
+
{
|
|
134
|
+
EIGEN_STRONG_INLINE Packet8d zero_to_one(Rng& rng)
|
|
135
|
+
{
|
|
136
|
+
return pdiv(_mm512_cvtepi32_pd(pand(this->rawbits_half(rng), pset1<Packet8i>(0x7FFFFFFF))),
|
|
137
|
+
pset1<Packet8d>(0x7FFFFFFF));
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
EIGEN_STRONG_INLINE Packet8d uniform_real(Rng& rng)
|
|
141
|
+
{
|
|
142
|
+
return bit_to_ur_double(this->rawbits(rng));
|
|
143
|
+
}
|
|
144
|
+
};
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
#endif
|
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
* @file MorePacketMath.h
|
|
3
3
|
* @author bab2min (bab2min@gmail.com)
|
|
4
4
|
* @brief
|
|
5
|
-
* @version 0.
|
|
6
|
-
* @date
|
|
5
|
+
* @version 0.5.1
|
|
6
|
+
* @date 2024-09-08
|
|
7
7
|
*
|
|
8
|
-
* @copyright Copyright (c) 2020-
|
|
8
|
+
* @copyright Copyright (c) 2020-2024
|
|
9
9
|
*
|
|
10
10
|
*/
|
|
11
11
|
|
|
@@ -46,6 +46,9 @@ namespace Eigen
|
|
|
46
46
|
template<>
|
|
47
47
|
struct IsFloatPacket<Packet4f> : std::true_type {};
|
|
48
48
|
|
|
49
|
+
/// Trait specialization: flags Packet2d as a double-precision packet.
template<>
struct IsDoublePacket<Packet2d> : std::integral_constant<bool, true> {};
|
|
51
|
+
|
|
49
52
|
template<>
|
|
50
53
|
struct HalfPacket<Packet4i>
|
|
51
54
|
{
|
|
@@ -64,6 +67,11 @@ namespace Eigen
|
|
|
64
67
|
{
|
|
65
68
|
return x;
|
|
66
69
|
}
|
|
70
|
+
|
|
71
|
+
/// Views the 128 bits of an int32x4 packet as two doubles (bit
/// reinterpretation, no numeric conversion).
EIGEN_STRONG_INLINE Packet2d to_double(const Packet4i& x)
{
    const float64x2_t as_f64 = vreinterpretq_f64_s32(x);
    return (Packet2d)as_f64;
}
|
|
67
75
|
};
|
|
68
76
|
|
|
69
77
|
template<>
|
|
@@ -78,6 +86,30 @@ namespace Eigen
|
|
|
78
86
|
{
|
|
79
87
|
return (Packet4i)vreinterpretq_s32_f32(x);
|
|
80
88
|
}
|
|
89
|
+
|
|
90
|
+
/// Views the 128 bits of a float32x4 packet as two doubles (bit
/// reinterpretation, no numeric conversion).
EIGEN_STRONG_INLINE Packet2d to_double(const Packet4f& x)
{
    const float64x2_t as_f64 = vreinterpretq_f64_f32(x);
    return (Packet2d)as_f64;
}
|
|
94
|
+
};
|
|
95
|
+
|
|
96
|
+
/// Reinterprets the 128 bits of a Packet2d as another packet type
/// (vreinterpretq casts only, no numeric conversion).
template<>
struct reinterpreter<Packet2d>
{
    /// View the bits as four floats.
    EIGEN_STRONG_INLINE Packet4f to_float(const Packet2d& x)
    {
        const float32x4_t as_f32 = vreinterpretq_f32_f64(x);
        return as_f32;
    }

    /// Identity: the input already is a double packet.
    EIGEN_STRONG_INLINE Packet2d to_double(const Packet2d& x)
    {
        return x;
    }

    /// View the bits as four 32-bit integers.
    EIGEN_STRONG_INLINE Packet4i to_int(const Packet2d& x)
    {
        const int32x4_t as_s32 = vreinterpretq_s32_f64(x);
        return as_s32;
    }
};
|
|
82
114
|
|
|
83
115
|
template<>
|
|
@@ -192,6 +224,18 @@ namespace Eigen
|
|
|
192
224
|
return vreinterpretq_f32_u32(vcleq_f32(a, b));
|
|
193
225
|
}
|
|
194
226
|
|
|
227
|
+
/// "a < b" on double lanes; the unsigned comparison result from vcltq_f64
/// is reinterpreted as a double packet.
template<>
EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(const Packet2d& a, const Packet2d& b)
{
    const uint64x2_t is_less = vcltq_f64(a, b);
    return vreinterpretq_f64_u64(is_less);
}
|
|
232
|
+
|
|
233
|
+
/// "a <= b" on double lanes; the unsigned comparison result from vcleq_f64
/// is reinterpreted as a double packet.
template<>
EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(const Packet2d& a, const Packet2d& b)
{
    const uint64x2_t is_less_equal = vcleq_f64(a, b);
    return vreinterpretq_f64_u64(is_less_equal);
}
|
|
238
|
+
|
|
195
239
|
template<>
|
|
196
240
|
EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4f& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
|
|
197
241
|
{
|
|
@@ -210,6 +254,18 @@ namespace Eigen
|
|
|
210
254
|
return vbslq_s32(vreinterpretq_u32_s32(ifPacket), thenPacket, elsePacket);
|
|
211
255
|
}
|
|
212
256
|
|
|
257
|
+
/// Bitwise select on doubles with a double-typed mask: set mask bits take
/// `thenPacket`, clear bits take `elsePacket` (vbslq_f64).
template<>
EIGEN_STRONG_INLINE Packet2d pblendv(const Packet2d& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
{
    const uint64x2_t select_bits = vreinterpretq_u64_f64(ifPacket);
    return vbslq_f64(select_bits, thenPacket, elsePacket);
}
|
|
262
|
+
|
|
263
|
+
/// Bitwise select on doubles with an int32x4-typed mask: set mask bits take
/// `thenPacket`, clear bits take `elsePacket` (vbslq_f64).
template<>
EIGEN_STRONG_INLINE Packet2d pblendv(const Packet4i& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
{
    const uint64x2_t select_bits = vreinterpretq_u64_s32(ifPacket);
    return vbslq_f64(select_bits, thenPacket, elsePacket);
}
|
|
268
|
+
|
|
213
269
|
template<>
|
|
214
270
|
EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(const int* addr, const Packet4i& index)
|
|
215
271
|
{
|
|
@@ -256,6 +312,37 @@ namespace Eigen
|
|
|
256
312
|
{
|
|
257
313
|
return vrndq_f32(a);
|
|
258
314
|
}
|
|
315
|
+
|
|
316
|
+
/// Converts two doubles to two 64-bit integers (vcvtq_s64_f64) and returns
/// them typed as Packet4i.
template<>
EIGEN_STRONG_INLINE Packet4i pcast64<Packet2d, Packet4i>(const Packet2d& a)
{
    const int64x2_t as_int64 = vcvtq_s64_f64(a);
    return (Packet4i)as_int64;
}
|
|
321
|
+
|
|
322
|
+
/// Treats the packet as two 64-bit integers and converts them to doubles
/// (vcvtq_f64_s64).
template<>
EIGEN_STRONG_INLINE Packet2d pcast64<Packet4i, Packet2d>(const Packet4i& a)
{
    const int64x2_t as_int64 = (int64x2_t)a;
    return vcvtq_f64_s64(as_int64);
}
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
/// Adds two packets as pairs of 64-bit integers.
template<>
EIGEN_STRONG_INLINE Packet4i padd64<Packet4i>(const Packet4i& a, const Packet4i& b)
{
    const int64x2_t lhs = (int64x2_t)a;
    const int64x2_t rhs = (int64x2_t)b;
    return (Packet4i)vaddq_s64(lhs, rhs);
}
|
|
334
|
+
|
|
335
|
+
/// Subtracts (a - b) two packets as pairs of 64-bit integers.
template<>
EIGEN_STRONG_INLINE Packet4i psub64<Packet4i>(const Packet4i& a, const Packet4i& b)
{
    const int64x2_t lhs = (int64x2_t)a;
    const int64x2_t rhs = (int64x2_t)b;
    return (Packet4i)vsubq_s64(lhs, rhs);
}
|
|
340
|
+
|
|
341
|
+
/// Sine of a double packet; forwards to the generic _psin helper.
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d psin<Packet2d>(const Packet2d& x)
{
    const Packet2d sine = _psin(x);
    return sine;
}
|
|
259
346
|
|
|
260
347
|
template<>
|
|
261
348
|
EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
|
|
@@ -279,7 +366,35 @@ namespace Eigen
|
|
|
279
366
|
return vreinterpretq_s32_u64(vld1q_u64(u));
|
|
280
367
|
}
|
|
281
368
|
|
|
369
|
+
/// Reduces a float-typed mask: ANDs the low and high 64-bit halves, takes
/// the pairwise minimum of the two remaining 32-bit lanes, and returns
/// whether that value is non-zero.
template<>
EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
{
    const uint32x2_t lower = vget_low_u32(vreinterpretq_u32_f32(x));
    const uint32x2_t upper = vget_high_u32(vreinterpretq_u32_f32(x));
    const uint32x2_t folded = vand_u32(lower, upper);
    return vget_lane_u32(vpmin_u32(folded, folded), 0);
}
|
|
376
|
+
|
|
377
|
+
/// Integer-mask variant: reinterprets the bits as floats and defers to the
/// Packet4f overload.
template<>
EIGEN_STRONG_INLINE bool predux_all(const Packet4i& x)
{
    const Packet4f as_float = (Packet4f)vreinterpretq_f32_s32(x);
    return predux_all(as_float);
}
|
|
382
|
+
|
|
282
383
|
#ifdef EIGENRAND_EIGEN_33_MODE
|
|
384
|
+
/// Reduces a float-typed mask: ORs the low and high 64-bit halves, takes
/// the pairwise maximum of the two remaining 32-bit lanes, and returns
/// whether that value is non-zero.
template<>
EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
{
    const uint32x2_t lower = vget_low_u32(vreinterpretq_u32_f32(x));
    const uint32x2_t upper = vget_high_u32(vreinterpretq_u32_f32(x));
    const uint32x2_t folded = vorr_u32(lower, upper);
    return vget_lane_u32(vpmax_u32(folded, folded), 0);
}
|
|
391
|
+
|
|
392
|
+
/// Integer-mask variant: reinterprets the bits as floats and defers to the
/// Packet4f overload.
template<>
EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x)
{
    const Packet4f as_float = (Packet4f)vreinterpretq_f32_s32(x);
    return predux_any(as_float);
}
|
|
397
|
+
|
|
283
398
|
template<>
|
|
284
399
|
EIGEN_STRONG_INLINE Packet4f plog<Packet4f>(const Packet4f& _x)
|
|
285
400
|
{
|