@smake/eigen 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431)
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -20
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +187 -120
  6. package/eigen/Eigen/Eigenvalues +16 -13
  7. package/eigen/Eigen/Geometry +18 -18
  8. package/eigen/Eigen/Householder +9 -7
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -13
  11. package/eigen/Eigen/KLUSupport +23 -21
  12. package/eigen/Eigen/LU +15 -16
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -51
  15. package/eigen/Eigen/PaStiXSupport +23 -21
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -20
  18. package/eigen/Eigen/QtAlignedMalloc +5 -12
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -17
  21. package/eigen/Eigen/Sparse +1 -2
  22. package/eigen/Eigen/SparseCholesky +18 -15
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +9 -9
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
  37. package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
  42. package/eigen/Eigen/src/Core/Array.h +329 -370
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
  48. package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
  49. package/eigen/Eigen/src/Core/Block.h +371 -390
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
  53. package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
  59. package/eigen/Eigen/src/Core/DenseBase.h +630 -658
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
  61. package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +168 -207
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +167 -217
  67. package/eigen/Eigen/src/Core/EigenBase.h +74 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
  75. package/eigen/Eigen/src/Core/IO.h +131 -156
  76. package/eigen/Eigen/src/Core/IndexedView.h +209 -125
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +50 -59
  80. package/eigen/Eigen/src/Core/Map.h +123 -141
  81. package/eigen/Eigen/src/Core/MapBase.h +255 -282
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
  84. package/eigen/Eigen/src/Core/Matrix.h +463 -494
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
  86. package/eigen/Eigen/src/Core/NestByValue.h +58 -52
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -86
  88. package/eigen/Eigen/src/Core/NumTraits.h +206 -206
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
  92. package/eigen/Eigen/src/Core/Product.h +246 -130
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
  94. package/eigen/Eigen/src/Core/Random.h +153 -164
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +334 -314
  98. package/eigen/Eigen/src/Core/Ref.h +259 -257
  99. package/eigen/Eigen/src/Core/Replicate.h +92 -104
  100. package/eigen/Eigen/src/Core/Reshaped.h +215 -271
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +133 -148
  103. package/eigen/Eigen/src/Core/Select.h +68 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +88 -102
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
  109. package/eigen/Eigen/src/Core/SolverBase.h +132 -133
  110. package/eigen/Eigen/src/Core/StableNorm.h +113 -147
  111. package/eigen/Eigen/src/Core/StlIterators.h +404 -248
  112. package/eigen/Eigen/src/Core/Stride.h +90 -92
  113. package/eigen/Eigen/src/Core/Swap.h +70 -39
  114. package/eigen/Eigen/src/Core/Transpose.h +258 -295
  115. package/eigen/Eigen/src/Core/Transpositions.h +270 -333
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
  119. package/eigen/Eigen/src/Core/Visitor.h +464 -308
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
  217. package/eigen/Eigen/src/Core/util/Constants.h +297 -262
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
  226. package/eigen/Eigen/src/Core/util/Macros.h +655 -773
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +970 -748
  229. package/eigen/Eigen/src/Core/util/Meta.h +581 -633
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
  264. package/eigen/Eigen/src/Geometry/Transform.h +858 -936
  265. package/eigen/Eigen/src/Geometry/Translation.h +94 -92
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
  269. package/eigen/Eigen/src/Householder/Householder.h +102 -124
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
  285. package/eigen/Eigen/src/LU/Determinant.h +50 -69
  286. package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/package.json +1 -1
  409. package/eigen/COPYING.APACHE +0 -203
  410. package/eigen/COPYING.BSD +0 -26
  411. package/eigen/COPYING.GPL +0 -674
  412. package/eigen/COPYING.LGPL +0 -502
  413. package/eigen/COPYING.MINPACK +0 -51
  414. package/eigen/COPYING.MPL2 +0 -373
  415. package/eigen/COPYING.README +0 -18
  416. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
  417. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
  418. package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
  419. package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
  420. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  421. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  422. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  423. package/eigen/Eigen/src/misc/lapack.h +0 -152
  424. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
  425. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
  426. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  427. package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
  428. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  429. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
  430. package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
  431. package/eigen/README.md +0 -5
@@ -10,6 +10,9 @@
10
10
  #ifndef EIGEN_PACKET_MATH_AVX512_H
11
11
  #define EIGEN_PACKET_MATH_AVX512_H
12
12
 
13
+ // IWYU pragma: private
14
+ #include "../../InternalHeaderCheck.h"
15
+
13
16
  namespace Eigen {
14
17
 
15
18
  namespace internal {
@@ -31,9 +34,16 @@ namespace internal {
31
34
  typedef __m512 Packet16f;
32
35
  typedef __m512i Packet16i;
33
36
  typedef __m512d Packet8d;
37
+ typedef eigen_packet_wrapper<__m512i, 1> Packet8l;
38
+ #ifndef EIGEN_VECTORIZE_AVX512FP16
34
39
  typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
40
+ #endif
35
41
  typedef eigen_packet_wrapper<__m256i, 2> Packet16bf;
36
42
 
43
+ typedef eigen_packet_wrapper<__m512i, 6> Packet32s;
44
+ typedef eigen_packet_wrapper<__m256i, 6> Packet16s;
45
+ typedef eigen_packet_wrapper<__m128i, 6> Packet8s;
46
+
37
47
  template <>
38
48
  struct is_arithmetic<__m512> {
39
49
  enum { value = true };
@@ -46,8 +56,16 @@ template <>
46
56
  struct is_arithmetic<__m512d> {
47
57
  enum { value = true };
48
58
  };
59
+ template <>
60
+ struct is_arithmetic<Packet8l> {
61
+ enum { value = true };
62
+ };
49
63
 
50
- template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
64
+ #ifndef EIGEN_VECTORIZE_AVX512FP16
65
+ template <>
66
+ struct is_arithmetic<Packet16h> {
67
+ enum { value = true };
68
+ };
51
69
 
52
70
  template <>
53
71
  struct packet_traits<half> : default_packet_traits {
@@ -58,112 +76,114 @@ struct packet_traits<half> : default_packet_traits {
58
76
  Vectorizable = 1,
59
77
  AlignedOnScalar = 1,
60
78
  size = 16,
61
- HasHalfPacket = 1,
62
79
 
63
- HasCmp = 1,
64
- HasAdd = 1,
65
- HasSub = 1,
66
- HasMul = 1,
67
- HasDiv = 1,
80
+ HasCmp = 1,
81
+ HasAdd = 1,
82
+ HasSub = 1,
83
+ HasMul = 1,
84
+ HasDiv = 1,
68
85
  HasNegate = 1,
69
- HasAbs = 1,
70
- HasAbs2 = 0,
71
- HasMin = 1,
72
- HasMax = 1,
73
- HasConj = 1,
86
+ HasAbs = 1,
87
+ HasAbs2 = 0,
88
+ HasMin = 1,
89
+ HasMax = 1,
90
+ HasConj = 1,
74
91
  HasSetLinear = 0,
75
- HasLog = 1,
76
- HasLog1p = 1,
77
- HasExpm1 = 1,
78
- HasExp = 1,
79
- HasSqrt = 1,
80
- HasRsqrt = 1,
81
- HasSin = EIGEN_FAST_MATH,
82
- HasCos = EIGEN_FAST_MATH,
83
- HasTanh = EIGEN_FAST_MATH,
84
- HasErf = EIGEN_FAST_MATH,
85
- HasBlend = 0,
86
- HasRound = 1,
87
- HasFloor = 1,
88
- HasCeil = 1,
89
- HasRint = 1,
92
+ HasSqrt = 1,
93
+ HasRsqrt = 1,
94
+ HasLog = 1,
95
+ HasLog1p = 1,
96
+ HasExp = 1,
97
+ HasExpm1 = 1,
90
98
  HasBessel = 1,
91
- HasNdtri = 1
99
+ HasNdtri = 1,
100
+ HasSin = EIGEN_FAST_MATH,
101
+ HasCos = EIGEN_FAST_MATH,
102
+ HasTanh = EIGEN_FAST_MATH,
103
+ HasErf = EIGEN_FAST_MATH,
104
+ HasBlend = 0
92
105
  };
93
106
  };
107
+ #endif
94
108
 
95
- template<> struct packet_traits<float> : default_packet_traits
96
- {
109
+ template <>
110
+ struct packet_traits<float> : default_packet_traits {
97
111
  typedef Packet16f type;
98
112
  typedef Packet8f half;
99
113
  enum {
100
114
  Vectorizable = 1,
101
115
  AlignedOnScalar = 1,
102
116
  size = 16,
103
- HasHalfPacket = 1,
104
117
 
105
118
  HasAbs = 1,
106
- HasMin = 1,
107
- HasMax = 1,
108
- HasConj = 1,
109
- HasBlend = 0,
119
+ HasMin = 1,
120
+ HasMax = 1,
121
+ HasConj = 1,
122
+ HasBlend = 1,
110
123
  HasSin = EIGEN_FAST_MATH,
111
124
  HasCos = EIGEN_FAST_MATH,
112
- #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
125
+ HasACos = 1,
126
+ HasASin = 1,
127
+ HasATan = 1,
128
+ HasATanh = 1,
129
+ HasSqrt = 1,
130
+ HasRsqrt = 1,
131
+ HasCbrt = 1,
113
132
  HasLog = 1,
114
- HasLog1p = 1,
115
- HasExpm1 = 1,
133
+ HasLog1p = 1,
134
+ HasExpm1 = 1,
116
135
  HasNdtri = 1,
117
- HasBessel = 1,
136
+ HasBessel = 1,
118
137
  HasExp = 1,
119
- HasSqrt = EIGEN_FAST_MATH,
120
- HasRsqrt = EIGEN_FAST_MATH,
138
+ HasPow = 1,
139
+ HasReciprocal = EIGEN_FAST_MATH,
121
140
  HasTanh = EIGEN_FAST_MATH,
122
141
  HasErf = EIGEN_FAST_MATH,
123
- #endif
124
- HasCmp = 1,
125
- HasDiv = 1,
126
- HasRound = 1,
127
- HasFloor = 1,
128
- HasCeil = 1,
129
- HasRint = 1
142
+ HasErfc = EIGEN_FAST_MATH,
143
+ HasCmp = 1,
144
+ HasDiv = 1
130
145
  };
131
- };
132
- template<> struct packet_traits<double> : default_packet_traits
133
- {
146
+ };
147
+ template <>
148
+ struct packet_traits<double> : default_packet_traits {
134
149
  typedef Packet8d type;
135
150
  typedef Packet4d half;
136
151
  enum {
137
152
  Vectorizable = 1,
138
153
  AlignedOnScalar = 1,
139
154
  size = 8,
140
- HasHalfPacket = 1,
141
- #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
142
- HasLog = 1,
155
+ HasBlend = 1,
156
+ HasSqrt = 1,
157
+ HasRsqrt = 1,
158
+ HasCbrt = 1,
159
+ HasSin = EIGEN_FAST_MATH,
160
+ HasCos = EIGEN_FAST_MATH,
161
+ HasLog = 1,
143
162
  HasExp = 1,
144
- HasSqrt = EIGEN_FAST_MATH,
145
- HasRsqrt = EIGEN_FAST_MATH,
146
- #endif
147
- HasCmp = 1,
148
- HasDiv = 1,
149
- HasRound = 1,
150
- HasFloor = 1,
151
- HasCeil = 1,
152
- HasRint = 1
163
+ HasPow = 1,
164
+ HasATan = 1,
165
+ HasTanh = EIGEN_FAST_MATH,
166
+ HasErf = EIGEN_FAST_MATH,
167
+ HasErfc = EIGEN_FAST_MATH,
168
+ HasATanh = 1,
169
+ HasCmp = 1,
170
+ HasDiv = 1
153
171
  };
154
172
  };
155
173
 
156
- /* TODO Implement AVX512 for integers
157
- template<> struct packet_traits<int> : default_packet_traits
158
- {
174
+ template <>
175
+ struct packet_traits<int> : default_packet_traits {
159
176
  typedef Packet16i type;
160
- enum {
161
- Vectorizable = 1,
162
- AlignedOnScalar = 1,
163
- size=8
164
- };
177
+ typedef Packet8i half;
178
+ enum { Vectorizable = 1, AlignedOnScalar = 1, HasBlend = 0, HasCmp = 1, HasDiv = 1, size = 16 };
179
+ };
180
+
181
+ template <>
182
+ struct packet_traits<int64_t> : default_packet_traits {
183
+ typedef Packet8l type;
184
+ typedef Packet4l half;
185
+ enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 8 };
165
186
  };
166
- */
167
187
 
168
188
  template <>
169
189
  struct unpacket_traits<Packet16f> {
@@ -171,26 +191,102 @@ struct unpacket_traits<Packet16f> {
171
191
  typedef Packet8f half;
172
192
  typedef Packet16i integer_packet;
173
193
  typedef uint16_t mask_t;
174
- enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true };
194
+ enum {
195
+ size = 16,
196
+ alignment = Aligned64,
197
+ vectorizable = true,
198
+ masked_load_available = true,
199
+ masked_store_available = true,
200
+ masked_fpops_available = true
201
+ };
175
202
  };
176
203
  template <>
177
204
  struct unpacket_traits<Packet8d> {
178
205
  typedef double type;
179
206
  typedef Packet4d half;
180
- enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false };
207
+ typedef Packet8l integer_packet;
208
+ typedef uint8_t mask_t;
209
+ enum {
210
+ size = 8,
211
+ alignment = Aligned64,
212
+ vectorizable = true,
213
+ masked_load_available = true,
214
+ masked_store_available = true,
215
+ masked_fpops_available = true
216
+ };
181
217
  };
182
218
  template <>
183
219
  struct unpacket_traits<Packet16i> {
184
220
  typedef int type;
185
221
  typedef Packet8i half;
186
- enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false };
222
+ enum {
223
+ size = 16,
224
+ alignment = Aligned64,
225
+ vectorizable = true,
226
+ masked_load_available = false,
227
+ masked_store_available = false
228
+ };
229
+ };
230
+
231
+ template <>
232
+ struct unpacket_traits<Packet8l> {
233
+ typedef int64_t type;
234
+ typedef Packet4l half;
235
+ enum {
236
+ size = 8,
237
+ alignment = Aligned64,
238
+ vectorizable = true,
239
+ masked_load_available = false,
240
+ masked_store_available = false
241
+ };
187
242
  };
188
243
 
189
- template<>
244
+ #ifndef EIGEN_VECTORIZE_AVX512FP16
245
+ template <>
190
246
  struct unpacket_traits<Packet16h> {
191
247
  typedef Eigen::half type;
192
248
  typedef Packet8h half;
193
- enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
249
+ enum {
250
+ size = 16,
251
+ alignment = Aligned32,
252
+ vectorizable = true,
253
+ masked_load_available = false,
254
+ masked_store_available = false
255
+ };
256
+ };
257
+ #endif
258
+
259
+ template <>
260
+ struct unpacket_traits<Packet32s> {
261
+ typedef numext::int16_t type;
262
+ typedef Packet16s half;
263
+ enum {
264
+ size = 32,
265
+ alignment = Aligned64,
266
+ vectorizable = false,
267
+ };
268
+ };
269
+
270
+ template <>
271
+ struct unpacket_traits<Packet16s> {
272
+ typedef numext::int16_t type;
273
+ typedef Packet8s half;
274
+ enum {
275
+ size = 16,
276
+ alignment = Aligned32,
277
+ vectorizable = false,
278
+ };
279
+ };
280
+
281
+ template <>
282
+ struct unpacket_traits<Packet8s> {
283
+ typedef numext::int16_t type;
284
+ typedef Packet8s half;
285
+ enum {
286
+ size = 8,
287
+ alignment = Aligned16,
288
+ vectorizable = false,
289
+ };
194
290
  };
195
291
 
196
292
  template <>
@@ -205,6 +301,10 @@ template <>
205
301
  EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
206
302
  return _mm512_set1_epi32(from);
207
303
  }
304
+ template <>
305
+ EIGEN_STRONG_INLINE Packet8l pset1<Packet8l>(const int64_t& from) {
306
+ return _mm512_set1_epi64(from);
307
+ }
208
308
 
209
309
  template <>
210
310
  EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(unsigned int from) {
@@ -216,84 +316,151 @@ EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(const numext::uint64_t from
216
316
  return _mm512_castsi512_pd(_mm512_set1_epi64(from));
217
317
  }
218
318
 
219
- template<> EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) { return _mm512_setzero_ps(); }
220
- template<> EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) { return _mm512_setzero_pd(); }
221
- template<> EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) { return _mm512_setzero_si512(); }
319
+ template <>
320
+ EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) {
321
+ return _mm512_setzero_ps();
322
+ }
323
+ template <>
324
+ EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) {
325
+ return _mm512_setzero_pd();
326
+ }
327
+ template <>
328
+ EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) {
329
+ return _mm512_setzero_si512();
330
+ }
331
+
332
+ template <>
333
+ EIGEN_STRONG_INLINE Packet8l pzero(const Packet8l& /*a*/) {
334
+ return _mm512_setzero_si512();
335
+ }
222
336
 
223
- template<> EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
224
- return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
225
- 0, -1, 0, -1, 0, -1, 0, -1));
337
+ template <>
338
+ EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
339
+ return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1));
226
340
  }
227
- template<> EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
228
- return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
229
- 0, -1, 0, -1, 0, -1, 0, -1);
341
+ template <>
342
+ EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
343
+ return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
230
344
  }
231
- template<> EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
232
- return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
233
- 0, 0, -1, -1, 0, 0, -1, -1));
345
+ template <>
346
+ EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
347
+ return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1));
348
+ }
349
+ template <>
350
+ EIGEN_STRONG_INLINE Packet8l peven_mask(const Packet8l& /*a*/) {
351
+ return _mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1);
234
352
  }
235
353
 
236
354
  template <>
237
355
  EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
356
+ #if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
357
+ // Inline asm here helps reduce some register spilling in TRSM kernels.
358
+ // See note in unrolls::gemm::microKernel in TrsmKernel.h
359
+ Packet16f ret;
360
+ __asm__("vbroadcastss %[mem], %[dst]" : [dst] "=v"(ret) : [mem] "m"(*from));
361
+ return ret;
362
+ #else
238
363
  return _mm512_broadcastss_ps(_mm_load_ps1(from));
364
+ #endif
239
365
  }
240
366
  template <>
241
367
  EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
368
+ #if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
369
+ Packet8d ret;
370
+ __asm__("vbroadcastsd %[mem], %[dst]" : [dst] "=v"(ret) : [mem] "m"(*from));
371
+ return ret;
372
+ #else
242
373
  return _mm512_set1_pd(*from);
374
+ #endif
243
375
  }
244
376
 
245
377
  template <>
246
378
  EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
247
- return _mm512_add_ps(
248
- _mm512_set1_ps(a),
249
- _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
250
- 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
379
+ return _mm512_add_ps(_mm512_set1_ps(a), _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f,
380
+ 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
251
381
  }
252
382
  template <>
253
383
  EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
254
- return _mm512_add_pd(_mm512_set1_pd(a),
255
- _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
384
+ return _mm512_add_pd(_mm512_set1_pd(a), _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
385
+ }
386
+ template <>
387
+ EIGEN_STRONG_INLINE Packet16i plset<Packet16i>(const int& a) {
388
+ return _mm512_add_epi32(_mm512_set1_epi32(a), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
389
+ }
390
+ template <>
391
+ EIGEN_STRONG_INLINE Packet8l plset<Packet8l>(const int64_t& a) {
392
+ return _mm512_add_epi64(_mm512_set1_epi64(a), _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0));
256
393
  }
257
394
 
258
395
  template <>
259
- EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
260
- const Packet16f& b) {
396
+ EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
261
397
  return _mm512_add_ps(a, b);
262
398
  }
263
399
  template <>
264
- EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
265
- const Packet8d& b) {
400
+ EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a, const Packet8d& b) {
266
401
  return _mm512_add_pd(a, b);
267
402
  }
268
403
  template <>
269
- EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
270
- const Packet16i& b) {
404
+ EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a, const Packet16i& b) {
271
405
  return _mm512_add_epi32(a, b);
272
406
  }
407
+ template <>
408
+ EIGEN_STRONG_INLINE Packet8l padd<Packet8l>(const Packet8l& a, const Packet8l& b) {
409
+ return _mm512_add_epi64(a, b);
410
+ }
411
+
412
+ template <>
413
+ EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b, uint16_t umask) {
414
+ __mmask16 mask = static_cast<__mmask16>(umask);
415
+ return _mm512_maskz_add_ps(mask, a, b);
416
+ }
417
+ template <>
418
+ EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a, const Packet8d& b, uint8_t umask) {
419
+ __mmask8 mask = static_cast<__mmask8>(umask);
420
+ return _mm512_maskz_add_pd(mask, a, b);
421
+ }
273
422
 
274
423
  template <>
275
- EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
276
- const Packet16f& b) {
424
+ EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
277
425
  return _mm512_sub_ps(a, b);
278
426
  }
279
427
  template <>
280
- EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
281
- const Packet8d& b) {
428
+ EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a, const Packet8d& b) {
282
429
  return _mm512_sub_pd(a, b);
283
430
  }
284
431
  template <>
285
- EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
286
- const Packet16i& b) {
432
+ EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a, const Packet16i& b) {
287
433
  return _mm512_sub_epi32(a, b);
288
434
  }
435
+ template <>
436
+ EIGEN_STRONG_INLINE Packet8l psub<Packet8l>(const Packet8l& a, const Packet8l& b) {
437
+ return _mm512_sub_epi64(a, b);
438
+ }
289
439
 
290
440
  template <>
291
441
  EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
292
- return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
442
+ // NOTE: MSVC seems to struggle with _mm512_set1_epi32, leading to random results.
443
+ // The intel docs give it a relatively high latency as well, so we're probably
444
+ // better off with using _mm512_set_epi32 directly anyways.
445
+ const __m512i mask =
446
+ _mm512_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
447
+ 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
448
+ return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask));
293
449
  }
294
450
  template <>
295
451
  EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
296
- return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
452
+ const __m512i mask =
453
+ _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL,
454
+ 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL);
455
+ return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask));
456
+ }
457
+ template <>
458
+ EIGEN_STRONG_INLINE Packet16i pnegate(const Packet16i& a) {
459
+ return _mm512_sub_epi32(_mm512_setzero_si512(), a);
460
+ }
461
+ template <>
462
+ EIGEN_STRONG_INLINE Packet8l pnegate(const Packet8l& a) {
463
+ return _mm512_sub_epi64(_mm512_setzero_si512(), a);
297
464
  }
298
465
 
299
466
  template <>
@@ -308,144 +475,217 @@ template <>
308
475
  EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
309
476
  return a;
310
477
  }
478
+ template <>
479
+ EIGEN_STRONG_INLINE Packet8l pconj(const Packet8l& a) {
480
+ return a;
481
+ }
311
482
 
312
483
  template <>
313
- EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
314
- const Packet16f& b) {
484
+ EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
315
485
  return _mm512_mul_ps(a, b);
316
486
  }
317
487
  template <>
318
- EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
319
- const Packet8d& b) {
488
+ EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a, const Packet8d& b) {
320
489
  return _mm512_mul_pd(a, b);
321
490
  }
322
491
  template <>
323
- EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
324
- const Packet16i& b) {
492
+ EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a, const Packet16i& b) {
325
493
  return _mm512_mullo_epi32(a, b);
326
494
  }
495
+ template <>
496
+ EIGEN_STRONG_INLINE Packet8l pmul<Packet8l>(const Packet8l& a, const Packet8l& b) {
497
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
498
+ return _mm512_mullo_epi64(a, b);
499
+ #else
500
+ return _mm512_mullox_epi64(a, b);
501
+ #endif
502
+ }
327
503
 
328
504
  template <>
329
- EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
330
- const Packet16f& b) {
505
+ EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a, const Packet16f& b) {
331
506
  return _mm512_div_ps(a, b);
332
507
  }
508
+
333
509
  template <>
334
- EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
335
- const Packet8d& b) {
510
+ EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a, const Packet8d& b) {
336
511
  return _mm512_div_pd(a, b);
337
512
  }
338
513
 
514
+ template <>
515
+ EIGEN_STRONG_INLINE Packet16i pdiv<Packet16i>(const Packet16i& a, const Packet16i& b) {
516
+ Packet8i q_lo = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0));
517
+ Packet8i q_hi = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1));
518
+ return _mm512_inserti64x4(_mm512_castsi256_si512(q_lo), q_hi, 1);
519
+ }
520
+
339
521
  #ifdef EIGEN_VECTORIZE_FMA
340
522
  template <>
341
- EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
342
- const Packet16f& c) {
523
+ EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
343
524
  return _mm512_fmadd_ps(a, b, c);
344
525
  }
345
526
  template <>
346
- EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
347
- const Packet8d& c) {
527
+ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
348
528
  return _mm512_fmadd_pd(a, b, c);
349
529
  }
530
+
531
+ template <>
532
+ EIGEN_STRONG_INLINE Packet16f pmsub(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
533
+ return _mm512_fmsub_ps(a, b, c);
534
+ }
535
+ template <>
536
+ EIGEN_STRONG_INLINE Packet8d pmsub(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
537
+ return _mm512_fmsub_pd(a, b, c);
538
+ }
539
+
540
+ template <>
541
+ EIGEN_STRONG_INLINE Packet16f pnmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
542
+ return _mm512_fnmadd_ps(a, b, c);
543
+ }
544
+ template <>
545
+ EIGEN_STRONG_INLINE Packet8d pnmadd(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
546
+ return _mm512_fnmadd_pd(a, b, c);
547
+ }
548
+
549
+ template <>
550
+ EIGEN_STRONG_INLINE Packet16f pnmsub(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
551
+ return _mm512_fnmsub_ps(a, b, c);
552
+ }
553
+ template <>
554
+ EIGEN_STRONG_INLINE Packet8d pnmsub(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
555
+ return _mm512_fnmsub_pd(a, b, c);
556
+ }
350
557
  #endif
351
558
 
352
559
  template <>
353
- EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask,
354
- const Packet16f& a,
355
- const Packet16f& b) {
356
- __mmask16 mask16 = _mm512_cmp_epi32_mask(
357
- _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
560
+ EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
561
+ __mmask16 mask16 = _mm512_cmpeq_epi32_mask(_mm512_castps_si512(mask), _mm512_setzero_epi32());
358
562
  return _mm512_mask_blend_ps(mask16, a, b);
359
563
  }
360
564
 
361
565
  template <>
362
- EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask,
363
- const Packet8d& a,
364
- const Packet8d& b) {
365
- __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask),
366
- _mm512_setzero_epi32(), _MM_CMPINT_EQ);
566
+ EIGEN_DEVICE_FUNC inline Packet16i pselect(const Packet16i& mask, const Packet16i& a, const Packet16i& b) {
567
+ __mmask16 mask16 = _mm512_cmpeq_epi32_mask(mask, _mm512_setzero_epi32());
568
+ return _mm512_mask_blend_epi32(mask16, a, b);
569
+ }
570
+
571
+ template <>
572
+ EIGEN_DEVICE_FUNC inline Packet8l pselect(const Packet8l& mask, const Packet8l& a, const Packet8l& b) {
573
+ __mmask8 mask8 = _mm512_cmpeq_epi64_mask(mask, _mm512_setzero_si512());
574
+ return _mm512_mask_blend_epi64(mask8, a, b);
575
+ }
576
+
577
+ template <>
578
+ EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask, const Packet8d& a, const Packet8d& b) {
579
+ __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
367
580
  return _mm512_mask_blend_pd(mask8, a, b);
368
581
  }
369
582
 
370
583
  template <>
371
- EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
372
- const Packet16f& b) {
584
+ EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a, const Packet16f& b) {
373
585
  // Arguments are reversed to match NaN propagation behavior of std::min.
374
586
  return _mm512_min_ps(b, a);
375
587
  }
376
588
  template <>
377
- EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
378
- const Packet8d& b) {
589
+ EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a, const Packet8d& b) {
379
590
  // Arguments are reversed to match NaN propagation behavior of std::min.
380
591
  return _mm512_min_pd(b, a);
381
592
  }
593
+ template <>
594
+ EIGEN_STRONG_INLINE Packet16i pmin<Packet16i>(const Packet16i& a, const Packet16i& b) {
595
+ return _mm512_min_epi32(b, a);
596
+ }
597
+ template <>
598
+ EIGEN_STRONG_INLINE Packet8l pmin<Packet8l>(const Packet8l& a, const Packet8l& b) {
599
+ return _mm512_min_epi64(b, a);
600
+ }
382
601
 
383
602
  template <>
384
- EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
385
- const Packet16f& b) {
603
+ EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a, const Packet16f& b) {
386
604
  // Arguments are reversed to match NaN propagation behavior of std::max.
387
605
  return _mm512_max_ps(b, a);
388
606
  }
389
607
  template <>
390
- EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
391
- const Packet8d& b) {
608
+ EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a, const Packet8d& b) {
392
609
  // Arguments are reversed to match NaN propagation behavior of std::max.
393
610
  return _mm512_max_pd(b, a);
394
611
  }
612
+ template <>
613
+ EIGEN_STRONG_INLINE Packet16i pmax<Packet16i>(const Packet16i& a, const Packet16i& b) {
614
+ return _mm512_max_epi32(b, a);
615
+ }
616
+ template <>
617
+ EIGEN_STRONG_INLINE Packet8l pmax<Packet8l>(const Packet8l& a, const Packet8l& b) {
618
+ return _mm512_max_epi64(b, a);
619
+ }
395
620
 
396
- // Add specializations for min/max with prescribed NaN progation.
397
- template<>
621
+ // Add specializations for min/max with prescribed NaN propagation.
622
+ template <>
398
623
  EIGEN_STRONG_INLINE Packet16f pmin<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
399
624
  return pminmax_propagate_numbers(a, b, pmin<Packet16f>);
400
625
  }
401
- template<>
626
+ template <>
402
627
  EIGEN_STRONG_INLINE Packet8d pmin<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
403
628
  return pminmax_propagate_numbers(a, b, pmin<Packet8d>);
404
629
  }
405
- template<>
630
+ template <>
406
631
  EIGEN_STRONG_INLINE Packet16f pmax<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
407
632
  return pminmax_propagate_numbers(a, b, pmax<Packet16f>);
408
633
  }
409
- template<>
634
+ template <>
410
635
  EIGEN_STRONG_INLINE Packet8d pmax<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
411
636
  return pminmax_propagate_numbers(a, b, pmax<Packet8d>);
412
637
  }
413
- template<>
638
+ template <>
414
639
  EIGEN_STRONG_INLINE Packet16f pmin<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
415
640
  return pminmax_propagate_nan(a, b, pmin<Packet16f>);
416
641
  }
417
- template<>
642
+ template <>
418
643
  EIGEN_STRONG_INLINE Packet8d pmin<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
419
644
  return pminmax_propagate_nan(a, b, pmin<Packet8d>);
420
645
  }
421
- template<>
646
+ template <>
422
647
  EIGEN_STRONG_INLINE Packet16f pmax<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
423
648
  return pminmax_propagate_nan(a, b, pmax<Packet16f>);
424
649
  }
425
- template<>
650
+ template <>
426
651
  EIGEN_STRONG_INLINE Packet8d pmax<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
427
652
  return pminmax_propagate_nan(a, b, pmax<Packet8d>);
428
653
  }
429
654
 
430
-
431
655
  #ifdef EIGEN_VECTORIZE_AVX512DQ
432
- template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
433
- template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
434
- EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
656
+ template <int I_>
657
+ EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
658
+ return _mm512_extractf32x8_ps(x, I_);
659
+ }
660
+ template <int I_>
661
+ EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
662
+ return _mm512_extractf64x2_pd(x, I_);
663
+ }
664
+ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
665
+ return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1);
666
+ }
667
+ EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
668
+ return _mm512_inserti32x8(_mm512_castsi256_si512(a), b, 1);
669
+ }
435
670
  #else
436
671
  // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
437
- template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
438
- return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
672
+ template <int I_>
673
+ EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
674
+ return _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(x), I_));
439
675
  }
440
676
 
441
677
  // AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512
442
- template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
443
- return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
678
+ template <int I_>
679
+ EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
680
+ return _mm_castsi128_pd(_mm512_extracti32x4_epi32(_mm512_castpd_si512(x), I_));
444
681
  }
445
682
 
446
683
  EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
447
- return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
448
- _mm256_castps_si256(b),1));
684
+ return _mm512_castsi512_ps(
685
+ _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), _mm256_castps_si256(b), 1));
686
+ }
687
+ EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
688
+ return _mm512_inserti64x4(_mm512_castsi256_si512(a), b, 1);
449
689
  }
450
690
  #endif
451
691
 
@@ -461,80 +701,137 @@ EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
461
701
  // dst[255:240] := Saturate16(rf[255:224])
462
702
  __m256i lo = _mm256_castps_si256(extract256<0>(rf));
463
703
  __m256i hi = _mm256_castps_si256(extract256<1>(rf));
464
- __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
465
- _mm256_extractf128_si256(lo, 1));
466
- __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
467
- _mm256_extractf128_si256(hi, 1));
704
+ __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), _mm256_extractf128_si256(lo, 1));
705
+ __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), _mm256_extractf128_si256(hi, 1));
468
706
  return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
469
707
  }
470
708
 
709
+ template <>
710
+ EIGEN_STRONG_INLINE Packet16f pisnan(const Packet16f& a) {
711
+ __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_UNORD_Q);
712
+ return _mm512_castsi512_ps(_mm512_maskz_set1_epi32(mask, int32_t(-1)));
713
+ }
714
+
471
715
  template <>
472
716
  EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
473
717
  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
474
- return _mm512_castsi512_ps(
475
- _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
718
+ return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
476
719
  }
477
- template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
720
+ template <>
721
+ EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
478
722
  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
479
- return _mm512_castsi512_ps(
480
- _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
723
+ return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
481
724
  }
482
725
 
483
- template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
726
+ template <>
727
+ EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
484
728
  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
485
- return _mm512_castsi512_ps(
486
- _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
729
+ return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
487
730
  }
488
731
 
489
- template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
732
+ template <>
733
+ EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
490
734
  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
491
- return _mm512_castsi512_ps(
492
- _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
735
+ return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
493
736
  }
494
737
 
495
- template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
496
- __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ);
497
- return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
738
+ template <>
739
+ EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
740
+ __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
741
+ return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
742
+ }
743
+ template <>
744
+ EIGEN_STRONG_INLINE Packet16i pcmp_le(const Packet16i& a, const Packet16i& b) {
745
+ __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LE);
746
+ return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
747
+ }
748
+ template <>
749
+ EIGEN_STRONG_INLINE Packet16i pcmp_lt(const Packet16i& a, const Packet16i& b) {
750
+ __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
751
+ return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
498
752
  }
499
753
 
754
+ template <>
755
+ EIGEN_STRONG_INLINE Packet8l pcmp_eq(const Packet8l& a, const Packet8l& b) {
756
+ __mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_EQ);
757
+ return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
758
+ }
759
+ template <>
760
+ EIGEN_STRONG_INLINE Packet8l pcmp_le(const Packet8l& a, const Packet8l& b) {
761
+ __mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LE);
762
+ return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
763
+ }
764
+ template <>
765
+ EIGEN_STRONG_INLINE Packet8l pcmp_lt(const Packet8l& a, const Packet8l& b) {
766
+ __mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LT);
767
+ return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
768
+ }
500
769
 
501
770
  template <>
502
771
  EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
503
772
  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
504
- return _mm512_castsi512_pd(
505
- _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
773
+ return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
506
774
  }
507
775
  template <>
508
776
  EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) {
509
777
  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
510
- return _mm512_castsi512_pd(
511
- _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
778
+ return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
512
779
  }
513
780
  template <>
514
781
  EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) {
515
782
  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
516
- return _mm512_castsi512_pd(
517
- _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
783
+ return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
518
784
  }
519
785
  template <>
520
786
  EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) {
521
787
  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
522
- return _mm512_castsi512_pd(
523
- _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
788
+ return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
524
789
  }
525
790
 
526
- template<> EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION); }
527
- template<> EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION); }
791
+ template <>
792
+ EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) {
793
+ return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION);
794
+ }
795
+ template <>
796
+ EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) {
797
+ return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION);
798
+ }
528
799
 
529
- template<> EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF); }
530
- template<> EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF); }
800
+ template <>
801
+ EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) {
802
+ return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF);
803
+ }
804
+ template <>
805
+ EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) {
806
+ return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF);
807
+ }
531
808
 
532
- template<> EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF); }
533
- template<> EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF); }
809
+ template <>
810
+ EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) {
811
+ return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF);
812
+ }
813
+ template <>
814
+ EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) {
815
+ return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF);
816
+ }
817
+
818
+ template <>
819
+ EIGEN_STRONG_INLINE Packet16f ptrunc<Packet16f>(const Packet16f& a) {
820
+ return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO);
821
+ }
822
+ template <>
823
+ EIGEN_STRONG_INLINE Packet8d ptrunc<Packet8d>(const Packet8d& a) {
824
+ return _mm512_roundscale_pd(a, _MM_FROUND_TO_ZERO);
825
+ }
534
826
 
535
827
  template <>
536
828
  EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
537
- return _mm512_set1_epi32(0xffffffffu);
829
+ return _mm512_set1_epi32(int32_t(-1));
830
+ }
831
+
832
+ template <>
833
+ EIGEN_STRONG_INLINE Packet8l ptrue<Packet8l>(const Packet8l& /*a*/) {
834
+ return _mm512_set1_epi64(int64_t(-1));
538
835
  }
539
836
 
540
837
  template <>
@@ -548,23 +845,25 @@ EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) {
548
845
  }
549
846
 
550
847
  template <>
551
- EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
552
- const Packet16i& b) {
553
- return _mm512_and_si512(a,b);
848
+ EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a, const Packet16i& b) {
849
+ return _mm512_and_si512(a, b);
554
850
  }
555
851
 
556
852
  template <>
557
- EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
558
- const Packet16f& b) {
853
+ EIGEN_STRONG_INLINE Packet8l pand<Packet8l>(const Packet8l& a, const Packet8l& b) {
854
+ return _mm512_and_si512(a, b);
855
+ }
856
+
857
+ template <>
858
+ EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a, const Packet16f& b) {
559
859
  #ifdef EIGEN_VECTORIZE_AVX512DQ
560
860
  return _mm512_and_ps(a, b);
561
861
  #else
562
- return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
862
+ return _mm512_castsi512_ps(pand(_mm512_castps_si512(a), _mm512_castps_si512(b)));
563
863
  #endif
564
864
  }
565
865
  template <>
566
- EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
567
- const Packet8d& b) {
866
+ EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a, const Packet8d& b) {
568
867
  #ifdef EIGEN_VECTORIZE_AVX512DQ
569
868
  return _mm512_and_pd(a, b);
570
869
  #else
@@ -584,22 +883,26 @@ EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i
584
883
  return _mm512_or_si512(a, b);
585
884
  }
586
885
 
886
+ template <>
887
+ EIGEN_STRONG_INLINE Packet8l por<Packet8l>(const Packet8l& a, const Packet8l& b) {
888
+ return _mm512_or_si512(a, b);
889
+ }
890
+
587
891
  template <>
588
892
  EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
589
893
  #ifdef EIGEN_VECTORIZE_AVX512DQ
590
894
  return _mm512_or_ps(a, b);
591
895
  #else
592
- return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
896
+ return _mm512_castsi512_ps(por(_mm512_castps_si512(a), _mm512_castps_si512(b)));
593
897
  #endif
594
898
  }
595
899
 
596
900
  template <>
597
- EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
598
- const Packet8d& b) {
901
+ EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a, const Packet8d& b) {
599
902
  #ifdef EIGEN_VECTORIZE_AVX512DQ
600
903
  return _mm512_or_pd(a, b);
601
904
  #else
602
- return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
905
+ return _mm512_castsi512_pd(por(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
603
906
  #endif
604
907
  }
605
908
 
@@ -608,12 +911,17 @@ EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16
608
911
  return _mm512_xor_si512(a, b);
609
912
  }
610
913
 
914
+ template <>
915
+ EIGEN_STRONG_INLINE Packet8l pxor<Packet8l>(const Packet8l& a, const Packet8l& b) {
916
+ return _mm512_xor_si512(a, b);
917
+ }
918
+
611
919
  template <>
612
920
  EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
613
921
  #ifdef EIGEN_VECTORIZE_AVX512DQ
614
922
  return _mm512_xor_ps(a, b);
615
923
  #else
616
- return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
924
+ return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a), _mm512_castps_si512(b)));
617
925
  #endif
618
926
  }
619
927
 
@@ -622,7 +930,7 @@ EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b
622
930
  #ifdef EIGEN_VECTORIZE_AVX512DQ
623
931
  return _mm512_xor_pd(a, b);
624
932
  #else
625
- return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
933
+ return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
626
934
  #endif
627
935
  }
628
936
 
@@ -631,50 +939,73 @@ EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packe
631
939
  return _mm512_andnot_si512(b, a);
632
940
  }
633
941
 
942
+ template <>
943
+ EIGEN_STRONG_INLINE Packet8l pandnot<Packet8l>(const Packet8l& a, const Packet8l& b) {
944
+ return _mm512_andnot_si512(b, a);
945
+ }
946
+
634
947
  template <>
635
948
  EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
636
949
  #ifdef EIGEN_VECTORIZE_AVX512DQ
637
950
  return _mm512_andnot_ps(b, a);
638
951
  #else
639
- return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
952
+ return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a), _mm512_castps_si512(b)));
640
953
  #endif
641
954
  }
642
955
  template <>
643
- EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
956
+ EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a, const Packet8d& b) {
644
957
  #ifdef EIGEN_VECTORIZE_AVX512DQ
645
958
  return _mm512_andnot_pd(b, a);
646
959
  #else
647
- return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
960
+ return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
648
961
  #endif
649
962
  }
650
963
 
651
- template<> EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a)
652
- {
964
+ template <>
965
+ EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a) {
653
966
  // Work-around for default std::round rounding mode.
654
967
  const Packet16f mask = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x80000000u));
655
968
  const Packet16f prev0dot5 = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
656
969
  return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
657
970
  }
658
- template<> EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a)
659
- {
971
+ template <>
972
+ EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a) {
660
973
  // Work-around for default std::round rounding mode.
661
974
  const Packet8d mask = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
662
975
  const Packet8d prev0dot5 = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
663
976
  return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
664
977
  }
665
978
 
666
- template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
979
+ template <int N>
980
+ EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
667
981
  return _mm512_srai_epi32(a, N);
668
982
  }
669
983
 
670
- template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
984
+ template <int N>
985
+ EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
671
986
  return _mm512_srli_epi32(a, N);
672
987
  }
673
988
 
674
- template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
989
+ template <int N>
990
+ EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
675
991
  return _mm512_slli_epi32(a, N);
676
992
  }
677
993
 
994
+ template <int N>
995
+ EIGEN_STRONG_INLINE Packet8l parithmetic_shift_right(Packet8l a) {
996
+ return _mm512_srai_epi64(a, N);
997
+ }
998
+
999
+ template <int N>
1000
+ EIGEN_STRONG_INLINE Packet8l plogical_shift_right(Packet8l a) {
1001
+ return _mm512_srli_epi64(a, N);
1002
+ }
1003
+
1004
+ template <int N>
1005
+ EIGEN_STRONG_INLINE Packet8l plogical_shift_left(Packet8l a) {
1006
+ return _mm512_slli_epi64(a, N);
1007
+ }
1008
+
678
1009
  template <>
679
1010
  EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
680
1011
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
@@ -685,8 +1016,11 @@ EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) {
685
1016
  }
686
1017
  template <>
687
1018
  EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
688
- EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
689
- reinterpret_cast<const __m512i*>(from));
1019
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_epi64(from);
1020
+ }
1021
+ template <>
1022
+ EIGEN_STRONG_INLINE Packet8l pload<Packet8l>(const int64_t* from) {
1023
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_epi64(from);
690
1024
  }
691
1025
 
692
1026
  template <>
@@ -699,8 +1033,11 @@ EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
699
1033
  }
700
1034
  template <>
701
1035
  EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
702
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
703
- reinterpret_cast<const __m512i*>(from));
1036
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi32(from);
1037
+ }
1038
+ template <>
1039
+ EIGEN_STRONG_INLINE Packet8l ploadu<Packet8l>(const int64_t* from) {
1040
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi64(from);
704
1041
  }
705
1042
 
706
1043
  template <>
@@ -708,6 +1045,11 @@ EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from, uint16_t umas
708
1045
  __mmask16 mask = static_cast<__mmask16>(umask);
709
1046
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from);
710
1047
  }
1048
+ template <>
1049
+ EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from, uint8_t umask) {
1050
+ __mmask8 mask = static_cast<__mmask8>(umask);
1051
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_pd(mask, from);
1052
+ }
711
1053
 
712
1054
  // Loads 8 floats from memory a returns the packet
713
1055
  // {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
@@ -715,43 +1057,46 @@ template <>
715
1057
  EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
716
1058
  // an unaligned load is required here as there is no requirement
717
1059
  // on the alignment of input pointer 'from'
718
- __m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
1060
+ __m256i low_half = _mm256_castps_si256(_mm256_loadu_ps(from));
719
1061
  __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
720
1062
  __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
721
1063
  return pairs;
722
1064
  }
723
1065
 
724
- #ifdef EIGEN_VECTORIZE_AVX512DQ
725
- // FIXME: this does not look optimal, better load a Packet4d and shuffle...
726
- // Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3,
1066
+ // Loads 4 doubles from memory a returns the packet {a0, a0, a1, a1, a2, a2, a3,
727
1067
  // a3}
728
1068
  template <>
729
1069
  EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
730
- __m512d x = _mm512_setzero_pd();
731
- x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
732
- x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
733
- x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
734
- x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
735
- return x;
1070
+ Packet8d tmp = _mm512_castpd256_pd512(ploadu<Packet4d>(from));
1071
+ const Packet8l scatter_mask = _mm512_set_epi64(3, 3, 2, 2, 1, 1, 0, 0);
1072
+ return _mm512_permutexvar_pd(scatter_mask, tmp);
736
1073
  }
737
- #else
1074
+
1075
+ // Loads 4 int64_t from memory a returns the packet {a0, a0, a1, a1, a2, a2, a3,
1076
+ // a3}
738
1077
  template <>
739
- EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
740
- __m512d x = _mm512_setzero_pd();
741
- x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
742
- x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
743
- x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
744
- x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
745
- return x;
1078
+ EIGEN_STRONG_INLINE Packet8l ploaddup<Packet8l>(const int64_t* from) {
1079
+ Packet8l tmp = _mm512_castsi256_si512(ploadu<Packet4l>(from));
1080
+ const Packet8l scatter_mask = _mm512_set_epi64(3, 3, 2, 2, 1, 1, 0, 0);
1081
+ return _mm512_permutexvar_epi64(scatter_mask, tmp);
1082
+ }
1083
+
1084
+ // Loads 8 integers from memory and returns the packet
1085
+ // {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
1086
+ template <>
1087
+ EIGEN_STRONG_INLINE Packet16i ploaddup<Packet16i>(const int* from) {
1088
+ __m256i low_half = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
1089
+ __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
1090
+ __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
1091
+ return _mm512_castps_si512(pairs);
746
1092
  }
747
- #endif
748
1093
 
749
1094
  // Loads 4 floats from memory a returns the packet
750
1095
  // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
751
1096
  template <>
752
1097
  EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
753
1098
  Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
754
- const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
1099
+ const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
755
1100
  return _mm512_permutexvar_ps(scatter_mask, tmp);
756
1101
  }
757
1102
 
@@ -760,12 +1105,32 @@ EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
760
1105
  template <>
761
1106
  EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
762
1107
  __m256d lane0 = _mm256_set1_pd(*from);
763
- __m256d lane1 = _mm256_set1_pd(*(from+1));
1108
+ __m256d lane1 = _mm256_set1_pd(*(from + 1));
764
1109
  __m512d tmp = _mm512_undefined_pd();
765
1110
  tmp = _mm512_insertf64x4(tmp, lane0, 0);
766
1111
  return _mm512_insertf64x4(tmp, lane1, 1);
767
1112
  }
768
1113
 
1114
+ // Loads 2 int64_t from memory a returns the packet
1115
+ // {a0, a0 a0, a0, a1, a1, a1, a1}
1116
+ template <>
1117
+ EIGEN_STRONG_INLINE Packet8l ploadquad<Packet8l>(const int64_t* from) {
1118
+ __m256i lane0 = _mm256_set1_epi64x(*from);
1119
+ __m256i lane1 = _mm256_set1_epi64x(*(from + 1));
1120
+ __m512i tmp = _mm512_undefined_epi32();
1121
+ tmp = _mm512_inserti64x4(tmp, lane0, 0);
1122
+ return _mm512_inserti64x4(tmp, lane1, 1);
1123
+ }
1124
+
1125
+ // Loads 4 integers from memory and returns the packet
1126
+ // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
1127
+ template <>
1128
+ EIGEN_STRONG_INLINE Packet16i ploadquad<Packet16i>(const int* from) {
1129
+ Packet16i tmp = _mm512_castsi128_si512(ploadu<Packet4i>(from));
1130
+ const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
1131
+ return _mm512_permutexvar_epi32(scatter_mask, tmp);
1132
+ }
1133
+
769
1134
  template <>
770
1135
  EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
771
1136
  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
@@ -776,8 +1141,11 @@ EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
776
1141
  }
777
1142
  template <>
778
1143
  EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
779
- EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
780
- from);
1144
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_epi32(to, from);
1145
+ }
1146
+ template <>
1147
+ EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet8l& from) {
1148
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_epi64(to, from);
781
1149
  }
782
1150
 
783
1151
  template <>
@@ -790,54 +1158,128 @@ EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
790
1158
  }
791
1159
  template <>
792
1160
  EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
793
- EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
794
- reinterpret_cast<__m512i*>(to), from);
1161
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi32(to, from);
1162
+ }
1163
+ template <>
1164
+ EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet8l& from) {
1165
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi64(to, from);
795
1166
  }
796
1167
  template <>
797
1168
  EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
798
1169
  __mmask16 mask = static_cast<__mmask16>(umask);
799
1170
  EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from);
800
1171
  }
1172
+ template <>
1173
+ EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from, uint8_t umask) {
1174
+ __mmask8 mask = static_cast<__mmask8>(umask);
1175
+ EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_pd(to, mask, from);
1176
+ }
1177
+
1178
+ template <typename Scalar, typename Packet>
1179
+ EIGEN_DEVICE_FUNC inline Packet pgather(const Packet& src, const Scalar* from, Index stride,
1180
+ typename unpacket_traits<Packet>::mask_t umask);
1181
+ template <>
1182
+ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const Packet16f& src, const float* from, Index stride,
1183
+ uint16_t umask) {
1184
+ Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
1185
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1186
+ Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
1187
+ __mmask16 mask = static_cast<__mmask16>(umask);
1188
+
1189
+ return _mm512_mask_i32gather_ps(src, mask, indices, from, 4);
1190
+ }
1191
+ template <>
1192
+ EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const Packet8d& src, const double* from, Index stride,
1193
+ uint8_t umask) {
1194
+ Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
1195
+ Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1196
+ Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
1197
+ __mmask8 mask = static_cast<__mmask8>(umask);
1198
+
1199
+ return _mm512_mask_i32gather_pd(src, mask, indices, from, 8);
1200
+ }
801
1201
 
802
1202
  template <>
803
- EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
804
- Index stride) {
1203
+ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
805
1204
  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
806
- Packet16i stride_multiplier =
807
- _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1205
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
808
1206
  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
809
1207
 
810
1208
  return _mm512_i32gather_ps(indices, from, 4);
811
1209
  }
812
1210
  template <>
813
- EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
814
- Index stride) {
1211
+ EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from, Index stride) {
815
1212
  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
816
1213
  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
817
1214
  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
818
1215
 
819
1216
  return _mm512_i32gather_pd(indices, from, 8);
820
1217
  }
1218
+ template <>
1219
+ EIGEN_DEVICE_FUNC inline Packet8l pgather<int64_t, Packet8l>(const int64_t* from, Index stride) {
1220
+ Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
1221
+ Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1222
+ Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
1223
+
1224
+ return _mm512_i32gather_epi64(indices, from, 8);
1225
+ }
1226
+ template <>
1227
+ EIGEN_DEVICE_FUNC inline Packet16i pgather<int, Packet16i>(const int* from, Index stride) {
1228
+ Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
1229
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1230
+ Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
1231
+ return _mm512_i32gather_epi32(indices, from, 4);
1232
+ }
821
1233
 
1234
+ template <typename Scalar, typename Packet>
1235
+ EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index stride,
1236
+ typename unpacket_traits<Packet>::mask_t umask);
1237
+ template <>
1238
+ EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride,
1239
+ uint16_t umask) {
1240
+ Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
1241
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1242
+ Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
1243
+ __mmask16 mask = static_cast<__mmask16>(umask);
1244
+ _mm512_mask_i32scatter_ps(to, mask, indices, from, 4);
1245
+ }
1246
+ template <>
1247
+ EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packet8d& from, Index stride,
1248
+ uint8_t umask) {
1249
+ Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
1250
+ Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1251
+ Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
1252
+ __mmask8 mask = static_cast<__mmask8>(umask);
1253
+ _mm512_mask_i32scatter_pd(to, mask, indices, from, 8);
1254
+ }
822
1255
  template <>
823
- EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
824
- const Packet16f& from,
825
- Index stride) {
1256
+ EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
826
1257
  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
827
- Packet16i stride_multiplier =
828
- _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1258
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
829
1259
  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
830
1260
  _mm512_i32scatter_ps(to, indices, from, 4);
831
1261
  }
832
1262
  template <>
833
- EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
834
- const Packet8d& from,
835
- Index stride) {
1263
+ EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packet8d& from, Index stride) {
836
1264
  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
837
1265
  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
838
1266
  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
839
1267
  _mm512_i32scatter_pd(to, indices, from, 8);
840
1268
  }
1269
+ template <>
1270
+ EIGEN_DEVICE_FUNC inline void pscatter<int64_t, Packet8l>(int64_t* to, const Packet8l& from, Index stride) {
1271
+ Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
1272
+ Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1273
+ Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
1274
+ _mm512_i32scatter_epi64(to, indices, from, 8);
1275
+ }
1276
+ template <>
1277
+ EIGEN_DEVICE_FUNC inline void pscatter<int, Packet16i>(int* to, const Packet16i& from, Index stride) {
1278
+ Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
1279
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1280
+ Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
1281
+ _mm512_i32scatter_epi32(to, indices, from, 4);
1282
+ }
841
1283
 
842
1284
  template <>
843
1285
  EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
@@ -854,81 +1296,142 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
854
1296
  Packet16i pa = pset1<Packet16i>(a);
855
1297
  pstore(to, pa);
856
1298
  }
1299
+ template <>
1300
+ EIGEN_STRONG_INLINE void pstore1<Packet8l>(int64_t* to, const int64_t& a) {
1301
+ Packet8l pa = pset1<Packet8l>(a);
1302
+ pstore(to, pa);
1303
+ }
857
1304
 
858
- template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
859
- template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
860
- template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
1305
+ template <>
1306
+ EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1307
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1308
+ }
1309
+ template <>
1310
+ EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1311
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1312
+ }
1313
+ template <>
1314
+ EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
1315
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1316
+ }
861
1317
 
862
1318
  template <>
863
1319
  EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
864
- return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
1320
+ return _mm512_cvtss_f32(a);
865
1321
  }
866
1322
  template <>
867
1323
  EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
868
- return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0));
1324
+ return _mm512_cvtsd_f64(a);
1325
+ }
1326
+ template <>
1327
+ EIGEN_STRONG_INLINE int64_t pfirst<Packet8l>(const Packet8l& a) {
1328
+ int64_t x = _mm_extract_epi64_0(_mm512_extracti32x4_epi32(a, 0));
1329
+ return x;
869
1330
  }
870
1331
  template <>
871
1332
  EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
872
- return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
1333
+ #if EIGEN_GNUC_STRICT_LESS_THAN(11, 0, 0)
1334
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
1335
+ #else
1336
+ return _mm512_cvtsi512_si32(a);
1337
+ #endif
873
1338
  }
874
1339
 
875
- template<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a)
876
- {
1340
+ template <>
1341
+ EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
877
1342
  return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
878
1343
  }
879
1344
 
880
- template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
881
- {
1345
+ template <>
1346
+ EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) {
882
1347
  return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
883
1348
  }
884
1349
 
885
- template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
886
- {
1350
+ template <>
1351
+ EIGEN_STRONG_INLINE Packet16i preverse(const Packet16i& a) {
1352
+ return _mm512_permutexvar_epi32(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
1353
+ }
1354
+
1355
+ template <>
1356
+ EIGEN_STRONG_INLINE Packet8l preverse(const Packet8l& a) {
1357
+ return _mm512_permutexvar_epi64(_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7), a);
1358
+ }
1359
+
1360
+ template <>
1361
+ EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
887
1362
  // _mm512_abs_ps intrinsic not found, so hack around it
888
1363
  return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
889
1364
  }
890
1365
  template <>
891
1366
  EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
892
1367
  // _mm512_abs_ps intrinsic not found, so hack around it
893
- return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
894
- _mm512_set1_epi64(0x7fffffffffffffff)));
1368
+ return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), _mm512_set1_epi64(0x7fffffffffffffff)));
1369
+ }
1370
+ template <>
1371
+ EIGEN_STRONG_INLINE Packet16i pabs(const Packet16i& a) {
1372
+ return _mm512_abs_epi32(a);
1373
+ }
1374
+ template <>
1375
+ EIGEN_STRONG_INLINE Packet8l pabs(const Packet8l& a) {
1376
+ return _mm512_abs_epi64(a);
1377
+ }
1378
+
1379
+ #ifndef EIGEN_VECTORIZE_AVX512FP16
1380
+ template <>
1381
+ EIGEN_STRONG_INLINE Packet16h psignbit(const Packet16h& a) {
1382
+ return _mm256_srai_epi16(a, 15);
1383
+ }
1384
+ #endif // EIGEN_VECTORIZE_AVX512FP16
1385
+
1386
+ template <>
1387
+ EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) {
1388
+ return _mm256_srai_epi16(a, 15);
1389
+ }
1390
+ template <>
1391
+ EIGEN_STRONG_INLINE Packet16f psignbit(const Packet16f& a) {
1392
+ return _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(a), 31));
1393
+ }
1394
+ template <>
1395
+ EIGEN_STRONG_INLINE Packet8d psignbit(const Packet8d& a) {
1396
+ return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), 63));
895
1397
  }
896
1398
 
897
- template<>
898
- EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent){
1399
+ template <>
1400
+ EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent) {
899
1401
  return pfrexp_generic(a, exponent);
900
1402
  }
901
1403
 
902
1404
  // Extract exponent without existence of Packet8l.
903
- template<>
904
- EIGEN_STRONG_INLINE
905
- Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
906
- const Packet8d cst_exp_mask = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
907
- #ifdef EIGEN_VECTORIZE_AVX512DQ
1405
+ template <>
1406
+ EIGEN_STRONG_INLINE Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
1407
+ const Packet8d cst_exp_mask = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
1408
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
908
1409
  return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52));
909
- #else
1410
+ #else
910
1411
  return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)));
911
- #endif
1412
+ #endif
912
1413
  }
913
1414
 
914
- template<>
1415
+ template <>
915
1416
  EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
916
1417
  return pfrexp_generic(a, exponent);
917
1418
  }
918
1419
 
919
- template<> EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
1420
+ template <>
1421
+ EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
920
1422
  return pldexp_generic(a, exponent);
921
1423
  }
922
1424
 
923
- template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
1425
+ template <>
1426
+ EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
924
1427
  // Clamp exponent to [-2099, 2099]
925
1428
  const Packet8d max_exponent = pset1<Packet8d>(2099.0);
926
1429
  const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
927
-
1430
+
928
1431
  // Split 2^e into four factors and multiply.
929
1432
  const Packet8i bias = pset1<Packet8i>(1023);
930
1433
  Packet8i b = parithmetic_shift_right<2>(e); // floor(e/4)
931
-
1434
+
932
1435
  // 2^b
933
1436
  const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
934
1437
  Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
@@ -936,7 +1439,7 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, cons
936
1439
  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
937
1440
  Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
938
1441
  Packet8d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
939
-
1442
+
940
1443
  // 2^(e - 3b)
941
1444
  b = psub(psub(psub(e, b), b), b); // e - 3b
942
1445
  hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
@@ -949,57 +1452,49 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, cons
949
1452
 
950
1453
  #ifdef EIGEN_VECTORIZE_AVX512DQ
951
1454
  // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
952
- #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
953
- __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
1455
+ #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
1456
+ __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
954
1457
  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
1458
+
1459
+ // AVX512F does not define _mm512_extracti32x8_epi32 to extract _m256i from _m512i
1460
+ #define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \
1461
+ __m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0); \
1462
+ __m256i OUTPUT##_1 = _mm512_extracti32x8_epi32(INPUT, 1)
955
1463
  #else
956
- #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
957
- __m256 OUTPUT##_0 = _mm256_insertf128_ps( \
958
- _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
959
- _mm512_extractf32x4_ps(INPUT, 1), 1); \
960
- __m256 OUTPUT##_1 = _mm256_insertf128_ps( \
961
- _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
962
- _mm512_extractf32x4_ps(INPUT, 3), 1);
1464
+ #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
1465
+ __m256 OUTPUT##_0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
1466
+ _mm512_extractf32x4_ps(INPUT, 1), 1); \
1467
+ __m256 OUTPUT##_1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
1468
+ _mm512_extractf32x4_ps(INPUT, 3), 1)
1469
+
1470
+ #define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \
1471
+ __m256i OUTPUT##_0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 0)), \
1472
+ _mm512_extracti32x4_epi32(INPUT, 1), 1); \
1473
+ __m256i OUTPUT##_1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 2)), \
1474
+ _mm512_extracti32x4_epi32(INPUT, 3), 1)
963
1475
  #endif
964
1476
 
965
1477
  #ifdef EIGEN_VECTORIZE_AVX512DQ
966
1478
  #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
967
1479
  OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
1480
+
1481
+ #define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \
1482
+ OUTPUT = _mm512_inserti32x8(_mm512_castsi256_si512(INPUTA), INPUTB, 1);
968
1483
  #else
969
1484
  #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
970
1485
  OUTPUT = _mm512_undefined_ps(); \
971
1486
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
972
1487
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
973
- OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
974
- OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
975
- #endif
976
-
977
- template <>
978
- EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
979
- #ifdef EIGEN_VECTORIZE_AVX512DQ
980
- __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
981
- __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
982
- Packet8f x = _mm256_add_ps(lane0, lane1);
983
- return predux<Packet8f>(x);
984
- #else
985
- __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
986
- __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
987
- __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
988
- __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
989
- __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
990
- sum = _mm_hadd_ps(sum, sum);
991
- sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
992
- return _mm_cvtss_f32(sum);
993
- #endif
994
- }
995
- template <>
996
- EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
997
- __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
998
- __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
999
- __m256d sum = _mm256_add_pd(lane0, lane1);
1000
- __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
1001
- return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
1002
- }
1488
+ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
1489
+ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
1490
+
1491
+ #define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \
1492
+ OUTPUT = _mm512_undefined_epi32(); \
1493
+ OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 0), 0); \
1494
+ OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 1), 1); \
1495
+ OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 0), 2); \
1496
+ OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 1), 3);
1497
+ #endif
1003
1498
 
1004
1499
  template <>
1005
1500
  EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
@@ -1023,84 +1518,30 @@ EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
1023
1518
  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
1024
1519
  return _mm256_add_pd(lane0, lane1);
1025
1520
  }
1026
-
1027
1521
  template <>
1028
- EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
1029
- //#ifdef EIGEN_VECTORIZE_AVX512DQ
1030
- #if 0
1031
- Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
1032
- Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
1033
- Packet8f res = pmul(lane0, lane1);
1034
- res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
1035
- res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1036
- return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1522
+ EIGEN_STRONG_INLINE Packet8i predux_half_dowto4<Packet16i>(const Packet16i& a) {
1523
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
1524
+ __m256i lane0 = _mm512_extracti32x8_epi32(a, 0);
1525
+ __m256i lane1 = _mm512_extracti32x8_epi32(a, 1);
1526
+ return _mm256_add_epi32(lane0, lane1);
1037
1527
  #else
1038
- __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
1039
- __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
1040
- __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
1041
- __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
1042
- __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
1043
- res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1044
- return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1528
+ __m128i lane0 = _mm512_extracti32x4_epi32(a, 0);
1529
+ __m128i lane1 = _mm512_extracti32x4_epi32(a, 1);
1530
+ __m128i lane2 = _mm512_extracti32x4_epi32(a, 2);
1531
+ __m128i lane3 = _mm512_extracti32x4_epi32(a, 3);
1532
+ __m128i sum0 = _mm_add_epi32(lane0, lane2);
1533
+ __m128i sum1 = _mm_add_epi32(lane1, lane3);
1534
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(sum0), sum1, 1);
1045
1535
  #endif
1046
1536
  }
1047
- template <>
1048
- EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
1049
- __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
1050
- __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
1051
- __m256d res = pmul(lane0, lane1);
1052
- res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
1053
- return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
1054
- }
1055
-
1056
- template <>
1057
- EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
1058
- __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
1059
- __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
1060
- __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
1061
- __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
1062
- __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
1063
- res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1064
- return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1065
- }
1066
- template <>
1067
- EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
1068
- __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
1069
- __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
1070
- __m256d res = _mm256_min_pd(lane0, lane1);
1071
- res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
1072
- return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
1073
- }
1074
-
1075
- template <>
1076
- EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
1077
- __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
1078
- __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
1079
- __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
1080
- __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
1081
- __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
1082
- res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1083
- return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1084
- }
1085
1537
 
1086
1538
  template <>
1087
- EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
1088
- __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
1089
- __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
1090
- __m256d res = _mm256_max_pd(lane0, lane1);
1091
- res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
1092
- return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
1093
- }
1094
-
1095
- template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
1096
- {
1097
- Packet16i xi = _mm512_castps_si512(x);
1098
- __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
1099
- return !_mm512_kortestz(tmp,tmp);
1539
+ EIGEN_STRONG_INLINE Packet4l predux_half_dowto4<Packet8l>(const Packet8l& a) {
1540
+ __m256i lane0 = _mm512_extracti64x4_epi64(a, 0);
1541
+ __m256i lane1 = _mm512_extracti64x4_epi64(a, 1);
1542
+ return _mm256_add_epi64(lane0, lane1);
1100
1543
  }
1101
1544
 
1102
-
1103
-
1104
1545
  #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
1105
1546
  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
1106
1547
 
@@ -1215,9 +1656,46 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
1215
1656
  PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
1216
1657
  PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
1217
1658
  }
1218
- #define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \
1219
- EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
1220
- INPUT[2 * INDEX + STRIDE]);
1659
+ #define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \
1660
+ EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);
1661
+
1662
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 8>& kernel) {
1663
+ __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
1664
+ __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
1665
+ __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
1666
+ __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
1667
+ __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
1668
+ __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
1669
+ __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
1670
+ __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
1671
+
1672
+ kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
1673
+ kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
1674
+ kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
1675
+ kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
1676
+ kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
1677
+ kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
1678
+ kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
1679
+ kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
1680
+
1681
+ T0 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0x44);
1682
+ T1 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0xee);
1683
+ T2 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0x44);
1684
+ T3 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0xee);
1685
+ T4 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0x44);
1686
+ T5 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0xee);
1687
+ T6 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0x44);
1688
+ T7 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0xee);
1689
+
1690
+ kernel.packet[0] = _mm512_shuffle_f32x4(T0, T2, 0x88);
1691
+ kernel.packet[2] = _mm512_shuffle_f32x4(T0, T2, 0xdd);
1692
+ kernel.packet[1] = _mm512_shuffle_f32x4(T4, T6, 0x88);
1693
+ kernel.packet[3] = _mm512_shuffle_f32x4(T4, T6, 0xdd);
1694
+ kernel.packet[4] = _mm512_shuffle_f32x4(T1, T3, 0x88);
1695
+ kernel.packet[6] = _mm512_shuffle_f32x4(T1, T3, 0xdd);
1696
+ kernel.packet[5] = _mm512_shuffle_f32x4(T5, T7, 0x88);
1697
+ kernel.packet[7] = _mm512_shuffle_f32x4(T5, T7, 0xdd);
1698
+ }
1221
1699
 
1222
1700
  EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
1223
1701
  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
@@ -1259,8 +1737,11 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
1259
1737
 
1260
1738
  #define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE) \
1261
1739
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
1262
- OUTPUT[INDEX] = \
1263
- _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
1740
+ OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
1741
+
1742
+ #define PACK_OUTPUT_L(OUTPUT, INPUT, INDEX, STRIDE) \
1743
+ OUTPUT[INDEX] = _mm512_inserti64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
1744
+ OUTPUT[INDEX] = _mm512_inserti64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
1264
1745
 
1265
1746
  EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
1266
1747
  __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
@@ -1270,23 +1751,15 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
1270
1751
 
1271
1752
  PacketBlock<Packet4d, 8> tmp;
1272
1753
 
1273
- tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1274
- _mm512_extractf64x4_pd(T2, 0), 0x20);
1275
- tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1276
- _mm512_extractf64x4_pd(T3, 0), 0x20);
1277
- tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1278
- _mm512_extractf64x4_pd(T2, 0), 0x31);
1279
- tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1280
- _mm512_extractf64x4_pd(T3, 0), 0x31);
1281
-
1282
- tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1283
- _mm512_extractf64x4_pd(T2, 1), 0x20);
1284
- tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1285
- _mm512_extractf64x4_pd(T3, 1), 0x20);
1286
- tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1287
- _mm512_extractf64x4_pd(T2, 1), 0x31);
1288
- tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1289
- _mm512_extractf64x4_pd(T3, 1), 0x31);
1754
+ tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x20);
1755
+ tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x20);
1756
+ tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x31);
1757
+ tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x31);
1758
+
1759
+ tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x20);
1760
+ tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x20);
1761
+ tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x31);
1762
+ tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x31);
1290
1763
 
1291
1764
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
1292
1765
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
@@ -1304,107 +1777,347 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
1304
1777
  __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
1305
1778
  __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);
1306
1779
 
1307
- PacketBlock<Packet4d, 16> tmp;
1308
-
1309
- tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1310
- _mm512_extractf64x4_pd(T2, 0), 0x20);
1311
- tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1312
- _mm512_extractf64x4_pd(T3, 0), 0x20);
1313
- tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1314
- _mm512_extractf64x4_pd(T2, 0), 0x31);
1315
- tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1316
- _mm512_extractf64x4_pd(T3, 0), 0x31);
1317
-
1318
- tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1319
- _mm512_extractf64x4_pd(T2, 1), 0x20);
1320
- tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1321
- _mm512_extractf64x4_pd(T3, 1), 0x20);
1322
- tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1323
- _mm512_extractf64x4_pd(T2, 1), 0x31);
1324
- tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1325
- _mm512_extractf64x4_pd(T3, 1), 0x31);
1326
-
1327
- tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
1328
- _mm512_extractf64x4_pd(T6, 0), 0x20);
1329
- tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
1330
- _mm512_extractf64x4_pd(T7, 0), 0x20);
1331
- tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
1332
- _mm512_extractf64x4_pd(T6, 0), 0x31);
1333
- tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
1334
- _mm512_extractf64x4_pd(T7, 0), 0x31);
1335
-
1336
- tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
1337
- _mm512_extractf64x4_pd(T6, 1), 0x20);
1338
- tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
1339
- _mm512_extractf64x4_pd(T7, 1), 0x20);
1340
- tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
1341
- _mm512_extractf64x4_pd(T6, 1), 0x31);
1342
- tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
1343
- _mm512_extractf64x4_pd(T7, 1), 0x31);
1344
-
1345
- PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8);
1346
- PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8);
1347
- PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8);
1348
- PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8);
1349
-
1350
- PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8);
1351
- PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8);
1352
- PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8);
1353
- PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8);
1354
- }
1355
- template <>
1356
- EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
1357
- const Packet16f& /*thenPacket*/,
1358
- const Packet16f& /*elsePacket*/) {
1359
- assert(false && "To be implemented");
1360
- return Packet16f();
1361
- }
1362
- template <>
1363
- EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
1364
- const Packet8d& thenPacket,
1780
+ kernel.packet[0] = _mm512_permutex_pd(T2, 0x4E);
1781
+ kernel.packet[0] = _mm512_mask_blend_pd(0xCC, T0, kernel.packet[0]);
1782
+ kernel.packet[2] = _mm512_permutex_pd(T0, 0x4E);
1783
+ kernel.packet[2] = _mm512_mask_blend_pd(0xCC, kernel.packet[2], T2);
1784
+ kernel.packet[1] = _mm512_permutex_pd(T3, 0x4E);
1785
+ kernel.packet[1] = _mm512_mask_blend_pd(0xCC, T1, kernel.packet[1]);
1786
+ kernel.packet[3] = _mm512_permutex_pd(T1, 0x4E);
1787
+ kernel.packet[3] = _mm512_mask_blend_pd(0xCC, kernel.packet[3], T3);
1788
+ kernel.packet[4] = _mm512_permutex_pd(T6, 0x4E);
1789
+ kernel.packet[4] = _mm512_mask_blend_pd(0xCC, T4, kernel.packet[4]);
1790
+ kernel.packet[6] = _mm512_permutex_pd(T4, 0x4E);
1791
+ kernel.packet[6] = _mm512_mask_blend_pd(0xCC, kernel.packet[6], T6);
1792
+ kernel.packet[5] = _mm512_permutex_pd(T7, 0x4E);
1793
+ kernel.packet[5] = _mm512_mask_blend_pd(0xCC, T5, kernel.packet[5]);
1794
+ kernel.packet[7] = _mm512_permutex_pd(T5, 0x4E);
1795
+ kernel.packet[7] = _mm512_mask_blend_pd(0xCC, kernel.packet[7], T7);
1796
+
1797
+ T0 = _mm512_shuffle_f64x2(kernel.packet[4], kernel.packet[4], 0x4E);
1798
+ T0 = _mm512_mask_blend_pd(0xF0, kernel.packet[0], T0);
1799
+ T4 = _mm512_shuffle_f64x2(kernel.packet[0], kernel.packet[0], 0x4E);
1800
+ T4 = _mm512_mask_blend_pd(0xF0, T4, kernel.packet[4]);
1801
+ T1 = _mm512_shuffle_f64x2(kernel.packet[5], kernel.packet[5], 0x4E);
1802
+ T1 = _mm512_mask_blend_pd(0xF0, kernel.packet[1], T1);
1803
+ T5 = _mm512_shuffle_f64x2(kernel.packet[1], kernel.packet[1], 0x4E);
1804
+ T5 = _mm512_mask_blend_pd(0xF0, T5, kernel.packet[5]);
1805
+ T2 = _mm512_shuffle_f64x2(kernel.packet[6], kernel.packet[6], 0x4E);
1806
+ T2 = _mm512_mask_blend_pd(0xF0, kernel.packet[2], T2);
1807
+ T6 = _mm512_shuffle_f64x2(kernel.packet[2], kernel.packet[2], 0x4E);
1808
+ T6 = _mm512_mask_blend_pd(0xF0, T6, kernel.packet[6]);
1809
+ T3 = _mm512_shuffle_f64x2(kernel.packet[7], kernel.packet[7], 0x4E);
1810
+ T3 = _mm512_mask_blend_pd(0xF0, kernel.packet[3], T3);
1811
+ T7 = _mm512_shuffle_f64x2(kernel.packet[3], kernel.packet[3], 0x4E);
1812
+ T7 = _mm512_mask_blend_pd(0xF0, T7, kernel.packet[7]);
1813
+
1814
+ kernel.packet[0] = T0;
1815
+ kernel.packet[1] = T1;
1816
+ kernel.packet[2] = T2;
1817
+ kernel.packet[3] = T3;
1818
+ kernel.packet[4] = T4;
1819
+ kernel.packet[5] = T5;
1820
+ kernel.packet[6] = T6;
1821
+ kernel.packet[7] = T7;
1822
+ }
1823
+
1824
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8l, 4>& kernel) {
1825
+ __m512i T0 = _mm512_castpd_si512(
1826
+ _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[0]), _mm512_castsi512_pd(kernel.packet[1]), 0));
1827
+ __m512i T1 = _mm512_castpd_si512(
1828
+ _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[0]), _mm512_castsi512_pd(kernel.packet[1]), 0xff));
1829
+ __m512i T2 = _mm512_castpd_si512(
1830
+ _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[2]), _mm512_castsi512_pd(kernel.packet[3]), 0));
1831
+ __m512i T3 = _mm512_castpd_si512(
1832
+ _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[2]), _mm512_castsi512_pd(kernel.packet[3]), 0xff));
1833
+
1834
+ PacketBlock<Packet4l, 8> tmp;
1835
+
1836
+ tmp.packet[0] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 0), _mm512_extracti64x4_epi64(T2, 0), 0x20);
1837
+ tmp.packet[1] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 0), _mm512_extracti64x4_epi64(T3, 0), 0x20);
1838
+ tmp.packet[2] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 0), _mm512_extracti64x4_epi64(T2, 0), 0x31);
1839
+ tmp.packet[3] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 0), _mm512_extracti64x4_epi64(T3, 0), 0x31);
1840
+
1841
+ tmp.packet[4] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 1), _mm512_extracti64x4_epi64(T2, 1), 0x20);
1842
+ tmp.packet[5] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 1), _mm512_extracti64x4_epi64(T3, 1), 0x20);
1843
+ tmp.packet[6] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 1), _mm512_extracti64x4_epi64(T2, 1), 0x31);
1844
+ tmp.packet[7] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 1), _mm512_extracti64x4_epi64(T3, 1), 0x31);
1845
+
1846
+ PACK_OUTPUT_L(kernel.packet, tmp.packet, 0, 1);
1847
+ PACK_OUTPUT_L(kernel.packet, tmp.packet, 1, 1);
1848
+ PACK_OUTPUT_L(kernel.packet, tmp.packet, 2, 1);
1849
+ PACK_OUTPUT_L(kernel.packet, tmp.packet, 3, 1);
1850
+ }
1851
+
1852
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8l, 8>& kernel) {
1853
+ __m512i T0 = _mm512_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
1854
+ __m512i T1 = _mm512_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
1855
+ __m512i T2 = _mm512_unpacklo_epi64(kernel.packet[2], kernel.packet[3]);
1856
+ __m512i T3 = _mm512_unpackhi_epi64(kernel.packet[2], kernel.packet[3]);
1857
+ __m512i T4 = _mm512_unpacklo_epi64(kernel.packet[4], kernel.packet[5]);
1858
+ __m512i T5 = _mm512_unpackhi_epi64(kernel.packet[4], kernel.packet[5]);
1859
+ __m512i T6 = _mm512_unpacklo_epi64(kernel.packet[6], kernel.packet[7]);
1860
+ __m512i T7 = _mm512_unpackhi_epi64(kernel.packet[6], kernel.packet[7]);
1861
+
1862
+ kernel.packet[0] = _mm512_permutex_epi64(T2, 0x4E);
1863
+ kernel.packet[0] = _mm512_mask_blend_epi64(0xCC, T0, kernel.packet[0]);
1864
+ kernel.packet[2] = _mm512_permutex_epi64(T0, 0x4E);
1865
+ kernel.packet[2] = _mm512_mask_blend_epi64(0xCC, kernel.packet[2], T2);
1866
+ kernel.packet[1] = _mm512_permutex_epi64(T3, 0x4E);
1867
+ kernel.packet[1] = _mm512_mask_blend_epi64(0xCC, T1, kernel.packet[1]);
1868
+ kernel.packet[3] = _mm512_permutex_epi64(T1, 0x4E);
1869
+ kernel.packet[3] = _mm512_mask_blend_epi64(0xCC, kernel.packet[3], T3);
1870
+ kernel.packet[4] = _mm512_permutex_epi64(T6, 0x4E);
1871
+ kernel.packet[4] = _mm512_mask_blend_epi64(0xCC, T4, kernel.packet[4]);
1872
+ kernel.packet[6] = _mm512_permutex_epi64(T4, 0x4E);
1873
+ kernel.packet[6] = _mm512_mask_blend_epi64(0xCC, kernel.packet[6], T6);
1874
+ kernel.packet[5] = _mm512_permutex_epi64(T7, 0x4E);
1875
+ kernel.packet[5] = _mm512_mask_blend_epi64(0xCC, T5, kernel.packet[5]);
1876
+ kernel.packet[7] = _mm512_permutex_epi64(T5, 0x4E);
1877
+ kernel.packet[7] = _mm512_mask_blend_epi64(0xCC, kernel.packet[7], T7);
1878
+
1879
+ T0 = _mm512_shuffle_i64x2(kernel.packet[4], kernel.packet[4], 0x4E);
1880
+ T0 = _mm512_mask_blend_epi64(0xF0, kernel.packet[0], T0);
1881
+ T4 = _mm512_shuffle_i64x2(kernel.packet[0], kernel.packet[0], 0x4E);
1882
+ T4 = _mm512_mask_blend_epi64(0xF0, T4, kernel.packet[4]);
1883
+ T1 = _mm512_shuffle_i64x2(kernel.packet[5], kernel.packet[5], 0x4E);
1884
+ T1 = _mm512_mask_blend_epi64(0xF0, kernel.packet[1], T1);
1885
+ T5 = _mm512_shuffle_i64x2(kernel.packet[1], kernel.packet[1], 0x4E);
1886
+ T5 = _mm512_mask_blend_epi64(0xF0, T5, kernel.packet[5]);
1887
+ T2 = _mm512_shuffle_i64x2(kernel.packet[6], kernel.packet[6], 0x4E);
1888
+ T2 = _mm512_mask_blend_epi64(0xF0, kernel.packet[2], T2);
1889
+ T6 = _mm512_shuffle_i64x2(kernel.packet[2], kernel.packet[2], 0x4E);
1890
+ T6 = _mm512_mask_blend_epi64(0xF0, T6, kernel.packet[6]);
1891
+ T3 = _mm512_shuffle_i64x2(kernel.packet[7], kernel.packet[7], 0x4E);
1892
+ T3 = _mm512_mask_blend_epi64(0xF0, kernel.packet[3], T3);
1893
+ T7 = _mm512_shuffle_i64x2(kernel.packet[3], kernel.packet[3], 0x4E);
1894
+ T7 = _mm512_mask_blend_epi64(0xF0, T7, kernel.packet[7]);
1895
+
1896
+ kernel.packet[0] = T0;
1897
+ kernel.packet[1] = T1;
1898
+ kernel.packet[2] = T2;
1899
+ kernel.packet[3] = T3;
1900
+ kernel.packet[4] = T4;
1901
+ kernel.packet[5] = T5;
1902
+ kernel.packet[6] = T6;
1903
+ kernel.packet[7] = T7;
1904
+ }
1905
+
1906
+ #define PACK_OUTPUT_I32(OUTPUT, INPUT, INDEX, STRIDE) \
1907
+ EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
1908
+
1909
+ #define PACK_OUTPUT_I32_2(OUTPUT, INPUT, INDEX, STRIDE) \
1910
+ EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);
1911
+
1912
+ #define SHUFFLE_EPI32(A, B, M) _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(A), _mm512_castsi512_ps(B), M))
1913
+
1914
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 16>& kernel) {
1915
+ __m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
1916
+ __m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
1917
+ __m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
1918
+ __m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
1919
+ __m512i T4 = _mm512_unpacklo_epi32(kernel.packet[4], kernel.packet[5]);
1920
+ __m512i T5 = _mm512_unpackhi_epi32(kernel.packet[4], kernel.packet[5]);
1921
+ __m512i T6 = _mm512_unpacklo_epi32(kernel.packet[6], kernel.packet[7]);
1922
+ __m512i T7 = _mm512_unpackhi_epi32(kernel.packet[6], kernel.packet[7]);
1923
+ __m512i T8 = _mm512_unpacklo_epi32(kernel.packet[8], kernel.packet[9]);
1924
+ __m512i T9 = _mm512_unpackhi_epi32(kernel.packet[8], kernel.packet[9]);
1925
+ __m512i T10 = _mm512_unpacklo_epi32(kernel.packet[10], kernel.packet[11]);
1926
+ __m512i T11 = _mm512_unpackhi_epi32(kernel.packet[10], kernel.packet[11]);
1927
+ __m512i T12 = _mm512_unpacklo_epi32(kernel.packet[12], kernel.packet[13]);
1928
+ __m512i T13 = _mm512_unpackhi_epi32(kernel.packet[12], kernel.packet[13]);
1929
+ __m512i T14 = _mm512_unpacklo_epi32(kernel.packet[14], kernel.packet[15]);
1930
+ __m512i T15 = _mm512_unpackhi_epi32(kernel.packet[14], kernel.packet[15]);
1931
+ __m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1932
+ __m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1933
+ __m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1934
+ __m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1935
+ __m512i S4 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
1936
+ __m512i S5 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
1937
+ __m512i S6 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
1938
+ __m512i S7 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
1939
+ __m512i S8 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
1940
+ __m512i S9 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
1941
+ __m512i S10 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
1942
+ __m512i S11 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
1943
+ __m512i S12 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
1944
+ __m512i S13 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
1945
+ __m512i S14 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
1946
+ __m512i S15 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
1947
+
1948
+ EIGEN_EXTRACT_8i_FROM_16i(S0, S0);
1949
+ EIGEN_EXTRACT_8i_FROM_16i(S1, S1);
1950
+ EIGEN_EXTRACT_8i_FROM_16i(S2, S2);
1951
+ EIGEN_EXTRACT_8i_FROM_16i(S3, S3);
1952
+ EIGEN_EXTRACT_8i_FROM_16i(S4, S4);
1953
+ EIGEN_EXTRACT_8i_FROM_16i(S5, S5);
1954
+ EIGEN_EXTRACT_8i_FROM_16i(S6, S6);
1955
+ EIGEN_EXTRACT_8i_FROM_16i(S7, S7);
1956
+ EIGEN_EXTRACT_8i_FROM_16i(S8, S8);
1957
+ EIGEN_EXTRACT_8i_FROM_16i(S9, S9);
1958
+ EIGEN_EXTRACT_8i_FROM_16i(S10, S10);
1959
+ EIGEN_EXTRACT_8i_FROM_16i(S11, S11);
1960
+ EIGEN_EXTRACT_8i_FROM_16i(S12, S12);
1961
+ EIGEN_EXTRACT_8i_FROM_16i(S13, S13);
1962
+ EIGEN_EXTRACT_8i_FROM_16i(S14, S14);
1963
+ EIGEN_EXTRACT_8i_FROM_16i(S15, S15);
1964
+
1965
+ PacketBlock<Packet8i, 32> tmp;
1966
+
1967
+ tmp.packet[0] = _mm256_permute2f128_si256(S0_0, S4_0, 0x20);
1968
+ tmp.packet[1] = _mm256_permute2f128_si256(S1_0, S5_0, 0x20);
1969
+ tmp.packet[2] = _mm256_permute2f128_si256(S2_0, S6_0, 0x20);
1970
+ tmp.packet[3] = _mm256_permute2f128_si256(S3_0, S7_0, 0x20);
1971
+ tmp.packet[4] = _mm256_permute2f128_si256(S0_0, S4_0, 0x31);
1972
+ tmp.packet[5] = _mm256_permute2f128_si256(S1_0, S5_0, 0x31);
1973
+ tmp.packet[6] = _mm256_permute2f128_si256(S2_0, S6_0, 0x31);
1974
+ tmp.packet[7] = _mm256_permute2f128_si256(S3_0, S7_0, 0x31);
1975
+
1976
+ tmp.packet[8] = _mm256_permute2f128_si256(S0_1, S4_1, 0x20);
1977
+ tmp.packet[9] = _mm256_permute2f128_si256(S1_1, S5_1, 0x20);
1978
+ tmp.packet[10] = _mm256_permute2f128_si256(S2_1, S6_1, 0x20);
1979
+ tmp.packet[11] = _mm256_permute2f128_si256(S3_1, S7_1, 0x20);
1980
+ tmp.packet[12] = _mm256_permute2f128_si256(S0_1, S4_1, 0x31);
1981
+ tmp.packet[13] = _mm256_permute2f128_si256(S1_1, S5_1, 0x31);
1982
+ tmp.packet[14] = _mm256_permute2f128_si256(S2_1, S6_1, 0x31);
1983
+ tmp.packet[15] = _mm256_permute2f128_si256(S3_1, S7_1, 0x31);
1984
+
1985
+ // Second set of _m256 outputs
1986
+ tmp.packet[16] = _mm256_permute2f128_si256(S8_0, S12_0, 0x20);
1987
+ tmp.packet[17] = _mm256_permute2f128_si256(S9_0, S13_0, 0x20);
1988
+ tmp.packet[18] = _mm256_permute2f128_si256(S10_0, S14_0, 0x20);
1989
+ tmp.packet[19] = _mm256_permute2f128_si256(S11_0, S15_0, 0x20);
1990
+ tmp.packet[20] = _mm256_permute2f128_si256(S8_0, S12_0, 0x31);
1991
+ tmp.packet[21] = _mm256_permute2f128_si256(S9_0, S13_0, 0x31);
1992
+ tmp.packet[22] = _mm256_permute2f128_si256(S10_0, S14_0, 0x31);
1993
+ tmp.packet[23] = _mm256_permute2f128_si256(S11_0, S15_0, 0x31);
1994
+
1995
+ tmp.packet[24] = _mm256_permute2f128_si256(S8_1, S12_1, 0x20);
1996
+ tmp.packet[25] = _mm256_permute2f128_si256(S9_1, S13_1, 0x20);
1997
+ tmp.packet[26] = _mm256_permute2f128_si256(S10_1, S14_1, 0x20);
1998
+ tmp.packet[27] = _mm256_permute2f128_si256(S11_1, S15_1, 0x20);
1999
+ tmp.packet[28] = _mm256_permute2f128_si256(S8_1, S12_1, 0x31);
2000
+ tmp.packet[29] = _mm256_permute2f128_si256(S9_1, S13_1, 0x31);
2001
+ tmp.packet[30] = _mm256_permute2f128_si256(S10_1, S14_1, 0x31);
2002
+ tmp.packet[31] = _mm256_permute2f128_si256(S11_1, S15_1, 0x31);
2003
+
2004
+ // Pack them into the output
2005
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 0, 16);
2006
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 1, 16);
2007
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 2, 16);
2008
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 3, 16);
2009
+
2010
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 4, 16);
2011
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 5, 16);
2012
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 6, 16);
2013
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 7, 16);
2014
+
2015
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 8, 16);
2016
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 9, 16);
2017
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 10, 16);
2018
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 11, 16);
2019
+
2020
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 12, 16);
2021
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 13, 16);
2022
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 14, 16);
2023
+ PACK_OUTPUT_I32(kernel.packet, tmp.packet, 15, 16);
2024
+ }
2025
+
2026
// ptranspose overload for a block of four Packet16i packets; the result is written back into
// kernel.packet[0..3]. The work is done in four stages, annotated below.
// NOTE(review): SHUFFLE_EPI32, EIGEN_EXTRACT_8i_FROM_16i and PACK_OUTPUT_I32_2 are macros defined
// outside this view; the stage descriptions for them are inferred from how their results are used.
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 4>& kernel) {
2027
// Stage 1: interleave the 32-bit elements of packets (0,1) and (2,3), low and high halves.
+ __m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
2028
+ __m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
2029
+ __m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
2030
+ __m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
2031
+
2032
// Stage 2: combine element pairs from the (0,1) and (2,3) interleaves.
+ __m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
2033
+ __m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
2034
+ __m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
2035
+ __m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
2036
+
2037
// Stage 3: presumably declares S<k>_0 / S<k>_1 as the two 256-bit halves of S<k> (TODO confirm).
+ EIGEN_EXTRACT_8i_FROM_16i(S0, S0);
2038
+ EIGEN_EXTRACT_8i_FROM_16i(S1, S1);
2039
+ EIGEN_EXTRACT_8i_FROM_16i(S2, S2);
2040
+ EIGEN_EXTRACT_8i_FROM_16i(S3, S3);
2041
+
2042
+ PacketBlock<Packet8i, 8> tmp;
2043
+
2044
// Stage 4: immediate 0x20 pairs the low 128-bit lanes of both operands, 0x31 the high lanes.
+ tmp.packet[0] = _mm256_permute2f128_si256(S0_0, S1_0, 0x20);
2045
+ tmp.packet[1] = _mm256_permute2f128_si256(S2_0, S3_0, 0x20);
2046
+ tmp.packet[2] = _mm256_permute2f128_si256(S0_0, S1_0, 0x31);
2047
+ tmp.packet[3] = _mm256_permute2f128_si256(S2_0, S3_0, 0x31);
2048
+
2049
+ tmp.packet[4] = _mm256_permute2f128_si256(S0_1, S1_1, 0x20);
2050
+ tmp.packet[5] = _mm256_permute2f128_si256(S2_1, S3_1, 0x20);
2051
+ tmp.packet[6] = _mm256_permute2f128_si256(S0_1, S1_1, 0x31);
2052
+ tmp.packet[7] = _mm256_permute2f128_si256(S2_1, S3_1, 0x31);
2053
+
2054
// Reassemble the eight 256-bit pieces into the four 512-bit output packets.
+ PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 0, 1);
2055
+ PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 1, 1);
2056
+ PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 2, 1);
2057
+ PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 3, 1);
2058
+ }
2059
+
2060
+ template <size_t N>
2061
+ EIGEN_STRONG_INLINE int avx512_blend_mask(const Selector<N>& ifPacket) {
2062
+ alignas(__m128i) uint8_t aux[sizeof(__m128i)];
2063
+ for (size_t i = 0; i < N; i++) aux[i] = static_cast<uint8_t>(ifPacket.select[i]);
2064
+ __m128i paux = _mm_sub_epi8(_mm_setzero_si128(), _mm_load_si128(reinterpret_cast<const __m128i*>(aux)));
2065
+ return _mm_movemask_epi8(paux);
2066
+ }
2067
+
2068
+ template <>
2069
+ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket, const Packet16f& thenPacket,
2070
+ const Packet16f& elsePacket) {
2071
+ __mmask16 m = avx512_blend_mask(ifPacket);
2072
+ return _mm512_mask_blend_ps(m, elsePacket, thenPacket);
2073
+ }
2074
+ template <>
2075
+ EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, const Packet8d& thenPacket,
1365
2076
  const Packet8d& elsePacket) {
1366
- __mmask8 m = (ifPacket.select[0] )
1367
- | (ifPacket.select[1]<<1)
1368
- | (ifPacket.select[2]<<2)
1369
- | (ifPacket.select[3]<<3)
1370
- | (ifPacket.select[4]<<4)
1371
- | (ifPacket.select[5]<<5)
1372
- | (ifPacket.select[6]<<6)
1373
- | (ifPacket.select[7]<<7);
2077
+ __mmask8 m = avx512_blend_mask(ifPacket);
1374
2078
  return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
1375
2079
  }
1376
2080
 
1377
2081
  // Packet math for Eigen::half
1378
- template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
2082
+ #ifndef EIGEN_VECTORIZE_AVX512FP16
2083
+ template <>
2084
+ EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
1379
2085
  return _mm256_set1_epi16(from.x);
1380
2086
  }
1381
2087
 
1382
- template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
2088
+ template <>
2089
+ EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
1383
2090
  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from, 0)));
1384
2091
  }
1385
2092
 
1386
- template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
2093
+ template <>
2094
+ EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
1387
2095
  return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
1388
2096
  }
1389
2097
 
1390
- template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
2098
+ template <>
2099
+ EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
1391
2100
  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
1392
2101
  }
1393
2102
 
1394
- template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
2103
+ template <>
2104
+ EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
1395
2105
  // (void*) -> workaround clang warning:
1396
2106
  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
2107
+ EIGEN_DEBUG_ALIGNED_STORE
1397
2108
  _mm256_store_si256((__m256i*)(void*)to, from);
1398
2109
  }
1399
2110
 
1400
- template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
2111
+ template <>
2112
+ EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
1401
2113
  // (void*) -> workaround clang warning:
1402
2114
  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
2115
+ EIGEN_DEBUG_UNALIGNED_STORE
1403
2116
  _mm256_storeu_si256((__m256i*)(void*)to, from);
1404
2117
  }
1405
2118
 
1406
- template<> EIGEN_STRONG_INLINE Packet16h
1407
- ploaddup<Packet16h>(const Eigen::half* from) {
2119
+ template <>
2120
+ EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) {
1408
2121
  unsigned short a = from[0].x;
1409
2122
  unsigned short b = from[1].x;
1410
2123
  unsigned short c = from[2].x;
@@ -1416,8 +2129,8 @@ ploaddup<Packet16h>(const Eigen::half* from) {
1416
2129
  return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
1417
2130
  }
1418
2131
 
1419
- template<> EIGEN_STRONG_INLINE Packet16h
1420
- ploadquad(const Eigen::half* from) {
2132
+ template <>
2133
+ EIGEN_STRONG_INLINE Packet16h ploadquad(const Eigen::half* from) {
1421
2134
  unsigned short a = from[0].x;
1422
2135
  unsigned short b = from[1].x;
1423
2136
  unsigned short c = from[2].x;
@@ -1425,65 +2138,15 @@ ploadquad(const Eigen::half* from) {
1425
2138
  return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
1426
2139
  }
1427
2140
 
1428
- EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
1429
- #ifdef EIGEN_HAS_FP16_C
1430
- return _mm512_cvtph_ps(a);
1431
- #else
1432
- EIGEN_ALIGN64 half aux[16];
1433
- pstore(aux, a);
1434
- float f0(aux[0]);
1435
- float f1(aux[1]);
1436
- float f2(aux[2]);
1437
- float f3(aux[3]);
1438
- float f4(aux[4]);
1439
- float f5(aux[5]);
1440
- float f6(aux[6]);
1441
- float f7(aux[7]);
1442
- float f8(aux[8]);
1443
- float f9(aux[9]);
1444
- float fa(aux[10]);
1445
- float fb(aux[11]);
1446
- float fc(aux[12]);
1447
- float fd(aux[13]);
1448
- float fe(aux[14]);
1449
- float ff(aux[15]);
1450
-
1451
- return _mm512_set_ps(
1452
- ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
1453
- #endif
1454
- }
2141
+ EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { return _mm512_cvtph_ps(a); }
1455
2142
 
1456
2143
  EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
1457
- #ifdef EIGEN_HAS_FP16_C
1458
- return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
1459
- #else
1460
- EIGEN_ALIGN64 float aux[16];
1461
- pstore(aux, a);
1462
- half h0(aux[0]);
1463
- half h1(aux[1]);
1464
- half h2(aux[2]);
1465
- half h3(aux[3]);
1466
- half h4(aux[4]);
1467
- half h5(aux[5]);
1468
- half h6(aux[6]);
1469
- half h7(aux[7]);
1470
- half h8(aux[8]);
1471
- half h9(aux[9]);
1472
- half ha(aux[10]);
1473
- half hb(aux[11]);
1474
- half hc(aux[12]);
1475
- half hd(aux[13]);
1476
- half he(aux[14]);
1477
- half hf(aux[15]);
1478
-
1479
- return _mm256_set_epi16(
1480
- hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
1481
- h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
1482
- #endif
2144
+ return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
1483
2145
  }
1484
2146
 
1485
- template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
1486
- return ptrue(Packet8i(a));
2147
+ template <>
2148
+ EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
2149
+ return Packet16h(ptrue(Packet8i(a)));
1487
2150
  }
1488
2151
 
1489
2152
  template <>
@@ -1493,14 +2156,12 @@ EIGEN_STRONG_INLINE Packet16h pabs(const Packet16h& a) {
1493
2156
  }
1494
2157
 
1495
2158
  template <>
1496
- EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a,
1497
- const Packet16h& b) {
2159
+ EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) {
1498
2160
  return float2half(pmin<Packet16f>(half2float(a), half2float(b)));
1499
2161
  }
1500
2162
 
1501
2163
  template <>
1502
- EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a,
1503
- const Packet16h& b) {
2164
+ EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) {
1504
2165
  return float2half(pmax<Packet16f>(half2float(a), half2float(b)));
1505
2166
  }
1506
2167
 
@@ -1509,164 +2170,185 @@ EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
1509
2170
  return float2half(plset<Packet16f>(static_cast<float>(a)));
1510
2171
  }
1511
2172
 
1512
- template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
2173
+ template <>
2174
+ EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) {
1513
2175
  // in some cases Packet8i is a wrapper around __m256i, so we need to
1514
2176
  // cast to Packet8i to call the correct overload.
1515
- return por(Packet8i(a),Packet8i(b));
2177
+ return Packet16h(por(Packet8i(a), Packet8i(b)));
1516
2178
  }
1517
- template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) {
1518
- return pxor(Packet8i(a),Packet8i(b));
2179
+ template <>
2180
+ EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) {
2181
+ return Packet16h(pxor(Packet8i(a), Packet8i(b)));
1519
2182
  }
1520
- template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) {
1521
- return pand(Packet8i(a),Packet8i(b));
2183
+ template <>
2184
+ EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) {
2185
+ return Packet16h(pand(Packet8i(a), Packet8i(b)));
1522
2186
  }
1523
- template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) {
1524
- return pandnot(Packet8i(a),Packet8i(b));
2187
+ template <>
2188
+ EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) {
2189
+ return Packet16h(pandnot(Packet8i(a), Packet8i(b)));
1525
2190
  }
1526
2191
 
1527
- template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
2192
+ template <>
2193
+ EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
1528
2194
  return _mm256_blendv_epi8(b, a, mask);
1529
2195
  }
1530
2196
 
1531
- template<> EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
2197
+ template <>
2198
+ EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
1532
2199
  return float2half(pround<Packet16f>(half2float(a)));
1533
2200
  }
1534
2201
 
1535
- template<> EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
2202
+ template <>
2203
+ EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
1536
2204
  return float2half(print<Packet16f>(half2float(a)));
1537
2205
  }
1538
2206
 
1539
- template<> EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
2207
+ template <>
2208
+ EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
1540
2209
  return float2half(pceil<Packet16f>(half2float(a)));
1541
2210
  }
1542
2211
 
1543
- template<> EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
2212
+ template <>
2213
+ EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
1544
2214
  return float2half(pfloor<Packet16f>(half2float(a)));
1545
2215
  }
1546
2216
 
1547
- template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
2217
+ template <>
2218
+ EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(const Packet16h& a) {
2219
+ return float2half(ptrunc<Packet16f>(half2float(a)));
2220
+ }
2221
+
2222
+ template <>
2223
+ EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
1548
2224
  Packet16f af = half2float(a);
1549
2225
  Packet16f bf = half2float(b);
1550
2226
  return Pack32To16(pcmp_eq(af, bf));
1551
2227
  }
1552
2228
 
1553
- template<> EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a,const Packet16h& b) {
2229
+ template <>
2230
+ EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) {
1554
2231
  return Pack32To16(pcmp_le(half2float(a), half2float(b)));
1555
2232
  }
1556
2233
 
1557
- template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a,const Packet16h& b) {
2234
+ template <>
2235
+ EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) {
1558
2236
  return Pack32To16(pcmp_lt(half2float(a), half2float(b)));
1559
2237
  }
1560
2238
 
1561
- template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a,const Packet16h& b) {
2239
+ template <>
2240
+ EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) {
1562
2241
  return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b)));
1563
2242
  }
1564
2243
 
1565
- template<> EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) { return a; }
2244
+ template <>
2245
+ EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) {
2246
+ return a;
2247
+ }
1566
2248
 
1567
- template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
2249
+ template <>
2250
+ EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
1568
2251
  Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
1569
2252
  return _mm256_xor_si256(a, sign_mask);
1570
2253
  }
1571
2254
 
1572
- template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
2255
+ template <>
2256
+ EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
1573
2257
  Packet16f af = half2float(a);
1574
2258
  Packet16f bf = half2float(b);
1575
2259
  Packet16f rf = padd(af, bf);
1576
2260
  return float2half(rf);
1577
2261
  }
1578
2262
 
1579
- template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
2263
+ template <>
2264
+ EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
1580
2265
  Packet16f af = half2float(a);
1581
2266
  Packet16f bf = half2float(b);
1582
2267
  Packet16f rf = psub(af, bf);
1583
2268
  return float2half(rf);
1584
2269
  }
1585
2270
 
1586
- template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
2271
+ template <>
2272
+ EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
1587
2273
  Packet16f af = half2float(a);
1588
2274
  Packet16f bf = half2float(b);
1589
2275
  Packet16f rf = pmul(af, bf);
1590
2276
  return float2half(rf);
1591
2277
  }
1592
2278
 
1593
- template<> EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
2279
+ template <>
2280
+ EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
1594
2281
  Packet16f af = half2float(a);
1595
2282
  Packet16f bf = half2float(b);
1596
2283
  Packet16f rf = pdiv(af, bf);
1597
2284
  return float2half(rf);
1598
2285
  }
1599
2286
 
1600
- template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
1601
- Packet16f from_float = half2float(from);
1602
- return half(predux(from_float));
2287
+ template <>
2288
+ EIGEN_STRONG_INLINE Packet16h pmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
2289
+ return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
1603
2290
  }
1604
2291
 
1605
2292
  template <>
1606
- EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
1607
- Packet8h lane0 = _mm256_extractf128_si256(a, 0);
1608
- Packet8h lane1 = _mm256_extractf128_si256(a, 1);
1609
- return padd<Packet8h>(lane0, lane1);
2293
+ EIGEN_STRONG_INLINE Packet16h pmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
2294
+ return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
1610
2295
  }
1611
2296
 
1612
- template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
1613
- Packet16f af = half2float(a);
1614
- float reduced = predux_max<Packet16f>(af);
1615
- return Eigen::half(reduced);
2297
+ template <>
2298
+ EIGEN_STRONG_INLINE Packet16h pnmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
2299
+ return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
1616
2300
  }
1617
2301
 
1618
- template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
1619
- Packet16f af = half2float(a);
1620
- float reduced = predux_min<Packet16f>(af);
1621
- return Eigen::half(reduced);
2302
+ template <>
2303
+ EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
2304
+ return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
1622
2305
  }
1623
2306
 
1624
- template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
1625
- Packet16f from_float = half2float(from);
1626
- return half(predux_mul(from_float));
2307
+ template <>
2308
+ EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
2309
+ Packet8h lane0 = _mm256_extractf128_si256(a, 0);
2310
+ Packet8h lane1 = _mm256_extractf128_si256(a, 1);
2311
+ return padd<Packet8h>(lane0, lane1);
1627
2312
  }
1628
2313
 
1629
- template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
1630
- {
1631
- __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
1632
- return _mm256_insertf128_si256(
1633
- _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a,1),m)),
1634
- _mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1);
2314
+ template <>
2315
+ EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
2316
+ __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
2317
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a, 1), m)),
2318
+ _mm_shuffle_epi8(_mm256_extractf128_si256(a, 0), m), 1);
1635
2319
  }
1636
2320
 
1637
- template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
1638
- {
1639
- return _mm256_set_epi16(
1640
- from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
1641
- from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
1642
- from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
1643
- from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
2321
+ template <>
2322
+ EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
2323
+ return _mm256_set_epi16(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
2324
+ from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
2325
+ from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
2326
+ from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
1644
2327
  }
1645
2328
 
1646
- template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
1647
- {
2329
+ template <>
2330
+ EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
1648
2331
  EIGEN_ALIGN64 half aux[16];
1649
2332
  pstore(aux, from);
1650
- to[stride*0] = aux[0];
1651
- to[stride*1] = aux[1];
1652
- to[stride*2] = aux[2];
1653
- to[stride*3] = aux[3];
1654
- to[stride*4] = aux[4];
1655
- to[stride*5] = aux[5];
1656
- to[stride*6] = aux[6];
1657
- to[stride*7] = aux[7];
1658
- to[stride*8] = aux[8];
1659
- to[stride*9] = aux[9];
1660
- to[stride*10] = aux[10];
1661
- to[stride*11] = aux[11];
1662
- to[stride*12] = aux[12];
1663
- to[stride*13] = aux[13];
1664
- to[stride*14] = aux[14];
1665
- to[stride*15] = aux[15];
1666
- }
1667
-
1668
- EIGEN_STRONG_INLINE void
1669
- ptranspose(PacketBlock<Packet16h,16>& kernel) {
2333
+ to[stride * 0] = aux[0];
2334
+ to[stride * 1] = aux[1];
2335
+ to[stride * 2] = aux[2];
2336
+ to[stride * 3] = aux[3];
2337
+ to[stride * 4] = aux[4];
2338
+ to[stride * 5] = aux[5];
2339
+ to[stride * 6] = aux[6];
2340
+ to[stride * 7] = aux[7];
2341
+ to[stride * 8] = aux[8];
2342
+ to[stride * 9] = aux[9];
2343
+ to[stride * 10] = aux[10];
2344
+ to[stride * 11] = aux[11];
2345
+ to[stride * 12] = aux[12];
2346
+ to[stride * 13] = aux[13];
2347
+ to[stride * 14] = aux[14];
2348
+ to[stride * 15] = aux[15];
2349
+ }
2350
+
2351
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
1670
2352
  __m256i a = kernel.packet[0];
1671
2353
  __m256i b = kernel.packet[1];
1672
2354
  __m256i c = kernel.packet[2];
@@ -1773,8 +2455,7 @@ ptranspose(PacketBlock<Packet16h,16>& kernel) {
1773
2455
  kernel.packet[15] = a_p_f;
1774
2456
  }
1775
2457
 
1776
- EIGEN_STRONG_INLINE void
1777
- ptranspose(PacketBlock<Packet16h,8>& kernel) {
2458
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
1778
2459
  EIGEN_ALIGN64 half in[8][16];
1779
2460
  pstore<half>(in[0], kernel.packet[0]);
1780
2461
  pstore<half>(in[1], kernel.packet[1]);
@@ -1789,10 +2470,10 @@ ptranspose(PacketBlock<Packet16h,8>& kernel) {
1789
2470
 
1790
2471
  for (int i = 0; i < 8; ++i) {
1791
2472
  for (int j = 0; j < 8; ++j) {
1792
- out[i][j] = in[j][2*i];
2473
+ out[i][j] = in[j][2 * i];
1793
2474
  }
1794
2475
  for (int j = 0; j < 8; ++j) {
1795
- out[i][j+8] = in[j][2*i+1];
2476
+ out[i][j + 8] = in[j][2 * i + 1];
1796
2477
  }
1797
2478
  }
1798
2479
 
@@ -1806,8 +2487,7 @@ ptranspose(PacketBlock<Packet16h,8>& kernel) {
1806
2487
  kernel.packet[7] = pload<Packet16h>(out[7]);
1807
2488
  }
1808
2489
 
1809
- EIGEN_STRONG_INLINE void
1810
- ptranspose(PacketBlock<Packet16h,4>& kernel) {
2490
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
1811
2491
  EIGEN_ALIGN64 half in[4][16];
1812
2492
  pstore<half>(in[0], kernel.packet[0]);
1813
2493
  pstore<half>(in[1], kernel.packet[1]);
@@ -1818,16 +2498,16 @@ ptranspose(PacketBlock<Packet16h,4>& kernel) {
1818
2498
 
1819
2499
  for (int i = 0; i < 4; ++i) {
1820
2500
  for (int j = 0; j < 4; ++j) {
1821
- out[i][j] = in[j][4*i];
2501
+ out[i][j] = in[j][4 * i];
1822
2502
  }
1823
2503
  for (int j = 0; j < 4; ++j) {
1824
- out[i][j+4] = in[j][4*i+1];
2504
+ out[i][j + 4] = in[j][4 * i + 1];
1825
2505
  }
1826
2506
  for (int j = 0; j < 4; ++j) {
1827
- out[i][j+8] = in[j][4*i+2];
2507
+ out[i][j + 8] = in[j][4 * i + 2];
1828
2508
  }
1829
2509
  for (int j = 0; j < 4; ++j) {
1830
- out[i][j+12] = in[j][4*i+3];
2510
+ out[i][j + 12] = in[j][4 * i + 3];
1831
2511
  }
1832
2512
  }
1833
2513
 
@@ -1837,7 +2517,12 @@ ptranspose(PacketBlock<Packet16h,4>& kernel) {
1837
2517
  kernel.packet[3] = pload<Packet16h>(out[3]);
1838
2518
  }
1839
2519
 
1840
- template <> struct is_arithmetic<Packet16bf> { enum { value = true }; };
2520
+ #endif // EIGEN_VECTORIZE_AVX512FP16
2521
+
2522
+ template <>
2523
+ struct is_arithmetic<Packet16bf> {
2524
+ enum { value = true };
2525
+ };
1841
2526
 
1842
2527
  template <>
1843
2528
  struct packet_traits<bfloat16> : default_packet_traits {
@@ -1847,35 +2532,37 @@ struct packet_traits<bfloat16> : default_packet_traits {
1847
2532
  Vectorizable = 1,
1848
2533
  AlignedOnScalar = 1,
1849
2534
  size = 16,
1850
- HasHalfPacket = 1,
1851
2535
  HasBlend = 0,
1852
2536
  HasInsert = 1,
1853
2537
  HasSin = EIGEN_FAST_MATH,
1854
2538
  HasCos = EIGEN_FAST_MATH,
1855
- #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
2539
+ HasSqrt = 1,
2540
+ HasRsqrt = 1,
1856
2541
  #ifdef EIGEN_VECTORIZE_AVX512DQ
1857
2542
  HasLog = 1, // Currently fails test with bad accuracy.
1858
- HasLog1p = 1,
1859
- HasExpm1 = 1,
2543
+ HasLog1p = 1,
2544
+ HasExpm1 = 1,
1860
2545
  HasNdtri = 1,
1861
- HasBessel = 1,
2546
+ HasBessel = 1,
1862
2547
  #endif
1863
2548
  HasExp = 1,
1864
- HasSqrt = EIGEN_FAST_MATH,
1865
- HasRsqrt = EIGEN_FAST_MATH,
1866
2549
  HasTanh = EIGEN_FAST_MATH,
1867
2550
  HasErf = EIGEN_FAST_MATH,
1868
- #endif
1869
- HasCmp = 1,
2551
+ HasCmp = 1,
1870
2552
  HasDiv = 1
1871
2553
  };
1872
2554
  };
1873
2555
 
1874
2556
  template <>
1875
- struct unpacket_traits<Packet16bf>
1876
- {
2557
+ struct unpacket_traits<Packet16bf> {
1877
2558
  typedef bfloat16 type;
1878
- enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
2559
+ enum {
2560
+ size = 16,
2561
+ alignment = Aligned32,
2562
+ vectorizable = true,
2563
+ masked_load_available = false,
2564
+ masked_store_available = false
2565
+ };
1879
2566
  typedef Packet8bf half;
1880
2567
  };
1881
2568
 
@@ -1902,20 +2589,19 @@ EIGEN_STRONG_INLINE Packet16bf ploadu<Packet16bf>(const bfloat16* from) {
1902
2589
  }
1903
2590
 
1904
2591
  template <>
1905
- EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to,
1906
- const Packet16bf& from) {
2592
+ EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet16bf& from) {
2593
+ EIGEN_DEBUG_ALIGNED_STORE
1907
2594
  _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
1908
2595
  }
1909
2596
 
1910
2597
  template <>
1911
- EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to,
1912
- const Packet16bf& from) {
2598
+ EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet16bf& from) {
2599
+ EIGEN_DEBUG_UNALIGNED_STORE
1913
2600
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
1914
2601
  }
1915
2602
 
1916
- template<> EIGEN_STRONG_INLINE Packet16bf
1917
- ploaddup<Packet16bf>(const bfloat16* from) {
1918
- Packet16bf r;
2603
+ template <>
2604
+ EIGEN_STRONG_INLINE Packet16bf ploaddup<Packet16bf>(const bfloat16* from) {
1919
2605
  unsigned short a = from[0].value;
1920
2606
  unsigned short b = from[1].value;
1921
2607
  unsigned short c = from[2].value;
@@ -1927,9 +2613,8 @@ ploaddup<Packet16bf>(const bfloat16* from) {
1927
2613
  return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
1928
2614
  }
1929
2615
 
1930
- template<> EIGEN_STRONG_INLINE Packet16bf
1931
- ploadquad(const bfloat16* from) {
1932
- Packet16bf r;
2616
+ template <>
2617
+ EIGEN_STRONG_INLINE Packet16bf ploadquad(const bfloat16* from) {
1933
2618
  unsigned short a = from[0].value;
1934
2619
  unsigned short b = from[1].value;
1935
2620
  unsigned short c = from[2].value;
@@ -1945,9 +2630,9 @@ EIGEN_STRONG_INLINE Packet16f Bf16ToF32(const Packet16bf& a) {
1945
2630
  EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
1946
2631
  Packet16bf r;
1947
2632
 
1948
- #if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1)
2633
+ #if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_STRICT_AT_LEAST(10, 1, 0)
1949
2634
  // Since GCC 10.1 supports avx512bf16 and C style explicit cast
1950
- // (C++ static_cast is not supported yet), do converion via intrinsic
2635
+ // (C++ static_cast is not supported yet), do conversion via intrinsic
1951
2636
  // and register path for performance.
1952
2637
  r = (__m256i)(_mm512_cvtneps_pbh(a));
1953
2638
 
@@ -1971,84 +2656,85 @@ EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
1971
2656
  t = _mm512_mask_blend_epi32(mask, nan, t);
1972
2657
  // output.value = static_cast<uint16_t>(input);
1973
2658
  r = _mm512_cvtepi32_epi16(t);
1974
- #endif // EIGEN_VECTORIZE_AVX512BF16
2659
+ #endif // EIGEN_VECTORIZE_AVX512BF16
1975
2660
 
1976
2661
  return r;
1977
2662
  }
1978
2663
 
1979
2664
  template <>
1980
2665
  EIGEN_STRONG_INLINE Packet16bf ptrue(const Packet16bf& a) {
1981
- return ptrue<Packet8i>(a);
2666
+ return Packet16bf(ptrue<Packet8i>(Packet8i(a)));
1982
2667
  }
1983
2668
 
1984
2669
  template <>
1985
2670
  EIGEN_STRONG_INLINE Packet16bf por(const Packet16bf& a, const Packet16bf& b) {
1986
- return por<Packet8i>(a, b);
2671
+ return Packet16bf(por<Packet8i>(Packet8i(a), Packet8i(b)));
1987
2672
  }
1988
2673
 
1989
2674
  template <>
1990
2675
  EIGEN_STRONG_INLINE Packet16bf pxor(const Packet16bf& a, const Packet16bf& b) {
1991
- return pxor<Packet8i>(a, b);
2676
+ return Packet16bf(pxor<Packet8i>(Packet8i(a), Packet8i(b)));
1992
2677
  }
1993
2678
 
1994
2679
  template <>
1995
2680
  EIGEN_STRONG_INLINE Packet16bf pand(const Packet16bf& a, const Packet16bf& b) {
1996
- return pand<Packet8i>(a, b);
2681
+ return Packet16bf(pand<Packet8i>(Packet8i(a), Packet8i(b)));
1997
2682
  }
1998
2683
 
1999
2684
  template <>
2000
- EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a,
2001
- const Packet16bf& b) {
2002
- return pandnot<Packet8i>(a, b);
2685
+ EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a, const Packet16bf& b) {
2686
+ return Packet16bf(pandnot<Packet8i>(Packet8i(a), Packet8i(b)));
2003
2687
  }
2004
2688
 
2005
2689
  template <>
2006
- EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask,
2007
- const Packet16bf& a,
2008
- const Packet16bf& b) {
2690
+ EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask, const Packet16bf& a, const Packet16bf& b) {
2009
2691
  // Input mask is expected to be all 0/1, handle it with 8-bit
2010
2692
  // intrinsic for performance.
2011
2693
  return _mm256_blendv_epi8(b, a, mask);
2012
2694
  }
2013
2695
 
2014
- template<> EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a)
2015
- {
2696
+ template <>
2697
+ EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a) {
2016
2698
  return F32ToBf16(pround<Packet16f>(Bf16ToF32(a)));
2017
2699
  }
2018
2700
 
2019
- template<> EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
2701
+ template <>
2702
+ EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
2020
2703
  return F32ToBf16(print<Packet16f>(Bf16ToF32(a)));
2021
2704
  }
2022
2705
 
2023
- template<> EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
2706
+ template <>
2707
+ EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
2024
2708
  return F32ToBf16(pceil<Packet16f>(Bf16ToF32(a)));
2025
2709
  }
2026
2710
 
2027
- template<> EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
2711
+ template <>
2712
+ EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
2028
2713
  return F32ToBf16(pfloor<Packet16f>(Bf16ToF32(a)));
2029
2714
  }
2030
2715
 
2031
2716
  template <>
2032
- EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a,
2033
- const Packet16bf& b) {
2717
+ EIGEN_STRONG_INLINE Packet16bf ptrunc<Packet16bf>(const Packet16bf& a) {
2718
+ return F32ToBf16(ptrunc<Packet16f>(Bf16ToF32(a)));
2719
+ }
2720
+
2721
+ template <>
2722
+ EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a, const Packet16bf& b) {
2034
2723
  return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
2035
2724
  }
2036
2725
 
2037
2726
  template <>
2038
- EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a,
2039
- const Packet16bf& b) {
2727
+ EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a, const Packet16bf& b) {
2040
2728
  return Pack32To16(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
2041
2729
  }
2042
2730
 
2043
2731
  template <>
2044
- EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a,
2045
- const Packet16bf& b) {
2732
+ EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a, const Packet16bf& b) {
2046
2733
  return Pack32To16(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
2047
2734
  }
2048
2735
 
2049
2736
  template <>
2050
- EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a,
2051
- const Packet16bf& b) {
2737
+ EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a, const Packet16bf& b) {
2052
2738
  return Pack32To16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
2053
2739
  }
2054
2740
 
@@ -2070,77 +2756,71 @@ EIGEN_STRONG_INLINE Packet16bf pabs(const Packet16bf& a) {
2070
2756
  }
2071
2757
 
2072
2758
  template <>
2073
- EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a,
2074
- const Packet16bf& b) {
2759
+ EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
2075
2760
  return F32ToBf16(padd<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2076
2761
  }
2077
2762
 
2078
2763
  template <>
2079
- EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a,
2080
- const Packet16bf& b) {
2764
+ EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
2081
2765
  return F32ToBf16(psub<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2082
2766
  }
2083
2767
 
2084
2768
  template <>
2085
- EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a,
2086
- const Packet16bf& b) {
2087
- return F32ToBf16(pmul<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2769
+ EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
2770
+ return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b)));
2088
2771
  }
2089
2772
 
2090
2773
  template <>
2091
- EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a,
2092
- const Packet16bf& b) {
2093
- return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2774
+ EIGEN_STRONG_INLINE Packet16bf pmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
2775
+ return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2094
2776
  }
2095
2777
 
2096
2778
  template <>
2097
- EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a,
2098
- const Packet16bf& b) {
2099
- return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2779
+ EIGEN_STRONG_INLINE Packet16bf pmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
2780
+ return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2100
2781
  }
2101
2782
 
2102
2783
  template <>
2103
- EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a,
2104
- const Packet16bf& b) {
2105
- return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2784
+ EIGEN_STRONG_INLINE Packet16bf pnmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
2785
+ return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2106
2786
  }
2107
2787
 
2108
2788
  template <>
2109
- EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(const bfloat16& a) {
2110
- return F32ToBf16(plset<Packet16f>(static_cast<float>(a)));
2789
+ EIGEN_STRONG_INLINE Packet16bf pnmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
2790
+ return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2111
2791
  }
2112
2792
 
2113
2793
  template <>
2114
- EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a) {
2115
- Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
2116
- Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
2117
- return padd<Packet8bf>(lane0, lane1);
2794
+ EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
2795
+ return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2118
2796
  }
2119
2797
 
2120
2798
  template <>
2121
- EIGEN_STRONG_INLINE bfloat16 predux<Packet16bf>(const Packet16bf& p) {
2122
- return static_cast<bfloat16>(predux<Packet16f>(Bf16ToF32(p)));
2799
+ EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
2800
+ return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2123
2801
  }
2124
2802
 
2125
2803
  template <>
2126
- EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet16bf>(const Packet16bf& from) {
2127
- return static_cast<bfloat16>(predux_mul<Packet16f>(Bf16ToF32(from)));
2804
+ EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
2805
+ return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2128
2806
  }
2129
2807
 
2130
2808
  template <>
2131
- EIGEN_STRONG_INLINE bfloat16 predux_min<Packet16bf>(const Packet16bf& from) {
2132
- return static_cast<bfloat16>(predux_min<Packet16f>(Bf16ToF32(from)));
2809
+ EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(const bfloat16& a) {
2810
+ return F32ToBf16(plset<Packet16f>(static_cast<float>(a)));
2133
2811
  }
2134
2812
 
2135
2813
  template <>
2136
- EIGEN_STRONG_INLINE bfloat16 predux_max<Packet16bf>(const Packet16bf& from) {
2137
- return static_cast<bfloat16>(predux_max<Packet16f>(Bf16ToF32(from)));
2814
+ EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a) {
2815
+ Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
2816
+ Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
2817
+ return padd<Packet8bf>(lane0, lane1);
2138
2818
  }
2139
2819
 
2140
2820
  template <>
2141
2821
  EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
2142
- __m256i m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,
2143
- 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
2822
+ __m256i m = _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,
2823
+ 4, 5, 2, 3, 0, 1);
2144
2824
 
2145
2825
  Packet16bf res;
2146
2826
  // Swap hi and lo first because shuffle is in 128-bit lanes.
@@ -2150,40 +2830,37 @@ EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
2150
2830
  }
2151
2831
 
2152
2832
  template <>
2153
- EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from,
2154
- Index stride) {
2833
+ EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from, Index stride) {
2155
2834
  return _mm256_set_epi16(
2156
- from[15*stride].value, from[14*stride].value, from[13*stride].value, from[12*stride].value,
2157
- from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value,
2158
- from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value,
2159
- from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value);
2835
+ from[15 * stride].value, from[14 * stride].value, from[13 * stride].value, from[12 * stride].value,
2836
+ from[11 * stride].value, from[10 * stride].value, from[9 * stride].value, from[8 * stride].value,
2837
+ from[7 * stride].value, from[6 * stride].value, from[5 * stride].value, from[4 * stride].value,
2838
+ from[3 * stride].value, from[2 * stride].value, from[1 * stride].value, from[0 * stride].value);
2160
2839
  }
2161
2840
 
2162
2841
  template <>
2163
- EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to,
2164
- const Packet16bf& from,
2165
- Index stride) {
2842
+ EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to, const Packet16bf& from, Index stride) {
2166
2843
  EIGEN_ALIGN64 bfloat16 aux[16];
2167
2844
  pstore(aux, from);
2168
- to[stride*0] = aux[0];
2169
- to[stride*1] = aux[1];
2170
- to[stride*2] = aux[2];
2171
- to[stride*3] = aux[3];
2172
- to[stride*4] = aux[4];
2173
- to[stride*5] = aux[5];
2174
- to[stride*6] = aux[6];
2175
- to[stride*7] = aux[7];
2176
- to[stride*8] = aux[8];
2177
- to[stride*9] = aux[9];
2178
- to[stride*10] = aux[10];
2179
- to[stride*11] = aux[11];
2180
- to[stride*12] = aux[12];
2181
- to[stride*13] = aux[13];
2182
- to[stride*14] = aux[14];
2183
- to[stride*15] = aux[15];
2184
- }
2185
-
2186
- EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,16>& kernel) {
2845
+ to[stride * 0] = aux[0];
2846
+ to[stride * 1] = aux[1];
2847
+ to[stride * 2] = aux[2];
2848
+ to[stride * 3] = aux[3];
2849
+ to[stride * 4] = aux[4];
2850
+ to[stride * 5] = aux[5];
2851
+ to[stride * 6] = aux[6];
2852
+ to[stride * 7] = aux[7];
2853
+ to[stride * 8] = aux[8];
2854
+ to[stride * 9] = aux[9];
2855
+ to[stride * 10] = aux[10];
2856
+ to[stride * 11] = aux[11];
2857
+ to[stride * 12] = aux[12];
2858
+ to[stride * 13] = aux[13];
2859
+ to[stride * 14] = aux[14];
2860
+ to[stride * 15] = aux[15];
2861
+ }
2862
+
2863
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf, 16>& kernel) {
2187
2864
  __m256i a = kernel.packet[0];
2188
2865
  __m256i b = kernel.packet[1];
2189
2866
  __m256i c = kernel.packet[2];
@@ -2273,7 +2950,7 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,16>& kernel) {
2273
2950
  kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
2274
2951
  }
2275
2952
 
2276
- EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,4>& kernel) {
2953
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf, 4>& kernel) {
2277
2954
  __m256i a = kernel.packet[0];
2278
2955
  __m256i b = kernel.packet[1];
2279
2956
  __m256i c = kernel.packet[2];
@@ -2296,8 +2973,174 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,4>& kernel) {
2296
2973
  kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
2297
2974
  }
2298
2975
 
2299
- } // end namespace internal
2976
+ // Minimal implementation of 16-bit int packets for use in pfrexp, pldexp.
2977
+
2978
+ template <>
2979
+ EIGEN_STRONG_INLINE Packet32s pset1<Packet32s>(const numext::int16_t& x) {
2980
+ return _mm512_set1_epi16(x);
2981
+ }
2982
+
2983
+ template <>
2984
+ EIGEN_STRONG_INLINE Packet16s pset1<Packet16s>(const numext::int16_t& x) {
2985
+ return _mm256_set1_epi16(x);
2986
+ }
2987
+
2988
+ template <>
2989
+ EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const numext::int16_t& x) {
2990
+ return _mm_set1_epi16(x);
2991
+ }
2992
+
2993
+ template <>
2994
+ EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
2995
+ EIGEN_DEBUG_ALIGNED_STORE
2996
+ _mm512_store_epi32(out, x);
2997
+ }
2998
+
2999
+ template <>
3000
+ EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
3001
+ EIGEN_DEBUG_ALIGNED_STORE
3002
+ #if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
3003
+ _mm256_store_epi32(out, x);
3004
+ #else
3005
+ _mm256_store_si256(reinterpret_cast<__m256i*>(out), x);
3006
+ #endif
3007
+ }
3008
+
3009
+ template <>
3010
+ EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
3011
+ EIGEN_DEBUG_ALIGNED_STORE
3012
+ #if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
3013
+ _mm256_store_epi32(out, x);
3014
+ #else
3015
+ _mm_store_si128(reinterpret_cast<__m128i*>(out), x);
3016
+ #endif
3017
+ }
3018
+
3019
+ template <>
3020
+ EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
3021
+ EIGEN_DEBUG_UNALIGNED_STORE
3022
+ _mm512_storeu_epi32(out, x);
3023
+ }
3024
+
3025
+ template <>
3026
+ EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
3027
+ EIGEN_DEBUG_UNALIGNED_STORE
3028
+ _mm256_storeu_epi32(out, x);
3029
+ }
3030
+
3031
+ template <>
3032
+ EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
3033
+ EIGEN_DEBUG_UNALIGNED_STORE
3034
+ _mm_storeu_epi32(out, x);
3035
+ }
3036
+
3037
+ template <>
3038
+ EIGEN_STRONG_INLINE Packet32s padd(const Packet32s& a, const Packet32s& b) {
3039
+ return _mm512_add_epi16(a, b);
3040
+ }
3041
+
3042
+ template <>
3043
+ EIGEN_STRONG_INLINE Packet16s padd(const Packet16s& a, const Packet16s& b) {
3044
+ return _mm256_add_epi16(a, b);
3045
+ }
3046
+
3047
+ template <>
3048
+ EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) {
3049
+ return _mm_add_epi16(a, b);
3050
+ }
3051
+
3052
+ template <>
3053
+ EIGEN_STRONG_INLINE Packet32s psub(const Packet32s& a, const Packet32s& b) {
3054
+ return _mm512_sub_epi16(a, b);
3055
+ }
3056
+
3057
+ template <>
3058
+ EIGEN_STRONG_INLINE Packet16s psub(const Packet16s& a, const Packet16s& b) {
3059
+ return _mm256_sub_epi16(a, b);
3060
+ }
3061
+
3062
+ template <>
3063
+ EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) {
3064
+ return _mm_sub_epi16(a, b);
3065
+ }
3066
+
3067
+ template <>
3068
+ EIGEN_STRONG_INLINE Packet32s pmul(const Packet32s& a, const Packet32s& b) {
3069
+ return _mm512_mullo_epi16(a, b);
3070
+ }
3071
+
3072
+ template <>
3073
+ EIGEN_STRONG_INLINE Packet16s pmul(const Packet16s& a, const Packet16s& b) {
3074
+ return _mm256_mullo_epi16(a, b);
3075
+ }
3076
+
3077
+ template <>
3078
+ EIGEN_STRONG_INLINE Packet8s pmul(const Packet8s& a, const Packet8s& b) {
3079
+ return _mm_mullo_epi16(a, b);
3080
+ }
3081
+
3082
+ template <>
3083
+ EIGEN_STRONG_INLINE Packet32s pnegate(const Packet32s& a) {
3084
+ return _mm512_sub_epi16(_mm512_setzero_si512(), a);
3085
+ }
3086
+
3087
+ template <>
3088
+ EIGEN_STRONG_INLINE Packet16s pnegate(const Packet16s& a) {
3089
+ return _mm256_sub_epi16(_mm256_setzero_si256(), a);
3090
+ }
3091
+
3092
+ template <>
3093
+ EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
3094
+ return _mm_sub_epi16(_mm_setzero_si128(), a);
3095
+ }
3096
+
3097
+ template <int N>
3098
+ EIGEN_STRONG_INLINE Packet32s parithmetic_shift_right(Packet32s a) {
3099
+ return _mm512_srai_epi16(a, N);
3100
+ }
3101
+
3102
+ template <int N>
3103
+ EIGEN_STRONG_INLINE Packet16s parithmetic_shift_right(Packet16s a) {
3104
+ return _mm256_srai_epi16(a, N);
3105
+ }
3106
+
3107
+ template <int N>
3108
+ EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
3109
+ return _mm_srai_epi16(a, N);
3110
+ }
3111
+
3112
+ template <int N>
3113
+ EIGEN_STRONG_INLINE Packet32s plogical_shift_left(Packet32s a) {
3114
+ return _mm512_slli_epi16(a, N);
3115
+ }
3116
+
3117
+ template <int N>
3118
+ EIGEN_STRONG_INLINE Packet16s plogical_shift_left(Packet16s a) {
3119
+ return _mm256_slli_epi16(a, N);
3120
+ }
3121
+
3122
+ template <int N>
3123
+ EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
3124
+ return _mm_slli_epi16(a, N);
3125
+ }
3126
+
3127
+ template <int N>
3128
+ EIGEN_STRONG_INLINE Packet32s plogical_shift_right(Packet32s a) {
3129
+ return _mm512_srli_epi16(a, N);
3130
+ }
3131
+
3132
+ template <int N>
3133
+ EIGEN_STRONG_INLINE Packet16s plogical_shift_right(Packet16s a) {
3134
+ return _mm256_srli_epi16(a, N);
3135
+ }
3136
+
3137
+ template <int N>
3138
+ EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
3139
+ return _mm_srli_epi16(a, N);
3140
+ }
3141
+
3142
+ } // end namespace internal
2300
3143
 
2301
- } // end namespace Eigen
3144
+ } // end namespace Eigen
2302
3145
 
2303
- #endif // EIGEN_PACKET_MATH_AVX512_H
3146
+ #endif // EIGEN_PACKET_MATH_AVX512_H