@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -21
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +235 -326
  6. package/eigen/Eigen/Eigenvalues +16 -14
  7. package/eigen/Eigen/Geometry +21 -24
  8. package/eigen/Eigen/Householder +9 -8
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -14
  11. package/eigen/Eigen/KLUSupport +43 -0
  12. package/eigen/Eigen/LU +16 -20
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -54
  15. package/eigen/Eigen/PaStiXSupport +23 -20
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -21
  18. package/eigen/Eigen/QtAlignedMalloc +5 -13
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -18
  21. package/eigen/Eigen/Sparse +1 -4
  22. package/eigen/Eigen/SparseCholesky +18 -23
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +12 -8
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
  37. package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
  42. package/eigen/Eigen/src/Core/Array.h +341 -294
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
  48. package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
  49. package/eigen/Eigen/src/Core/Block.h +375 -398
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
  53. package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
  59. package/eigen/Eigen/src/Core/DenseBase.h +632 -571
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
  61. package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +169 -210
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +172 -222
  67. package/eigen/Eigen/src/Core/EigenBase.h +75 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
  75. package/eigen/Eigen/src/Core/IO.h +147 -139
  76. package/eigen/Eigen/src/Core/IndexedView.h +321 -0
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +56 -66
  80. package/eigen/Eigen/src/Core/Map.h +124 -142
  81. package/eigen/Eigen/src/Core/MapBase.h +256 -281
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
  84. package/eigen/Eigen/src/Core/Matrix.h +491 -416
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
  86. package/eigen/Eigen/src/Core/NestByValue.h +66 -85
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -85
  88. package/eigen/Eigen/src/Core/NumTraits.h +235 -148
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
  92. package/eigen/Eigen/src/Core/Product.h +260 -139
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
  94. package/eigen/Eigen/src/Core/Random.h +161 -136
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +366 -336
  98. package/eigen/Eigen/src/Core/Ref.h +308 -209
  99. package/eigen/Eigen/src/Core/Replicate.h +94 -106
  100. package/eigen/Eigen/src/Core/Reshaped.h +398 -0
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +136 -145
  103. package/eigen/Eigen/src/Core/Select.h +70 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +97 -111
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
  109. package/eigen/Eigen/src/Core/SolverBase.h +138 -101
  110. package/eigen/Eigen/src/Core/StableNorm.h +156 -160
  111. package/eigen/Eigen/src/Core/StlIterators.h +619 -0
  112. package/eigen/Eigen/src/Core/Stride.h +91 -88
  113. package/eigen/Eigen/src/Core/Swap.h +70 -38
  114. package/eigen/Eigen/src/Core/Transpose.h +295 -273
  115. package/eigen/Eigen/src/Core/Transpositions.h +272 -317
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
  119. package/eigen/Eigen/src/Core/Visitor.h +480 -216
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
  217. package/eigen/Eigen/src/Core/util/Constants.h +314 -263
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
  226. package/eigen/Eigen/src/Core/util/Macros.h +939 -646
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
  229. package/eigen/Eigen/src/Core/util/Meta.h +618 -426
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
  264. package/eigen/Eigen/src/Geometry/Transform.h +896 -953
  265. package/eigen/Eigen/src/Geometry/Translation.h +100 -98
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
  269. package/eigen/Eigen/src/Householder/Householder.h +104 -122
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
  285. package/eigen/Eigen/src/LU/Determinant.h +60 -63
  286. package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/lib/LibEigen.d.ts +4 -0
  409. package/lib/LibEigen.js +14 -0
  410. package/lib/index.d.ts +1 -1
  411. package/lib/index.js +7 -3
  412. package/package.json +2 -10
  413. package/eigen/Eigen/CMakeLists.txt +0 -19
  414. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
  415. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
  416. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  417. package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
  418. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  419. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  420. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  421. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  422. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  423. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  424. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  425. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  426. package/eigen/Eigen/src/misc/lapack.h +0 -152
  427. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
  428. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
  429. package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
  430. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  431. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
  432. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  433. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
  434. package/lib/eigen.d.ts +0 -2
  435. package/lib/eigen.js +0 -15
@@ -0,0 +1,1413 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2025 The Eigen Authors.
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
11
+ #define EIGEN_PACKET_MATH_FP16_AVX512_H
12
+
13
+ // IWYU pragma: private
14
+ #include "../../InternalHeaderCheck.h"
15
+
16
+ namespace Eigen {
17
+
18
+ namespace internal {
19
+
20
+ typedef __m512h Packet32h;
21
+ typedef __m256h Packet16h;
22
+ typedef __m128h Packet8h;
23
+
24
+ template <>
25
+ struct is_arithmetic<Packet8h> {
26
+ enum { value = true };
27
+ };
28
+
29
+ template <>
30
+ struct packet_traits<half> : default_packet_traits {
31
+ typedef Packet32h type;
32
+ typedef Packet16h half;
33
+ enum {
34
+ Vectorizable = 1,
35
+ AlignedOnScalar = 1,
36
+ size = 32,
37
+
38
+ HasCmp = 1,
39
+ HasAdd = 1,
40
+ HasSub = 1,
41
+ HasMul = 1,
42
+ HasDiv = 1,
43
+ HasNegate = 1,
44
+ HasAbs = 1,
45
+ HasAbs2 = 0,
46
+ HasMin = 1,
47
+ HasMax = 1,
48
+ HasConj = 1,
49
+ HasSetLinear = 0,
50
+ HasLog = 1,
51
+ HasLog1p = 1,
52
+ HasExp = 1,
53
+ HasExpm1 = 1,
54
+ HasSqrt = 1,
55
+ HasRsqrt = 1,
56
+ // These ones should be implemented in future
57
+ HasBessel = 0,
58
+ HasNdtri = 0,
59
+ HasSin = EIGEN_FAST_MATH,
60
+ HasCos = EIGEN_FAST_MATH,
61
+ HasTanh = EIGEN_FAST_MATH,
62
+ HasErf = 0, // EIGEN_FAST_MATH,
63
+ HasBlend = 0
64
+ };
65
+ };
66
+
67
+ template <>
68
+ struct unpacket_traits<Packet32h> {
69
+ typedef Eigen::half type;
70
+ typedef Packet16h half;
71
+ typedef Packet32s integer_packet;
72
+ enum {
73
+ size = 32,
74
+ alignment = Aligned64,
75
+ vectorizable = true,
76
+ masked_load_available = false,
77
+ masked_store_available = false
78
+ };
79
+ };
80
+
81
+ template <>
82
+ struct unpacket_traits<Packet16h> {
83
+ typedef Eigen::half type;
84
+ typedef Packet8h half;
85
+ typedef Packet16s integer_packet;
86
+ enum {
87
+ size = 16,
88
+ alignment = Aligned32,
89
+ vectorizable = true,
90
+ masked_load_available = false,
91
+ masked_store_available = false
92
+ };
93
+ };
94
+
95
+ template <>
96
+ struct unpacket_traits<Packet8h> {
97
+ typedef Eigen::half type;
98
+ typedef Packet8h half;
99
+ typedef Packet8s integer_packet;
100
+ enum {
101
+ size = 8,
102
+ alignment = Aligned16,
103
+ vectorizable = true,
104
+ masked_load_available = false,
105
+ masked_store_available = false
106
+ };
107
+ };
108
+
109
+ // Conversions
110
+
111
+ EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { return _mm512_cvtxph_ps(a); }
112
+
113
+ EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { return _mm256_cvtxph_ps(a); }
114
+
115
+ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { return _mm512_cvtxps_ph(a); }
116
+
117
+ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { return _mm256_cvtxps_ph(a); }
118
+
119
+ // Memory functions
120
+
121
+ // pset1
122
+
123
+ template <>
124
+ EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
125
+ return _mm512_set1_ph(from.x);
126
+ }
127
+
128
+ template <>
129
+ EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
130
+ return _mm256_set1_ph(from.x);
131
+ }
132
+
133
+ template <>
134
+ EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
135
+ return _mm_set1_ph(from.x);
136
+ }
137
+
138
+ template <>
139
+ EIGEN_STRONG_INLINE Packet32h pzero(const Packet32h& /*a*/) {
140
+ return _mm512_setzero_ph();
141
+ }
142
+
143
+ template <>
144
+ EIGEN_STRONG_INLINE Packet16h pzero(const Packet16h& /*a*/) {
145
+ return _mm256_setzero_ph();
146
+ }
147
+
148
+ template <>
149
+ EIGEN_STRONG_INLINE Packet8h pzero(const Packet8h& /*a*/) {
150
+ return _mm_setzero_ph();
151
+ }
152
+
153
+ // pset1frombits
154
+ template <>
155
+ EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
156
+ return _mm512_castsi512_ph(_mm512_set1_epi16(from));
157
+ }
158
+
159
+ template <>
160
+ EIGEN_STRONG_INLINE Packet16h pset1frombits<Packet16h>(unsigned short from) {
161
+ return _mm256_castsi256_ph(_mm256_set1_epi16(from));
162
+ }
163
+
164
+ template <>
165
+ EIGEN_STRONG_INLINE Packet8h pset1frombits<Packet8h>(unsigned short from) {
166
+ return _mm_castsi128_ph(_mm_set1_epi16(from));
167
+ }
168
+
169
+ // pfirst
170
+
171
+ template <>
172
+ EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
173
+ return Eigen::half(_mm512_cvtsh_h(from));
174
+ }
175
+
176
+ template <>
177
+ EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
178
+ return Eigen::half(_mm256_cvtsh_h(from));
179
+ }
180
+
181
+ template <>
182
+ EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
183
+ return Eigen::half(_mm_cvtsh_h(from));
184
+ }
185
+
186
+ // pload
187
+
188
+ template <>
189
+ EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
190
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
191
+ }
192
+
193
+ template <>
194
+ EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
195
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ph(from);
196
+ }
197
+
198
+ template <>
199
+ EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
200
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ph(from);
201
+ }
202
+
203
+ // ploadu
204
+
205
+ template <>
206
+ EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
207
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
208
+ }
209
+
210
+ template <>
211
+ EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
212
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ph(from);
213
+ }
214
+
215
+ template <>
216
+ EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
217
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ph(from);
218
+ }
219
+
220
+ // pstore
221
+
222
+ template <>
223
+ EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
224
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
225
+ }
226
+
227
+ template <>
228
+ EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
229
+ EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ph(to, from);
230
+ }
231
+
232
+ template <>
233
+ EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet8h& from) {
234
+ EIGEN_DEBUG_ALIGNED_STORE _mm_store_ph(to, from);
235
+ }
236
+
237
+ // pstoreu
238
+
239
+ template <>
240
+ EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
241
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
242
+ }
243
+
244
+ template <>
245
+ EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
246
+ EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ph(to, from);
247
+ }
248
+
249
+ template <>
250
+ EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet8h& from) {
251
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ph(to, from);
252
+ }
253
+
254
+ // ploaddup
255
+ template <>
256
+ EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
257
+ __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
258
+ return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
259
+ 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
260
+ a);
261
+ }
262
+
263
+ template <>
264
+ EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) {
265
+ __m256h a = _mm256_castph128_ph256(_mm_loadu_ph(from));
266
+ return _mm256_permutexvar_ph(_mm256_set_epi16(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), a);
267
+ }
268
+
269
+ template <>
270
+ EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) {
271
+ return _mm_set_ph(from[3].x, from[3].x, from[2].x, from[2].x, from[1].x, from[1].x, from[0].x, from[0].x);
272
+ }
273
+
274
+ // ploadquad
275
+ template <>
276
+ EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
277
+ __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
278
+ return _mm512_permutexvar_ph(
279
+ _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
280
+ a);
281
+ }
282
+
283
+ template <>
284
+ EIGEN_STRONG_INLINE Packet16h ploadquad<Packet16h>(const Eigen::half* from) {
285
+ return _mm256_set_ph(from[3].x, from[3].x, from[3].x, from[3].x, from[2].x, from[2].x, from[2].x, from[2].x,
286
+ from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
287
+ }
288
+
289
+ template <>
290
+ EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
291
+ return _mm_set_ph(from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
292
+ }
293
+
294
+ // pabs
295
+
296
+ template <>
297
+ EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
298
+ return _mm512_abs_ph(a);
299
+ }
300
+
301
+ template <>
302
+ EIGEN_STRONG_INLINE Packet16h pabs<Packet16h>(const Packet16h& a) {
303
+ return _mm256_abs_ph(a);
304
+ }
305
+
306
+ template <>
307
+ EIGEN_STRONG_INLINE Packet8h pabs<Packet8h>(const Packet8h& a) {
308
+ return _mm_abs_ph(a);
309
+ }
310
+
311
+ // psignbit
312
+
313
+ template <>
314
+ EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
315
+ return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
316
+ }
317
+
318
+ template <>
319
+ EIGEN_STRONG_INLINE Packet16h psignbit<Packet16h>(const Packet16h& a) {
320
+ return _mm256_castsi256_ph(_mm256_srai_epi16(_mm256_castph_si256(a), 15));
321
+ }
322
+
323
+ template <>
324
+ EIGEN_STRONG_INLINE Packet8h psignbit<Packet8h>(const Packet8h& a) {
325
+ return _mm_castsi128_ph(_mm_srai_epi16(_mm_castph_si128(a), 15));
326
+ }
327
+
328
+ // pmin
329
+
330
+ template <>
331
+ EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
332
+ return _mm512_min_ph(a, b);
333
+ }
334
+
335
+ template <>
336
+ EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) {
337
+ return _mm256_min_ph(a, b);
338
+ }
339
+
340
+ template <>
341
+ EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a, const Packet8h& b) {
342
+ return _mm_min_ph(a, b);
343
+ }
344
+
345
+ // pmax
346
+
347
+ template <>
348
+ EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
349
+ return _mm512_max_ph(a, b);
350
+ }
351
+
352
+ template <>
353
+ EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) {
354
+ return _mm256_max_ph(a, b);
355
+ }
356
+
357
+ template <>
358
+ EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a, const Packet8h& b) {
359
+ return _mm_max_ph(a, b);
360
+ }
361
+
362
+ // plset
363
+ template <>
364
+ EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
365
+ return _mm512_add_ph(pset1<Packet32h>(a), _mm512_set_ph(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
366
+ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
367
+ }
368
+
369
+ template <>
370
+ EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
371
+ return _mm256_add_ph(pset1<Packet16h>(a), _mm256_set_ph(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
372
+ }
373
+
374
+ template <>
375
+ EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
376
+ return _mm_add_ph(pset1<Packet8h>(a), _mm_set_ph(7, 6, 5, 4, 3, 2, 1, 0));
377
+ }
378
+
379
+ // por
380
+
381
+ template <>
382
+ EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
383
+ return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
384
+ }
385
+
386
+ template <>
387
+ EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) {
388
+ return _mm256_castsi256_ph(_mm256_or_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
389
+ }
390
+
391
+ template <>
392
+ EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
393
+ return _mm_castsi128_ph(_mm_or_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
394
+ }
395
+
396
+ // pxor
397
+
398
+ template <>
399
+ EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
400
+ return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
401
+ }
402
+
403
+ template <>
404
+ EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) {
405
+ return _mm256_castsi256_ph(_mm256_xor_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
406
+ }
407
+
408
+ template <>
409
+ EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
410
+ return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
411
+ }
412
+
413
+ // pand
414
+
415
+ template <>
416
+ EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
417
+ return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
418
+ }
419
+
420
+ template <>
421
+ EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) {
422
+ return _mm256_castsi256_ph(_mm256_and_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
423
+ }
424
+
425
+ template <>
426
+ EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
427
+ return _mm_castsi128_ph(_mm_and_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
428
+ }
429
+
430
+ // pandnot
431
+
432
+ template <>
433
+ EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
434
+ return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
435
+ }
436
+
437
+ template <>
438
+ EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) {
439
+ return _mm256_castsi256_ph(_mm256_andnot_si256(_mm256_castph_si256(b), _mm256_castph_si256(a)));
440
+ }
441
+
442
+ template <>
443
+ EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
444
+ return _mm_castsi128_ph(_mm_andnot_si128(_mm_castph_si128(b), _mm_castph_si128(a)));
445
+ }
446
+
447
+ // pselect
448
+
449
+ template <>
450
+ EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
451
+ __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
452
+ return _mm512_mask_blend_ph(mask32, a, b);
453
+ }
454
+
455
+ template <>
456
+ EIGEN_DEVICE_FUNC inline Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
457
+ __mmask16 mask16 = _mm256_cmp_epi16_mask(_mm256_castph_si256(mask), _mm256_setzero_si256(), _MM_CMPINT_EQ);
458
+ return _mm256_mask_blend_ph(mask16, a, b);
459
+ }
460
+
461
+ template <>
462
+ EIGEN_DEVICE_FUNC inline Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
463
+ __mmask8 mask8 = _mm_cmp_epi16_mask(_mm_castph_si128(mask), _mm_setzero_si128(), _MM_CMPINT_EQ);
464
+ return _mm_mask_blend_ph(mask8, a, b);
465
+ }
466
+
467
+ // pcmp_eq
468
+
469
+ template <>
470
+ EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
471
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
472
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
473
+ }
474
+
475
+ template <>
476
+ EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
477
+ __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_EQ_OQ);
478
+ return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
479
+ }
480
+
481
+ template <>
482
+ EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
483
+ __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_EQ_OQ);
484
+ return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
485
+ }
486
+
487
+ // pcmp_le
488
+
489
+ template <>
490
+ EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
491
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
492
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
493
+ }
494
+
495
+ template <>
496
+ EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) {
497
+ __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LE_OQ);
498
+ return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
499
+ }
500
+
501
+ template <>
502
+ EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
503
+ __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LE_OQ);
504
+ return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
505
+ }
506
+
507
+ // pcmp_lt
508
+
509
+ template <>
510
+ EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
511
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
512
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
513
+ }
514
+
515
+ template <>
516
+ EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) {
517
+ __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LT_OQ);
518
+ return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
519
+ }
520
+
521
+ template <>
522
+ EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
523
+ __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LT_OQ);
524
+ return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
525
+ }
526
+
527
+ // pcmp_lt_or_nan
528
+
529
+ template <>
530
+ EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
531
+ __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
532
+ return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, static_cast<short>(0xffffu)));
533
+ }
534
+
535
+ template <>
536
+ EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) {
537
+ __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_NGE_UQ);
538
+ return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
539
+ }
540
+
541
+ template <>
542
+ EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
543
+ __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_NGE_UQ);
544
+ return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
545
+ }
546
+
547
+ // padd
548
+
549
+ template <>
550
+ EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
551
+ return _mm512_add_ph(a, b);
552
+ }
553
+
554
+ template <>
555
+ EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
556
+ return _mm256_add_ph(a, b);
557
+ }
558
+
559
+ template <>
560
+ EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
561
+ return _mm_add_ph(a, b);
562
+ }
563
+
564
+ // psub
565
+
566
+ template <>
567
+ EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
568
+ return _mm512_sub_ph(a, b);
569
+ }
570
+
571
+ template <>
572
+ EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
573
+ return _mm256_sub_ph(a, b);
574
+ }
575
+
576
+ template <>
577
+ EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
578
+ return _mm_sub_ph(a, b);
579
+ }
580
+
581
+ // pmul
582
+
583
+ template <>
584
+ EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
585
+ return _mm512_mul_ph(a, b);
586
+ }
587
+
588
+ template <>
589
+ EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
590
+ return _mm256_mul_ph(a, b);
591
+ }
592
+
593
+ template <>
594
+ EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
595
+ return _mm_mul_ph(a, b);
596
+ }
597
+
598
+ // pdiv
599
+
600
+ template <>
601
+ EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
602
+ return _mm512_div_ph(a, b);
603
+ }
604
+
605
+ template <>
606
+ EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
607
+ return _mm256_div_ph(a, b);
608
+ }
609
+
610
+ template <>
611
+ EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
612
+ return _mm_div_ph(a, b);
613
+ ;
614
+ }
615
+
616
+ // pround
617
+
618
+ template <>
619
+ EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
620
+ // Work-around for default std::round rounding mode.
621
+
622
+ // Mask for the sign bit.
623
+ const Packet32h signMask =
624
+ pset1frombits<Packet32h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
625
+ // The largest half-precision float less than 0.5.
626
+ const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
627
+
628
+ return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
629
+ }
630
+
631
+ template <>
632
+ EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
633
+ // Work-around for default std::round rounding mode.
634
+
635
+ // Mask for the sign bit.
636
+ const Packet16h signMask =
637
+ pset1frombits<Packet16h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
638
+ // The largest half-precision float less than 0.5.
639
+ const Packet16h prev0dot5 = pset1frombits<Packet16h>(static_cast<numext::uint16_t>(0x37FFu));
640
+
641
+ return _mm256_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
642
+ }
643
+
644
+ template <>
645
+ EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
646
+ // Work-around for default std::round rounding mode.
647
+
648
+ // Mask for the sign bit.
649
+ const Packet8h signMask = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
650
+ // The largest half-precision float less than 0.5.
651
+ const Packet8h prev0dot5 = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(0x37FFu));
652
+
653
+ return _mm_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
654
+ }
655
+
656
+ // print
657
+
658
+ template <>
659
+ EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
660
+ return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
661
+ }
662
+
663
+ template <>
664
+ EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
665
+ return _mm256_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
666
+ }
667
+
668
+ template <>
669
+ EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
670
+ return _mm_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
671
+ }
672
+
673
+ // pceil
674
+
675
+ template <>
676
+ EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
677
+ return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
678
+ }
679
+
680
+ template <>
681
+ EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
682
+ return _mm256_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
683
+ }
684
+
685
+ template <>
686
+ EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
687
+ return _mm_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
688
+ }
689
+
690
+ // pfloor
691
+
692
+ template <>
693
+ EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
694
+ return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
695
+ }
696
+
697
+ template <>
698
+ EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
699
+ return _mm256_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
700
+ }
701
+
702
+ template <>
703
+ EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
704
+ return _mm_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
705
+ }
706
+
707
+ // ptrunc
708
+
709
+ template <>
710
+ EIGEN_STRONG_INLINE Packet32h ptrunc<Packet32h>(const Packet32h& a) {
711
+ return _mm512_roundscale_ph(a, _MM_FROUND_TO_ZERO);
712
+ }
713
+
714
+ template <>
715
+ EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(const Packet16h& a) {
716
+ return _mm256_roundscale_ph(a, _MM_FROUND_TO_ZERO);
717
+ }
718
+
719
+ template <>
720
+ EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) {
721
+ return _mm_roundscale_ph(a, _MM_FROUND_TO_ZERO);
722
+ }
723
+
724
+ // predux
725
+ template <>
726
+ EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
727
+ return half(_mm512_reduce_add_ph(a));
728
+ }
729
+
730
+ template <>
731
+ EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
732
+ return half(_mm256_reduce_add_ph(a));
733
+ }
734
+
735
+ template <>
736
+ EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
737
+ return half(_mm_reduce_add_ph(a));
738
+ }
739
+
740
+ // predux_half_dowto4
741
+ template <>
742
+ EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
743
+ const __m512i bits = _mm512_castph_si512(a);
744
+ Packet16h lo = _mm256_castsi256_ph(_mm512_castsi512_si256(bits));
745
+ Packet16h hi = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(bits, 1));
746
+ return padd(lo, hi);
747
+ }
748
+
749
+ template <>
750
+ EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
751
+ Packet8h lo = _mm_castsi128_ph(_mm256_castsi256_si128(_mm256_castph_si256(a)));
752
+ Packet8h hi = _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(a), 1));
753
+ return padd(lo, hi);
754
+ }
755
+
756
+ // predux_max
757
+
758
+ template <>
759
+ EIGEN_STRONG_INLINE half predux_max<Packet32h>(const Packet32h& a) {
760
+ return half(_mm512_reduce_max_ph(a));
761
+ }
762
+
763
+ template <>
764
+ EIGEN_STRONG_INLINE half predux_max<Packet16h>(const Packet16h& a) {
765
+ return half(_mm256_reduce_max_ph(a));
766
+ }
767
+
768
+ template <>
769
+ EIGEN_STRONG_INLINE half predux_max<Packet8h>(const Packet8h& a) {
770
+ return half(_mm_reduce_max_ph(a));
771
+ }
772
+
773
+ // predux_min
774
+
775
+ template <>
776
+ EIGEN_STRONG_INLINE half predux_min<Packet32h>(const Packet32h& a) {
777
+ return half(_mm512_reduce_min_ph(a));
778
+ }
779
+
780
+ template <>
781
+ EIGEN_STRONG_INLINE half predux_min<Packet16h>(const Packet16h& a) {
782
+ return half(_mm256_reduce_min_ph(a));
783
+ }
784
+
785
+ template <>
786
+ EIGEN_STRONG_INLINE half predux_min<Packet8h>(const Packet8h& a) {
787
+ return half(_mm_reduce_min_ph(a));
788
+ }
789
+
790
+ // predux_mul
791
+
792
+ template <>
793
+ EIGEN_STRONG_INLINE half predux_mul<Packet32h>(const Packet32h& a) {
794
+ return half(_mm512_reduce_mul_ph(a));
795
+ }
796
+
797
+ template <>
798
+ EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& a) {
799
+ return half(_mm256_reduce_mul_ph(a));
800
+ }
801
+
802
+ template <>
803
+ EIGEN_STRONG_INLINE half predux_mul<Packet8h>(const Packet8h& a) {
804
+ return half(_mm_reduce_mul_ph(a));
805
+ }
806
+
807
+ #ifdef EIGEN_VECTORIZE_FMA
808
+
809
+ // pmadd
810
+
811
+ template <>
812
+ EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
813
+ return _mm512_fmadd_ph(a, b, c);
814
+ }
815
+
816
+ template <>
817
+ EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
818
+ return _mm256_fmadd_ph(a, b, c);
819
+ }
820
+
821
+ template <>
822
+ EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
823
+ return _mm_fmadd_ph(a, b, c);
824
+ }
825
+
826
+ // pmsub
827
+
828
+ template <>
829
+ EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
830
+ return _mm512_fmsub_ph(a, b, c);
831
+ }
832
+
833
+ template <>
834
+ EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
835
+ return _mm256_fmsub_ph(a, b, c);
836
+ }
837
+
838
+ template <>
839
+ EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
840
+ return _mm_fmsub_ph(a, b, c);
841
+ }
842
+
843
+ // pnmadd
844
+
845
+ template <>
846
+ EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
847
+ return _mm512_fnmadd_ph(a, b, c);
848
+ }
849
+
850
+ template <>
851
+ EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
852
+ return _mm256_fnmadd_ph(a, b, c);
853
+ }
854
+
855
+ template <>
856
+ EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
857
+ return _mm_fnmadd_ph(a, b, c);
858
+ }
859
+
860
+ // pnmsub
861
+
862
+ template <>
863
+ EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
864
+ return _mm512_fnmsub_ph(a, b, c);
865
+ }
866
+
867
+ template <>
868
+ EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
869
+ return _mm256_fnmsub_ph(a, b, c);
870
+ }
871
+
872
+ template <>
873
+ EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
874
+ return _mm_fnmsub_ph(a, b, c);
875
+ }
876
+
877
+ #endif
878
+
879
+ // pnegate
880
+
881
+ template <>
882
+ EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
883
+ return _mm512_castsi512_ph(
884
+ _mm512_xor_si512(_mm512_castph_si512(a), _mm512_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
885
+ }
886
+
887
+ template <>
888
+ EIGEN_STRONG_INLINE Packet16h pnegate<Packet16h>(const Packet16h& a) {
889
+ return _mm256_castsi256_ph(
890
+ _mm256_xor_si256(_mm256_castph_si256(a), _mm256_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
891
+ }
892
+
893
+ template <>
894
+ EIGEN_STRONG_INLINE Packet8h pnegate<Packet8h>(const Packet8h& a) {
895
+ return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
896
+ }
897
+
898
+ // pconj
899
+
900
+ // Nothing, packets are real.
901
+
902
+ // psqrt
903
+
904
+ template <>
905
+ EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
906
+ return generic_sqrt_newton_step<Packet32h>::run(a, _mm512_rsqrt_ph(a));
907
+ }
908
+
909
+ template <>
910
+ EIGEN_STRONG_INLINE Packet16h psqrt<Packet16h>(const Packet16h& a) {
911
+ return generic_sqrt_newton_step<Packet16h>::run(a, _mm256_rsqrt_ph(a));
912
+ }
913
+
914
+ template <>
915
+ EIGEN_STRONG_INLINE Packet8h psqrt<Packet8h>(const Packet8h& a) {
916
+ return generic_sqrt_newton_step<Packet8h>::run(a, _mm_rsqrt_ph(a));
917
+ }
918
+
919
+ // prsqrt
920
+
921
+ template <>
922
+ EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
923
+ return generic_rsqrt_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rsqrt_ph(a));
924
+ }
925
+
926
+ template <>
927
+ EIGEN_STRONG_INLINE Packet16h prsqrt<Packet16h>(const Packet16h& a) {
928
+ return generic_rsqrt_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rsqrt_ph(a));
929
+ }
930
+
931
+ template <>
932
+ EIGEN_STRONG_INLINE Packet8h prsqrt<Packet8h>(const Packet8h& a) {
933
+ return generic_rsqrt_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rsqrt_ph(a));
934
+ }
935
+
936
+ // preciprocal
937
+
938
+ template <>
939
+ EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
940
+ return generic_reciprocal_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rcp_ph(a));
941
+ }
942
+
943
+ template <>
944
+ EIGEN_STRONG_INLINE Packet16h preciprocal<Packet16h>(const Packet16h& a) {
945
+ return generic_reciprocal_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rcp_ph(a));
946
+ }
947
+
948
+ template <>
949
+ EIGEN_STRONG_INLINE Packet8h preciprocal<Packet8h>(const Packet8h& a) {
950
+ return generic_reciprocal_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rcp_ph(a));
951
+ }
952
+
953
+ // ptranspose
954
+
955
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
956
+ __m512i t[32];
957
+
958
+ EIGEN_UNROLL_LOOP
959
+ for (int i = 0; i < 16; i++) {
960
+ t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
961
+ t[2 * i + 1] =
962
+ _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
963
+ }
964
+
965
+ __m512i p[32];
966
+
967
+ EIGEN_UNROLL_LOOP
968
+ for (int i = 0; i < 8; i++) {
969
+ p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
970
+ p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
971
+ p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
972
+ p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
973
+ }
974
+
975
+ __m512i q[32];
976
+
977
+ EIGEN_UNROLL_LOOP
978
+ for (int i = 0; i < 4; i++) {
979
+ q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
980
+ q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
981
+ q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
982
+ q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
983
+ q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
984
+ q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
985
+ q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
986
+ q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
987
+ }
988
+
989
+ __m512i f[32];
990
+
991
+ #define PACKET32H_TRANSPOSE_HELPER(X, Y) \
992
+ do { \
993
+ f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X); \
994
+ f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
995
+ f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
996
+ f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
997
+ f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
998
+ f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
999
+ f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
1000
+ f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
1001
+ } while (false);
1002
+
1003
+ PACKET32H_TRANSPOSE_HELPER(0, 0);
1004
+ PACKET32H_TRANSPOSE_HELPER(1, 1);
1005
+ PACKET32H_TRANSPOSE_HELPER(2, 2);
1006
+ PACKET32H_TRANSPOSE_HELPER(3, 3);
1007
+
1008
+ PACKET32H_TRANSPOSE_HELPER(1, 0);
1009
+ PACKET32H_TRANSPOSE_HELPER(2, 0);
1010
+ PACKET32H_TRANSPOSE_HELPER(3, 0);
1011
+ PACKET32H_TRANSPOSE_HELPER(2, 1);
1012
+ PACKET32H_TRANSPOSE_HELPER(3, 1);
1013
+ PACKET32H_TRANSPOSE_HELPER(3, 2);
1014
+
1015
+ PACKET32H_TRANSPOSE_HELPER(0, 1);
1016
+ PACKET32H_TRANSPOSE_HELPER(0, 2);
1017
+ PACKET32H_TRANSPOSE_HELPER(0, 3);
1018
+ PACKET32H_TRANSPOSE_HELPER(1, 2);
1019
+ PACKET32H_TRANSPOSE_HELPER(1, 3);
1020
+ PACKET32H_TRANSPOSE_HELPER(2, 3);
1021
+
1022
+ #undef PACKET32H_TRANSPOSE_HELPER
1023
+
1024
+ EIGEN_UNROLL_LOOP
1025
+ for (int i = 0; i < 32; i++) {
1026
+ a.packet[i] = _mm512_castsi512_ph(f[i]);
1027
+ }
1028
+ }
1029
+
1030
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
1031
+ __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
1032
+ t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
1033
+ t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
1034
+ t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
1035
+ t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
1036
+
1037
+ p0 = _mm512_unpacklo_epi32(t0, t2);
1038
+ p1 = _mm512_unpackhi_epi32(t0, t2);
1039
+ p2 = _mm512_unpacklo_epi32(t1, t3);
1040
+ p3 = _mm512_unpackhi_epi32(t1, t3);
1041
+
1042
+ a0 = p0;
1043
+ a1 = p1;
1044
+ a2 = p2;
1045
+ a3 = p3;
1046
+
1047
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
1048
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
1049
+
1050
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
1051
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
1052
+
1053
+ a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
1054
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
1055
+
1056
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
1057
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
1058
+
1059
+ a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
1060
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
1061
+
1062
+ a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
1063
+ a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
1064
+
1065
+ a.packet[0] = _mm512_castsi512_ph(a0);
1066
+ a.packet[1] = _mm512_castsi512_ph(a1);
1067
+ a.packet[2] = _mm512_castsi512_ph(a2);
1068
+ a.packet[3] = _mm512_castsi512_ph(a3);
1069
+ }
1070
+
1071
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
1072
+ __m256i a = _mm256_castph_si256(kernel.packet[0]);
1073
+ __m256i b = _mm256_castph_si256(kernel.packet[1]);
1074
+ __m256i c = _mm256_castph_si256(kernel.packet[2]);
1075
+ __m256i d = _mm256_castph_si256(kernel.packet[3]);
1076
+ __m256i e = _mm256_castph_si256(kernel.packet[4]);
1077
+ __m256i f = _mm256_castph_si256(kernel.packet[5]);
1078
+ __m256i g = _mm256_castph_si256(kernel.packet[6]);
1079
+ __m256i h = _mm256_castph_si256(kernel.packet[7]);
1080
+ __m256i i = _mm256_castph_si256(kernel.packet[8]);
1081
+ __m256i j = _mm256_castph_si256(kernel.packet[9]);
1082
+ __m256i k = _mm256_castph_si256(kernel.packet[10]);
1083
+ __m256i l = _mm256_castph_si256(kernel.packet[11]);
1084
+ __m256i m = _mm256_castph_si256(kernel.packet[12]);
1085
+ __m256i n = _mm256_castph_si256(kernel.packet[13]);
1086
+ __m256i o = _mm256_castph_si256(kernel.packet[14]);
1087
+ __m256i p = _mm256_castph_si256(kernel.packet[15]);
1088
+
1089
+ __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
1090
+ __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
1091
+ __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
1092
+ __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
1093
+ __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
1094
+ __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
1095
+ __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
1096
+ __m256i op_07 = _mm256_unpacklo_epi16(o, p);
1097
+
1098
+ __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
1099
+ __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
1100
+ __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
1101
+ __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
1102
+ __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
1103
+ __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
1104
+ __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
1105
+ __m256i op_8f = _mm256_unpackhi_epi16(o, p);
1106
+
1107
+ __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
1108
+ __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
1109
+ __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
1110
+ __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
1111
+ __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
1112
+ __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
1113
+ __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
1114
+ __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
1115
+
1116
+ __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
1117
+ __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
1118
+ __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
1119
+ __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
1120
+ __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
1121
+ __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
1122
+ __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
1123
+ __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
1124
+
1125
+ __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
1126
+ __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
1127
+ __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
1128
+ __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
1129
+ __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
1130
+ __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
1131
+ __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
1132
+ __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
1133
+ __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
1134
+ __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
1135
+ __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
1136
+ __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
1137
+ __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
1138
+ __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
1139
+ __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
1140
+ __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
1141
+
1142
+ // NOTE: no unpacklo/hi instr in this case, so using permute instr.
1143
+ __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
1144
+ __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
1145
+ __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
1146
+ __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
1147
+ __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
1148
+ __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
1149
+ __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
1150
+ __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
1151
+ __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
1152
+ __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
1153
+ __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
1154
+ __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
1155
+ __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
1156
+ __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
1157
+ __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
1158
+ __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
1159
+
1160
+ kernel.packet[0] = _mm256_castsi256_ph(a_p_0);
1161
+ kernel.packet[1] = _mm256_castsi256_ph(a_p_1);
1162
+ kernel.packet[2] = _mm256_castsi256_ph(a_p_2);
1163
+ kernel.packet[3] = _mm256_castsi256_ph(a_p_3);
1164
+ kernel.packet[4] = _mm256_castsi256_ph(a_p_4);
1165
+ kernel.packet[5] = _mm256_castsi256_ph(a_p_5);
1166
+ kernel.packet[6] = _mm256_castsi256_ph(a_p_6);
1167
+ kernel.packet[7] = _mm256_castsi256_ph(a_p_7);
1168
+ kernel.packet[8] = _mm256_castsi256_ph(a_p_8);
1169
+ kernel.packet[9] = _mm256_castsi256_ph(a_p_9);
1170
+ kernel.packet[10] = _mm256_castsi256_ph(a_p_a);
1171
+ kernel.packet[11] = _mm256_castsi256_ph(a_p_b);
1172
+ kernel.packet[12] = _mm256_castsi256_ph(a_p_c);
1173
+ kernel.packet[13] = _mm256_castsi256_ph(a_p_d);
1174
+ kernel.packet[14] = _mm256_castsi256_ph(a_p_e);
1175
+ kernel.packet[15] = _mm256_castsi256_ph(a_p_f);
1176
+ }
1177
+
1178
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
1179
+ EIGEN_ALIGN64 half in[8][16];
1180
+ pstore<half>(in[0], kernel.packet[0]);
1181
+ pstore<half>(in[1], kernel.packet[1]);
1182
+ pstore<half>(in[2], kernel.packet[2]);
1183
+ pstore<half>(in[3], kernel.packet[3]);
1184
+ pstore<half>(in[4], kernel.packet[4]);
1185
+ pstore<half>(in[5], kernel.packet[5]);
1186
+ pstore<half>(in[6], kernel.packet[6]);
1187
+ pstore<half>(in[7], kernel.packet[7]);
1188
+
1189
+ EIGEN_ALIGN64 half out[8][16];
1190
+
1191
+ for (int i = 0; i < 8; ++i) {
1192
+ for (int j = 0; j < 8; ++j) {
1193
+ out[i][j] = in[j][2 * i];
1194
+ }
1195
+ for (int j = 0; j < 8; ++j) {
1196
+ out[i][j + 8] = in[j][2 * i + 1];
1197
+ }
1198
+ }
1199
+
1200
+ kernel.packet[0] = pload<Packet16h>(out[0]);
1201
+ kernel.packet[1] = pload<Packet16h>(out[1]);
1202
+ kernel.packet[2] = pload<Packet16h>(out[2]);
1203
+ kernel.packet[3] = pload<Packet16h>(out[3]);
1204
+ kernel.packet[4] = pload<Packet16h>(out[4]);
1205
+ kernel.packet[5] = pload<Packet16h>(out[5]);
1206
+ kernel.packet[6] = pload<Packet16h>(out[6]);
1207
+ kernel.packet[7] = pload<Packet16h>(out[7]);
1208
+ }
1209
+
1210
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
1211
+ EIGEN_ALIGN64 half in[4][16];
1212
+ pstore<half>(in[0], kernel.packet[0]);
1213
+ pstore<half>(in[1], kernel.packet[1]);
1214
+ pstore<half>(in[2], kernel.packet[2]);
1215
+ pstore<half>(in[3], kernel.packet[3]);
1216
+
1217
+ EIGEN_ALIGN64 half out[4][16];
1218
+
1219
+ for (int i = 0; i < 4; ++i) {
1220
+ for (int j = 0; j < 4; ++j) {
1221
+ out[i][j] = in[j][4 * i];
1222
+ }
1223
+ for (int j = 0; j < 4; ++j) {
1224
+ out[i][j + 4] = in[j][4 * i + 1];
1225
+ }
1226
+ for (int j = 0; j < 4; ++j) {
1227
+ out[i][j + 8] = in[j][4 * i + 2];
1228
+ }
1229
+ for (int j = 0; j < 4; ++j) {
1230
+ out[i][j + 12] = in[j][4 * i + 3];
1231
+ }
1232
+ }
1233
+
1234
+ kernel.packet[0] = pload<Packet16h>(out[0]);
1235
+ kernel.packet[1] = pload<Packet16h>(out[1]);
1236
+ kernel.packet[2] = pload<Packet16h>(out[2]);
1237
+ kernel.packet[3] = pload<Packet16h>(out[3]);
1238
+ }
1239
+
1240
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) {
1241
+ __m128i a = _mm_castph_si128(kernel.packet[0]);
1242
+ __m128i b = _mm_castph_si128(kernel.packet[1]);
1243
+ __m128i c = _mm_castph_si128(kernel.packet[2]);
1244
+ __m128i d = _mm_castph_si128(kernel.packet[3]);
1245
+ __m128i e = _mm_castph_si128(kernel.packet[4]);
1246
+ __m128i f = _mm_castph_si128(kernel.packet[5]);
1247
+ __m128i g = _mm_castph_si128(kernel.packet[6]);
1248
+ __m128i h = _mm_castph_si128(kernel.packet[7]);
1249
+
1250
+ __m128i a03b03 = _mm_unpacklo_epi16(a, b);
1251
+ __m128i c03d03 = _mm_unpacklo_epi16(c, d);
1252
+ __m128i e03f03 = _mm_unpacklo_epi16(e, f);
1253
+ __m128i g03h03 = _mm_unpacklo_epi16(g, h);
1254
+ __m128i a47b47 = _mm_unpackhi_epi16(a, b);
1255
+ __m128i c47d47 = _mm_unpackhi_epi16(c, d);
1256
+ __m128i e47f47 = _mm_unpackhi_epi16(e, f);
1257
+ __m128i g47h47 = _mm_unpackhi_epi16(g, h);
1258
+
1259
+ __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
1260
+ __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
1261
+ __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
1262
+ __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
1263
+ __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
1264
+ __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
1265
+ __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
1266
+ __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
1267
+
1268
+ __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
1269
+ __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
1270
+ __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
1271
+ __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
1272
+ __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
1273
+ __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
1274
+ __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
1275
+ __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
1276
+
1277
+ kernel.packet[0] = _mm_castsi128_ph(a0b0c0d0e0f0g0h0);
1278
+ kernel.packet[1] = _mm_castsi128_ph(a1b1c1d1e1f1g1h1);
1279
+ kernel.packet[2] = _mm_castsi128_ph(a2b2c2d2e2f2g2h2);
1280
+ kernel.packet[3] = _mm_castsi128_ph(a3b3c3d3e3f3g3h3);
1281
+ kernel.packet[4] = _mm_castsi128_ph(a4b4c4d4e4f4g4h4);
1282
+ kernel.packet[5] = _mm_castsi128_ph(a5b5c5d5e5f5g5h5);
1283
+ kernel.packet[6] = _mm_castsi128_ph(a6b6c6d6e6f6g6h6);
1284
+ kernel.packet[7] = _mm_castsi128_ph(a7b7c7d7e7f7g7h7);
1285
+ }
1286
+
1287
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
1288
+ EIGEN_ALIGN32 Eigen::half in[4][8];
1289
+ pstore<Eigen::half>(in[0], kernel.packet[0]);
1290
+ pstore<Eigen::half>(in[1], kernel.packet[1]);
1291
+ pstore<Eigen::half>(in[2], kernel.packet[2]);
1292
+ pstore<Eigen::half>(in[3], kernel.packet[3]);
1293
+
1294
+ EIGEN_ALIGN32 Eigen::half out[4][8];
1295
+
1296
+ for (int i = 0; i < 4; ++i) {
1297
+ for (int j = 0; j < 4; ++j) {
1298
+ out[i][j] = in[j][2 * i];
1299
+ }
1300
+ for (int j = 0; j < 4; ++j) {
1301
+ out[i][j + 4] = in[j][2 * i + 1];
1302
+ }
1303
+ }
1304
+
1305
+ kernel.packet[0] = pload<Packet8h>(out[0]);
1306
+ kernel.packet[1] = pload<Packet8h>(out[1]);
1307
+ kernel.packet[2] = pload<Packet8h>(out[2]);
1308
+ kernel.packet[3] = pload<Packet8h>(out[3]);
1309
+ }
1310
+
1311
+ // preverse
1312
+
1313
+ template <>
1314
+ EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
1315
+ return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
1316
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
1317
+ a);
1318
+ }
1319
+
1320
+ template <>
1321
+ EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
1322
+ __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
1323
+ return _mm256_castsi256_ph(_mm256_insertf128_si256(
1324
+ _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 1), m)),
1325
+ _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 0), m), 1));
1326
+ }
1327
+
1328
+ template <>
1329
+ EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
1330
+ __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
1331
+ return _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a), m));
1332
+ }
1333
+
1334
+ // pscatter
1335
+
1336
+ template <>
1337
+ EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
1338
+ EIGEN_ALIGN64 half aux[32];
1339
+ pstore(aux, from);
1340
+
1341
+ EIGEN_UNROLL_LOOP
1342
+ for (int i = 0; i < 32; i++) {
1343
+ to[stride * i] = aux[i];
1344
+ }
1345
+ }
1346
+ template <>
1347
+ EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
1348
+ EIGEN_ALIGN64 half aux[16];
1349
+ pstore(aux, from);
1350
+ to[stride * 0] = aux[0];
1351
+ to[stride * 1] = aux[1];
1352
+ to[stride * 2] = aux[2];
1353
+ to[stride * 3] = aux[3];
1354
+ to[stride * 4] = aux[4];
1355
+ to[stride * 5] = aux[5];
1356
+ to[stride * 6] = aux[6];
1357
+ to[stride * 7] = aux[7];
1358
+ to[stride * 8] = aux[8];
1359
+ to[stride * 9] = aux[9];
1360
+ to[stride * 10] = aux[10];
1361
+ to[stride * 11] = aux[11];
1362
+ to[stride * 12] = aux[12];
1363
+ to[stride * 13] = aux[13];
1364
+ to[stride * 14] = aux[14];
1365
+ to[stride * 15] = aux[15];
1366
+ }
1367
+
1368
+ template <>
1369
+ EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
1370
+ EIGEN_ALIGN32 Eigen::half aux[8];
1371
+ pstore(aux, from);
1372
+ to[stride * 0] = aux[0];
1373
+ to[stride * 1] = aux[1];
1374
+ to[stride * 2] = aux[2];
1375
+ to[stride * 3] = aux[3];
1376
+ to[stride * 4] = aux[4];
1377
+ to[stride * 5] = aux[5];
1378
+ to[stride * 6] = aux[6];
1379
+ to[stride * 7] = aux[7];
1380
+ }
1381
+
1382
+ // pgather
1383
+
1384
+ template <>
1385
+ EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
1386
+ return _mm512_set_ph(from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x,
1387
+ from[27 * stride].x, from[26 * stride].x, from[25 * stride].x, from[24 * stride].x,
1388
+ from[23 * stride].x, from[22 * stride].x, from[21 * stride].x, from[20 * stride].x,
1389
+ from[19 * stride].x, from[18 * stride].x, from[17 * stride].x, from[16 * stride].x,
1390
+ from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
1391
+ from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
1392
+ from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
1393
+ from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
1394
+ }
1395
+
1396
+ template <>
1397
+ EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
1398
+ return _mm256_set_ph(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
1399
+ from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
1400
+ from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
1401
+ from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
1402
+ }
1403
+
1404
+ template <>
1405
+ EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
1406
+ return _mm_set_ph(from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x,
1407
+ from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
1408
+ }
1409
+
1410
+ } // end namespace internal
1411
+ } // end namespace Eigen
1412
+
1413
+ #endif // EIGEN_PACKET_MATH_FP16_AVX512_H