@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -21
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +235 -326
  6. package/eigen/Eigen/Eigenvalues +16 -14
  7. package/eigen/Eigen/Geometry +21 -24
  8. package/eigen/Eigen/Householder +9 -8
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -14
  11. package/eigen/Eigen/KLUSupport +43 -0
  12. package/eigen/Eigen/LU +16 -20
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -54
  15. package/eigen/Eigen/PaStiXSupport +23 -20
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -21
  18. package/eigen/Eigen/QtAlignedMalloc +5 -13
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -18
  21. package/eigen/Eigen/Sparse +1 -4
  22. package/eigen/Eigen/SparseCholesky +18 -23
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +12 -8
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
  37. package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
  42. package/eigen/Eigen/src/Core/Array.h +341 -294
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
  48. package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
  49. package/eigen/Eigen/src/Core/Block.h +375 -398
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
  53. package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
  59. package/eigen/Eigen/src/Core/DenseBase.h +632 -571
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
  61. package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +169 -210
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +172 -222
  67. package/eigen/Eigen/src/Core/EigenBase.h +75 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
  75. package/eigen/Eigen/src/Core/IO.h +147 -139
  76. package/eigen/Eigen/src/Core/IndexedView.h +321 -0
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +56 -66
  80. package/eigen/Eigen/src/Core/Map.h +124 -142
  81. package/eigen/Eigen/src/Core/MapBase.h +256 -281
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
  84. package/eigen/Eigen/src/Core/Matrix.h +491 -416
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
  86. package/eigen/Eigen/src/Core/NestByValue.h +66 -85
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -85
  88. package/eigen/Eigen/src/Core/NumTraits.h +235 -148
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
  92. package/eigen/Eigen/src/Core/Product.h +260 -139
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
  94. package/eigen/Eigen/src/Core/Random.h +161 -136
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +366 -336
  98. package/eigen/Eigen/src/Core/Ref.h +308 -209
  99. package/eigen/Eigen/src/Core/Replicate.h +94 -106
  100. package/eigen/Eigen/src/Core/Reshaped.h +398 -0
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +136 -145
  103. package/eigen/Eigen/src/Core/Select.h +70 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +97 -111
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
  109. package/eigen/Eigen/src/Core/SolverBase.h +138 -101
  110. package/eigen/Eigen/src/Core/StableNorm.h +156 -160
  111. package/eigen/Eigen/src/Core/StlIterators.h +619 -0
  112. package/eigen/Eigen/src/Core/Stride.h +91 -88
  113. package/eigen/Eigen/src/Core/Swap.h +70 -38
  114. package/eigen/Eigen/src/Core/Transpose.h +295 -273
  115. package/eigen/Eigen/src/Core/Transpositions.h +272 -317
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
  119. package/eigen/Eigen/src/Core/Visitor.h +480 -216
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
  217. package/eigen/Eigen/src/Core/util/Constants.h +314 -263
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
  226. package/eigen/Eigen/src/Core/util/Macros.h +939 -646
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
  229. package/eigen/Eigen/src/Core/util/Meta.h +618 -426
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
  264. package/eigen/Eigen/src/Geometry/Transform.h +896 -953
  265. package/eigen/Eigen/src/Geometry/Translation.h +100 -98
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
  269. package/eigen/Eigen/src/Householder/Householder.h +104 -122
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
  285. package/eigen/Eigen/src/LU/Determinant.h +60 -63
  286. package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/lib/LibEigen.d.ts +4 -0
  409. package/lib/LibEigen.js +14 -0
  410. package/lib/index.d.ts +1 -1
  411. package/lib/index.js +7 -3
  412. package/package.json +2 -10
  413. package/eigen/Eigen/CMakeLists.txt +0 -19
  414. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
  415. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
  416. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  417. package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
  418. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  419. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  420. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  421. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  422. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  423. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  424. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  425. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  426. package/eigen/Eigen/src/misc/lapack.h +0 -152
  427. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
  428. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
  429. package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
  430. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  431. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
  432. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  433. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
  434. package/lib/eigen.d.ts +0 -2
  435. package/lib/eigen.js +0 -15
@@ -1,1124 +0,0 @@
1
- // This file is part of Eigen, a lightweight C++ template library
2
- // for linear algebra.
3
- //
4
- // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
- //
6
- // This Source Code Form is subject to the terms of the Mozilla
7
- // Public License v. 2.0. If a copy of the MPL was not distributed
8
- // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
-
10
- #ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
11
- #define EIGEN_PACKET_MATH_HALF_CUDA_H
12
-
13
-
14
- namespace Eigen {
15
- namespace internal {
16
-
17
- // Most of the following operations require arch >= 3.0
18
- #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
19
-
20
- template<> struct is_arithmetic<half2> { enum { value = true }; };
21
-
22
- template<> struct packet_traits<Eigen::half> : default_packet_traits
23
- {
24
- typedef half2 type;
25
- typedef half2 half;
26
- enum {
27
- Vectorizable = 1,
28
- AlignedOnScalar = 1,
29
- size=2,
30
- HasHalfPacket = 0,
31
- HasAdd = 1,
32
- HasMul = 1,
33
- HasDiv = 1,
34
- HasSqrt = 1,
35
- HasRsqrt = 1,
36
- HasExp = 1,
37
- HasLog = 1,
38
- HasLog1p = 1
39
- };
40
- };
41
-
42
- template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
43
-
44
- template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
45
- return __half2half2(from);
46
- }
47
-
48
- template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
49
- return *reinterpret_cast<const half2*>(from);
50
- }
51
-
52
- template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
53
- return __halves2half2(from[0], from[1]);
54
- }
55
-
56
- template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
57
- return __halves2half2(from[0], from[0]);
58
- }
59
-
60
- template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
61
- *reinterpret_cast<half2*>(to) = from;
62
- }
63
-
64
- template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
65
- to[0] = __low2half(from);
66
- to[1] = __high2half(from);
67
- }
68
-
69
- template<>
70
- __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
71
- #if __CUDA_ARCH__ >= 350
72
- return __ldg((const half2*)from);
73
- #else
74
- return __halves2half2(*(from+0), *(from+1));
75
- #endif
76
- }
77
-
78
- template<>
79
- __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
80
- #if __CUDA_ARCH__ >= 350
81
- return __halves2half2(__ldg(from+0), __ldg(from+1));
82
- #else
83
- return __halves2half2(*(from+0), *(from+1));
84
- #endif
85
- }
86
-
87
- template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
88
- return __halves2half2(from[0*stride], from[1*stride]);
89
- }
90
-
91
- template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
92
- to[stride*0] = __low2half(from);
93
- to[stride*1] = __high2half(from);
94
- }
95
-
96
- template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
97
- return __low2half(a);
98
- }
99
-
100
- template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
101
- half2 result;
102
- unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
103
- *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
104
- return result;
105
- }
106
-
107
-
108
- __device__ EIGEN_STRONG_INLINE void
109
- ptranspose(PacketBlock<half2,2>& kernel) {
110
- __half a1 = __low2half(kernel.packet[0]);
111
- __half a2 = __high2half(kernel.packet[0]);
112
- __half b1 = __low2half(kernel.packet[1]);
113
- __half b2 = __high2half(kernel.packet[1]);
114
- kernel.packet[0] = __halves2half2(a1, b1);
115
- kernel.packet[1] = __halves2half2(a2, b2);
116
- }
117
-
118
- template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
119
- #if __CUDA_ARCH__ >= 530
120
- return __halves2half2(a, __hadd(a, __float2half(1.0f)));
121
- #else
122
- float f = __half2float(a) + 1.0f;
123
- return __halves2half2(a, __float2half(f));
124
- #endif
125
- }
126
-
127
- template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
128
- #if __CUDA_ARCH__ >= 530
129
- return __hadd2(a, b);
130
- #else
131
- float a1 = __low2float(a);
132
- float a2 = __high2float(a);
133
- float b1 = __low2float(b);
134
- float b2 = __high2float(b);
135
- float r1 = a1 + b1;
136
- float r2 = a2 + b2;
137
- return __floats2half2_rn(r1, r2);
138
- #endif
139
- }
140
-
141
- template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
142
- #if __CUDA_ARCH__ >= 530
143
- return __hsub2(a, b);
144
- #else
145
- float a1 = __low2float(a);
146
- float a2 = __high2float(a);
147
- float b1 = __low2float(b);
148
- float b2 = __high2float(b);
149
- float r1 = a1 - b1;
150
- float r2 = a2 - b2;
151
- return __floats2half2_rn(r1, r2);
152
- #endif
153
- }
154
-
155
- template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
156
- #if __CUDA_ARCH__ >= 530
157
- return __hneg2(a);
158
- #else
159
- float a1 = __low2float(a);
160
- float a2 = __high2float(a);
161
- return __floats2half2_rn(-a1, -a2);
162
- #endif
163
- }
164
-
165
- template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
166
-
167
- template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
168
- #if __CUDA_ARCH__ >= 530
169
- return __hmul2(a, b);
170
- #else
171
- float a1 = __low2float(a);
172
- float a2 = __high2float(a);
173
- float b1 = __low2float(b);
174
- float b2 = __high2float(b);
175
- float r1 = a1 * b1;
176
- float r2 = a2 * b2;
177
- return __floats2half2_rn(r1, r2);
178
- #endif
179
- }
180
-
181
- template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
182
- #if __CUDA_ARCH__ >= 530
183
- return __hfma2(a, b, c);
184
- #else
185
- float a1 = __low2float(a);
186
- float a2 = __high2float(a);
187
- float b1 = __low2float(b);
188
- float b2 = __high2float(b);
189
- float c1 = __low2float(c);
190
- float c2 = __high2float(c);
191
- float r1 = a1 * b1 + c1;
192
- float r2 = a2 * b2 + c2;
193
- return __floats2half2_rn(r1, r2);
194
- #endif
195
- }
196
-
197
- template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
198
- float a1 = __low2float(a);
199
- float a2 = __high2float(a);
200
- float b1 = __low2float(b);
201
- float b2 = __high2float(b);
202
- float r1 = a1 / b1;
203
- float r2 = a2 / b2;
204
- return __floats2half2_rn(r1, r2);
205
- }
206
-
207
- template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
208
- float a1 = __low2float(a);
209
- float a2 = __high2float(a);
210
- float b1 = __low2float(b);
211
- float b2 = __high2float(b);
212
- __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
213
- __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
214
- return __halves2half2(r1, r2);
215
- }
216
-
217
- template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
218
- float a1 = __low2float(a);
219
- float a2 = __high2float(a);
220
- float b1 = __low2float(b);
221
- float b2 = __high2float(b);
222
- __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
223
- __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
224
- return __halves2half2(r1, r2);
225
- }
226
-
227
- template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
228
- #if __CUDA_ARCH__ >= 530
229
- return __hadd(__low2half(a), __high2half(a));
230
- #else
231
- float a1 = __low2float(a);
232
- float a2 = __high2float(a);
233
- return Eigen::half(__float2half_rn(a1 + a2));
234
- #endif
235
- }
236
-
237
- template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
238
- #if __CUDA_ARCH__ >= 530
239
- __half first = __low2half(a);
240
- __half second = __high2half(a);
241
- return __hgt(first, second) ? first : second;
242
- #else
243
- float a1 = __low2float(a);
244
- float a2 = __high2float(a);
245
- return a1 > a2 ? __low2half(a) : __high2half(a);
246
- #endif
247
- }
248
-
249
- template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
250
- #if __CUDA_ARCH__ >= 530
251
- __half first = __low2half(a);
252
- __half second = __high2half(a);
253
- return __hlt(first, second) ? first : second;
254
- #else
255
- float a1 = __low2float(a);
256
- float a2 = __high2float(a);
257
- return a1 < a2 ? __low2half(a) : __high2half(a);
258
- #endif
259
- }
260
-
261
- template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
262
- #if __CUDA_ARCH__ >= 530
263
- return __hmul(__low2half(a), __high2half(a));
264
- #else
265
- float a1 = __low2float(a);
266
- float a2 = __high2float(a);
267
- return Eigen::half(__float2half_rn(a1 * a2));
268
- #endif
269
- }
270
-
271
- template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
272
- float a1 = __low2float(a);
273
- float a2 = __high2float(a);
274
- float r1 = log1pf(a1);
275
- float r2 = log1pf(a2);
276
- return __floats2half2_rn(r1, r2);
277
- }
278
-
279
- #if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
280
-
281
- template<> __device__ EIGEN_STRONG_INLINE
282
- half2 plog<half2>(const half2& a) {
283
- return h2log(a);
284
- }
285
-
286
- template<> __device__ EIGEN_STRONG_INLINE
287
- half2 pexp<half2>(const half2& a) {
288
- return h2exp(a);
289
- }
290
-
291
- template<> __device__ EIGEN_STRONG_INLINE
292
- half2 psqrt<half2>(const half2& a) {
293
- return h2sqrt(a);
294
- }
295
-
296
- template<> __device__ EIGEN_STRONG_INLINE
297
- half2 prsqrt<half2>(const half2& a) {
298
- return h2rsqrt(a);
299
- }
300
-
301
- #else
302
-
303
- template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
304
- float a1 = __low2float(a);
305
- float a2 = __high2float(a);
306
- float r1 = logf(a1);
307
- float r2 = logf(a2);
308
- return __floats2half2_rn(r1, r2);
309
- }
310
-
311
- template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
312
- float a1 = __low2float(a);
313
- float a2 = __high2float(a);
314
- float r1 = expf(a1);
315
- float r2 = expf(a2);
316
- return __floats2half2_rn(r1, r2);
317
- }
318
-
319
- template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
320
- float a1 = __low2float(a);
321
- float a2 = __high2float(a);
322
- float r1 = sqrtf(a1);
323
- float r2 = sqrtf(a2);
324
- return __floats2half2_rn(r1, r2);
325
- }
326
-
327
- template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
328
- float a1 = __low2float(a);
329
- float a2 = __high2float(a);
330
- float r1 = rsqrtf(a1);
331
- float r2 = rsqrtf(a2);
332
- return __floats2half2_rn(r1, r2);
333
- }
334
-
335
- #endif
336
-
337
- #elif defined EIGEN_VECTORIZE_AVX512
338
-
339
- typedef struct {
340
- __m256i x;
341
- } Packet16h;
342
-
343
-
344
- template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
345
-
346
- template <>
347
- struct packet_traits<half> : default_packet_traits {
348
- typedef Packet16h type;
349
- // There is no half-size packet for Packet16h.
350
- typedef Packet16h half;
351
- enum {
352
- Vectorizable = 1,
353
- AlignedOnScalar = 1,
354
- size = 16,
355
- HasHalfPacket = 0,
356
- HasAdd = 0,
357
- HasSub = 0,
358
- HasMul = 0,
359
- HasNegate = 0,
360
- HasAbs = 0,
361
- HasAbs2 = 0,
362
- HasMin = 0,
363
- HasMax = 0,
364
- HasConj = 0,
365
- HasSetLinear = 0,
366
- HasDiv = 0,
367
- HasSqrt = 0,
368
- HasRsqrt = 0,
369
- HasExp = 0,
370
- HasLog = 0,
371
- HasBlend = 0
372
- };
373
- };
374
-
375
-
376
- template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
377
-
378
- template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
379
- Packet16h result;
380
- result.x = _mm256_set1_epi16(from.x);
381
- return result;
382
- }
383
-
384
- template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
385
- return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0)));
386
- }
387
-
388
- template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
389
- Packet16h result;
390
- result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
391
- return result;
392
- }
393
-
394
- template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
395
- Packet16h result;
396
- result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
397
- return result;
398
- }
399
-
400
- template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
401
- _mm256_store_si256((__m256i*)to, from.x);
402
- }
403
-
404
- template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
405
- _mm256_storeu_si256((__m256i*)to, from.x);
406
- }
407
-
408
- template<> EIGEN_STRONG_INLINE Packet16h
409
- ploadquad(const Eigen::half* from) {
410
- Packet16h result;
411
- unsigned short a = from[0].x;
412
- unsigned short b = from[1].x;
413
- unsigned short c = from[2].x;
414
- unsigned short d = from[3].x;
415
- result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
416
- return result;
417
- }
418
-
419
- EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
420
- #ifdef EIGEN_HAS_FP16_C
421
- return _mm512_cvtph_ps(a.x);
422
- #else
423
- EIGEN_ALIGN64 half aux[16];
424
- pstore(aux, a);
425
- float f0(aux[0]);
426
- float f1(aux[1]);
427
- float f2(aux[2]);
428
- float f3(aux[3]);
429
- float f4(aux[4]);
430
- float f5(aux[5]);
431
- float f6(aux[6]);
432
- float f7(aux[7]);
433
- float f8(aux[8]);
434
- float f9(aux[9]);
435
- float fa(aux[10]);
436
- float fb(aux[11]);
437
- float fc(aux[12]);
438
- float fd(aux[13]);
439
- float fe(aux[14]);
440
- float ff(aux[15]);
441
-
442
- return _mm512_set_ps(
443
- ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
444
- #endif
445
- }
446
-
447
- EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
448
- #ifdef EIGEN_HAS_FP16_C
449
- Packet16h result;
450
- result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
451
- return result;
452
- #else
453
- EIGEN_ALIGN64 float aux[16];
454
- pstore(aux, a);
455
- half h0(aux[0]);
456
- half h1(aux[1]);
457
- half h2(aux[2]);
458
- half h3(aux[3]);
459
- half h4(aux[4]);
460
- half h5(aux[5]);
461
- half h6(aux[6]);
462
- half h7(aux[7]);
463
- half h8(aux[8]);
464
- half h9(aux[9]);
465
- half ha(aux[10]);
466
- half hb(aux[11]);
467
- half hc(aux[12]);
468
- half hd(aux[13]);
469
- half he(aux[14]);
470
- half hf(aux[15]);
471
-
472
- Packet16h result;
473
- result.x = _mm256_set_epi16(
474
- hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
475
- h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
476
- return result;
477
- #endif
478
- }
479
-
480
- template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
481
- Packet16f af = half2float(a);
482
- Packet16f bf = half2float(b);
483
- Packet16f rf = padd(af, bf);
484
- return float2half(rf);
485
- }
486
-
487
- template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
488
- Packet16f af = half2float(a);
489
- Packet16f bf = half2float(b);
490
- Packet16f rf = pmul(af, bf);
491
- return float2half(rf);
492
- }
493
-
494
- template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
495
- Packet16f from_float = half2float(from);
496
- return half(predux(from_float));
497
- }
498
-
499
- template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
500
- {
501
- Packet16h result;
502
- result.x = _mm256_set_epi16(
503
- from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
504
- from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
505
- from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
506
- from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
507
- return result;
508
- }
509
-
510
- template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
511
- {
512
- EIGEN_ALIGN64 half aux[16];
513
- pstore(aux, from);
514
- to[stride*0].x = aux[0].x;
515
- to[stride*1].x = aux[1].x;
516
- to[stride*2].x = aux[2].x;
517
- to[stride*3].x = aux[3].x;
518
- to[stride*4].x = aux[4].x;
519
- to[stride*5].x = aux[5].x;
520
- to[stride*6].x = aux[6].x;
521
- to[stride*7].x = aux[7].x;
522
- to[stride*8].x = aux[8].x;
523
- to[stride*9].x = aux[9].x;
524
- to[stride*10].x = aux[10].x;
525
- to[stride*11].x = aux[11].x;
526
- to[stride*12].x = aux[12].x;
527
- to[stride*13].x = aux[13].x;
528
- to[stride*14].x = aux[14].x;
529
- to[stride*15].x = aux[15].x;
530
- }
531
-
532
- EIGEN_STRONG_INLINE void
533
- ptranspose(PacketBlock<Packet16h,16>& kernel) {
534
- __m256i a = kernel.packet[0].x;
535
- __m256i b = kernel.packet[1].x;
536
- __m256i c = kernel.packet[2].x;
537
- __m256i d = kernel.packet[3].x;
538
- __m256i e = kernel.packet[4].x;
539
- __m256i f = kernel.packet[5].x;
540
- __m256i g = kernel.packet[6].x;
541
- __m256i h = kernel.packet[7].x;
542
- __m256i i = kernel.packet[8].x;
543
- __m256i j = kernel.packet[9].x;
544
- __m256i k = kernel.packet[10].x;
545
- __m256i l = kernel.packet[11].x;
546
- __m256i m = kernel.packet[12].x;
547
- __m256i n = kernel.packet[13].x;
548
- __m256i o = kernel.packet[14].x;
549
- __m256i p = kernel.packet[15].x;
550
-
551
- __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
552
- __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
553
- __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
554
- __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
555
- __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
556
- __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
557
- __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
558
- __m256i op_07 = _mm256_unpacklo_epi16(o, p);
559
-
560
- __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
561
- __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
562
- __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
563
- __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
564
- __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
565
- __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
566
- __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
567
- __m256i op_8f = _mm256_unpackhi_epi16(o, p);
568
-
569
- __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
570
- __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
571
- __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
572
- __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
573
- __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
574
- __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
575
- __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
576
- __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
577
-
578
- __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
579
- __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
580
- __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
581
- __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
582
- __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
583
- __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
584
- __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
585
- __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
586
-
587
- __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
588
- __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
589
- __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
590
- __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
591
- __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
592
- __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
593
- __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
594
- __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
595
- __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
596
- __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
597
- __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
598
- __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
599
- __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
600
- __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
601
- __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
602
- __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
603
-
604
- // NOTE: no unpacklo/hi instr in this case, so using permute instr.
605
- __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
606
- __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
607
- __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
608
- __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
609
- __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
610
- __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
611
- __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
612
- __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
613
- __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
614
- __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
615
- __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
616
- __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
617
- __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
618
- __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
619
- __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
620
- __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
621
-
622
- kernel.packet[0].x = a_p_0;
623
- kernel.packet[1].x = a_p_1;
624
- kernel.packet[2].x = a_p_2;
625
- kernel.packet[3].x = a_p_3;
626
- kernel.packet[4].x = a_p_4;
627
- kernel.packet[5].x = a_p_5;
628
- kernel.packet[6].x = a_p_6;
629
- kernel.packet[7].x = a_p_7;
630
- kernel.packet[8].x = a_p_8;
631
- kernel.packet[9].x = a_p_9;
632
- kernel.packet[10].x = a_p_a;
633
- kernel.packet[11].x = a_p_b;
634
- kernel.packet[12].x = a_p_c;
635
- kernel.packet[13].x = a_p_d;
636
- kernel.packet[14].x = a_p_e;
637
- kernel.packet[15].x = a_p_f;
638
- }
639
-
640
- EIGEN_STRONG_INLINE void
641
- ptranspose(PacketBlock<Packet16h,8>& kernel) {
642
- EIGEN_ALIGN64 half in[8][16];
643
- pstore<half>(in[0], kernel.packet[0]);
644
- pstore<half>(in[1], kernel.packet[1]);
645
- pstore<half>(in[2], kernel.packet[2]);
646
- pstore<half>(in[3], kernel.packet[3]);
647
- pstore<half>(in[4], kernel.packet[4]);
648
- pstore<half>(in[5], kernel.packet[5]);
649
- pstore<half>(in[6], kernel.packet[6]);
650
- pstore<half>(in[7], kernel.packet[7]);
651
-
652
- EIGEN_ALIGN64 half out[8][16];
653
-
654
- for (int i = 0; i < 8; ++i) {
655
- for (int j = 0; j < 8; ++j) {
656
- out[i][j] = in[j][2*i];
657
- }
658
- for (int j = 0; j < 8; ++j) {
659
- out[i][j+8] = in[j][2*i+1];
660
- }
661
- }
662
-
663
- kernel.packet[0] = pload<Packet16h>(out[0]);
664
- kernel.packet[1] = pload<Packet16h>(out[1]);
665
- kernel.packet[2] = pload<Packet16h>(out[2]);
666
- kernel.packet[3] = pload<Packet16h>(out[3]);
667
- kernel.packet[4] = pload<Packet16h>(out[4]);
668
- kernel.packet[5] = pload<Packet16h>(out[5]);
669
- kernel.packet[6] = pload<Packet16h>(out[6]);
670
- kernel.packet[7] = pload<Packet16h>(out[7]);
671
- }
672
-
673
- EIGEN_STRONG_INLINE void
674
- ptranspose(PacketBlock<Packet16h,4>& kernel) {
675
- EIGEN_ALIGN64 half in[4][16];
676
- pstore<half>(in[0], kernel.packet[0]);
677
- pstore<half>(in[1], kernel.packet[1]);
678
- pstore<half>(in[2], kernel.packet[2]);
679
- pstore<half>(in[3], kernel.packet[3]);
680
-
681
- EIGEN_ALIGN64 half out[4][16];
682
-
683
- for (int i = 0; i < 4; ++i) {
684
- for (int j = 0; j < 4; ++j) {
685
- out[i][j] = in[j][4*i];
686
- }
687
- for (int j = 0; j < 4; ++j) {
688
- out[i][j+4] = in[j][4*i+1];
689
- }
690
- for (int j = 0; j < 4; ++j) {
691
- out[i][j+8] = in[j][4*i+2];
692
- }
693
- for (int j = 0; j < 4; ++j) {
694
- out[i][j+12] = in[j][4*i+3];
695
- }
696
- }
697
-
698
- kernel.packet[0] = pload<Packet16h>(out[0]);
699
- kernel.packet[1] = pload<Packet16h>(out[1]);
700
- kernel.packet[2] = pload<Packet16h>(out[2]);
701
- kernel.packet[3] = pload<Packet16h>(out[3]);
702
- }
703
-
704
-
705
- #elif defined EIGEN_VECTORIZE_AVX
706
-
707
- typedef struct {
708
- __m128i x;
709
- } Packet8h;
710
-
711
-
712
- template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
713
-
714
- template <>
715
- struct packet_traits<Eigen::half> : default_packet_traits {
716
- typedef Packet8h type;
717
- // There is no half-size packet for Packet8h.
718
- typedef Packet8h half;
719
- enum {
720
- Vectorizable = 1,
721
- AlignedOnScalar = 1,
722
- size = 8,
723
- HasHalfPacket = 0,
724
- HasAdd = 0,
725
- HasSub = 0,
726
- HasMul = 0,
727
- HasNegate = 0,
728
- HasAbs = 0,
729
- HasAbs2 = 0,
730
- HasMin = 0,
731
- HasMax = 0,
732
- HasConj = 0,
733
- HasSetLinear = 0,
734
- HasDiv = 0,
735
- HasSqrt = 0,
736
- HasRsqrt = 0,
737
- HasExp = 0,
738
- HasLog = 0,
739
- HasBlend = 0
740
- };
741
- };
742
-
743
-
744
- template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
745
-
746
- template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
747
- Packet8h result;
748
- result.x = _mm_set1_epi16(from.x);
749
- return result;
750
- }
751
-
752
- template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
753
- return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
754
- }
755
-
756
- template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
757
- Packet8h result;
758
- result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from));
759
- return result;
760
- }
761
-
762
- template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
763
- Packet8h result;
764
- result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
765
- return result;
766
- }
767
-
768
- template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
769
- _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
770
- }
771
-
772
- template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
773
- _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
774
- }
775
-
776
- template<> EIGEN_STRONG_INLINE Packet8h
777
- ploadquad<Packet8h>(const Eigen::half* from) {
778
- Packet8h result;
779
- unsigned short a = from[0].x;
780
- unsigned short b = from[1].x;
781
- result.x = _mm_set_epi16(b, b, b, b, a, a, a, a);
782
- return result;
783
- }
784
-
785
- EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
786
- #ifdef EIGEN_HAS_FP16_C
787
- return _mm256_cvtph_ps(a.x);
788
- #else
789
- EIGEN_ALIGN32 Eigen::half aux[8];
790
- pstore(aux, a);
791
- float f0(aux[0]);
792
- float f1(aux[1]);
793
- float f2(aux[2]);
794
- float f3(aux[3]);
795
- float f4(aux[4]);
796
- float f5(aux[5]);
797
- float f6(aux[6]);
798
- float f7(aux[7]);
799
-
800
- return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
801
- #endif
802
- }
803
-
804
- EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
805
- #ifdef EIGEN_HAS_FP16_C
806
- Packet8h result;
807
- result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
808
- return result;
809
- #else
810
- EIGEN_ALIGN32 float aux[8];
811
- pstore(aux, a);
812
- Eigen::half h0(aux[0]);
813
- Eigen::half h1(aux[1]);
814
- Eigen::half h2(aux[2]);
815
- Eigen::half h3(aux[3]);
816
- Eigen::half h4(aux[4]);
817
- Eigen::half h5(aux[5]);
818
- Eigen::half h6(aux[6]);
819
- Eigen::half h7(aux[7]);
820
-
821
- Packet8h result;
822
- result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
823
- return result;
824
- #endif
825
- }
826
-
827
- template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
828
-
829
- template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
830
- Packet8f af = half2float(a);
831
- Packet8f bf = half2float(b);
832
- Packet8f rf = padd(af, bf);
833
- return float2half(rf);
834
- }
835
-
836
- template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
837
- Packet8f af = half2float(a);
838
- Packet8f bf = half2float(b);
839
- Packet8f rf = pmul(af, bf);
840
- return float2half(rf);
841
- }
842
-
843
- template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
844
- {
845
- Packet8h result;
846
- result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
847
- return result;
848
- }
849
-
850
- template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
851
- {
852
- EIGEN_ALIGN32 Eigen::half aux[8];
853
- pstore(aux, from);
854
- to[stride*0].x = aux[0].x;
855
- to[stride*1].x = aux[1].x;
856
- to[stride*2].x = aux[2].x;
857
- to[stride*3].x = aux[3].x;
858
- to[stride*4].x = aux[4].x;
859
- to[stride*5].x = aux[5].x;
860
- to[stride*6].x = aux[6].x;
861
- to[stride*7].x = aux[7].x;
862
- }
863
-
864
- template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
865
- Packet8f af = half2float(a);
866
- float reduced = predux<Packet8f>(af);
867
- return Eigen::half(reduced);
868
- }
869
-
870
- template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
871
- Packet8f af = half2float(a);
872
- float reduced = predux_max<Packet8f>(af);
873
- return Eigen::half(reduced);
874
- }
875
-
876
- template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
877
- Packet8f af = half2float(a);
878
- float reduced = predux_min<Packet8f>(af);
879
- return Eigen::half(reduced);
880
- }
881
-
882
- template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
883
- Packet8f af = half2float(a);
884
- float reduced = predux_mul<Packet8f>(af);
885
- return Eigen::half(reduced);
886
- }
887
-
888
- EIGEN_STRONG_INLINE void
889
- ptranspose(PacketBlock<Packet8h,8>& kernel) {
890
- __m128i a = kernel.packet[0].x;
891
- __m128i b = kernel.packet[1].x;
892
- __m128i c = kernel.packet[2].x;
893
- __m128i d = kernel.packet[3].x;
894
- __m128i e = kernel.packet[4].x;
895
- __m128i f = kernel.packet[5].x;
896
- __m128i g = kernel.packet[6].x;
897
- __m128i h = kernel.packet[7].x;
898
-
899
- __m128i a03b03 = _mm_unpacklo_epi16(a, b);
900
- __m128i c03d03 = _mm_unpacklo_epi16(c, d);
901
- __m128i e03f03 = _mm_unpacklo_epi16(e, f);
902
- __m128i g03h03 = _mm_unpacklo_epi16(g, h);
903
- __m128i a47b47 = _mm_unpackhi_epi16(a, b);
904
- __m128i c47d47 = _mm_unpackhi_epi16(c, d);
905
- __m128i e47f47 = _mm_unpackhi_epi16(e, f);
906
- __m128i g47h47 = _mm_unpackhi_epi16(g, h);
907
-
908
- __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
909
- __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
910
- __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
911
- __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
912
- __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
913
- __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
914
- __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
915
- __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
916
-
917
- __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
918
- __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
919
- __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
920
- __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
921
- __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
922
- __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
923
- __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
924
- __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
925
-
926
- kernel.packet[0].x = a0b0c0d0e0f0g0h0;
927
- kernel.packet[1].x = a1b1c1d1e1f1g1h1;
928
- kernel.packet[2].x = a2b2c2d2e2f2g2h2;
929
- kernel.packet[3].x = a3b3c3d3e3f3g3h3;
930
- kernel.packet[4].x = a4b4c4d4e4f4g4h4;
931
- kernel.packet[5].x = a5b5c5d5e5f5g5h5;
932
- kernel.packet[6].x = a6b6c6d6e6f6g6h6;
933
- kernel.packet[7].x = a7b7c7d7e7f7g7h7;
934
- }
935
-
936
- EIGEN_STRONG_INLINE void
937
- ptranspose(PacketBlock<Packet8h,4>& kernel) {
938
- EIGEN_ALIGN32 Eigen::half in[4][8];
939
- pstore<Eigen::half>(in[0], kernel.packet[0]);
940
- pstore<Eigen::half>(in[1], kernel.packet[1]);
941
- pstore<Eigen::half>(in[2], kernel.packet[2]);
942
- pstore<Eigen::half>(in[3], kernel.packet[3]);
943
-
944
- EIGEN_ALIGN32 Eigen::half out[4][8];
945
-
946
- for (int i = 0; i < 4; ++i) {
947
- for (int j = 0; j < 4; ++j) {
948
- out[i][j] = in[j][2*i];
949
- }
950
- for (int j = 0; j < 4; ++j) {
951
- out[i][j+4] = in[j][2*i+1];
952
- }
953
- }
954
-
955
- kernel.packet[0] = pload<Packet8h>(out[0]);
956
- kernel.packet[1] = pload<Packet8h>(out[1]);
957
- kernel.packet[2] = pload<Packet8h>(out[2]);
958
- kernel.packet[3] = pload<Packet8h>(out[3]);
959
- }
960
-
961
-
962
- // Disable the following code since it's broken on too many platforms / compilers.
963
- //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
964
- #elif 0
965
-
966
- typedef struct {
967
- __m64 x;
968
- } Packet4h;
969
-
970
-
971
- template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
972
-
973
- template <>
974
- struct packet_traits<Eigen::half> : default_packet_traits {
975
- typedef Packet4h type;
976
- // There is no half-size packet for Packet4h.
977
- typedef Packet4h half;
978
- enum {
979
- Vectorizable = 1,
980
- AlignedOnScalar = 1,
981
- size = 4,
982
- HasHalfPacket = 0,
983
- HasAdd = 0,
984
- HasSub = 0,
985
- HasMul = 0,
986
- HasNegate = 0,
987
- HasAbs = 0,
988
- HasAbs2 = 0,
989
- HasMin = 0,
990
- HasMax = 0,
991
- HasConj = 0,
992
- HasSetLinear = 0,
993
- HasDiv = 0,
994
- HasSqrt = 0,
995
- HasRsqrt = 0,
996
- HasExp = 0,
997
- HasLog = 0,
998
- HasBlend = 0
999
- };
1000
- };
1001
-
1002
-
1003
- template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
1004
-
1005
- template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
1006
- Packet4h result;
1007
- result.x = _mm_set1_pi16(from.x);
1008
- return result;
1009
- }
1010
-
1011
- template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
1012
- return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
1013
- }
1014
-
1015
- template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
1016
-
1017
- template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
1018
- __int64_t a64 = _mm_cvtm64_si64(a.x);
1019
- __int64_t b64 = _mm_cvtm64_si64(b.x);
1020
-
1021
- Eigen::half h[4];
1022
-
1023
- Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1024
- Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1025
- h[0] = ha + hb;
1026
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1027
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1028
- h[1] = ha + hb;
1029
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1030
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1031
- h[2] = ha + hb;
1032
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1033
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1034
- h[3] = ha + hb;
1035
- Packet4h result;
1036
- result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1037
- return result;
1038
- }
1039
-
1040
- template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
1041
- __int64_t a64 = _mm_cvtm64_si64(a.x);
1042
- __int64_t b64 = _mm_cvtm64_si64(b.x);
1043
-
1044
- Eigen::half h[4];
1045
-
1046
- Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1047
- Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1048
- h[0] = ha * hb;
1049
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1050
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1051
- h[1] = ha * hb;
1052
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1053
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1054
- h[2] = ha * hb;
1055
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1056
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1057
- h[3] = ha * hb;
1058
- Packet4h result;
1059
- result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1060
- return result;
1061
- }
1062
-
1063
- template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
1064
- Packet4h result;
1065
- result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1066
- return result;
1067
- }
1068
-
1069
- template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
1070
- Packet4h result;
1071
- result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1072
- return result;
1073
- }
1074
-
1075
- template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1076
- __int64_t r = _mm_cvtm64_si64(from.x);
1077
- *(reinterpret_cast<__int64_t*>(to)) = r;
1078
- }
1079
-
1080
- template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1081
- __int64_t r = _mm_cvtm64_si64(from.x);
1082
- *(reinterpret_cast<__int64_t*>(to)) = r;
1083
- }
1084
-
1085
- template<> EIGEN_STRONG_INLINE Packet4h
1086
- ploadquad<Packet4h>(const Eigen::half* from) {
1087
- return pset1<Packet4h>(*from);
1088
- }
1089
-
1090
- template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
1091
- {
1092
- Packet4h result;
1093
- result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
1094
- return result;
1095
- }
1096
-
1097
- template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
1098
- {
1099
- __int64_t a = _mm_cvtm64_si64(from.x);
1100
- to[stride*0].x = static_cast<unsigned short>(a);
1101
- to[stride*1].x = static_cast<unsigned short>(a >> 16);
1102
- to[stride*2].x = static_cast<unsigned short>(a >> 32);
1103
- to[stride*3].x = static_cast<unsigned short>(a >> 48);
1104
- }
1105
-
1106
- EIGEN_STRONG_INLINE void
1107
- ptranspose(PacketBlock<Packet4h,4>& kernel) {
1108
- __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
1109
- __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
1110
- __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
1111
- __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
1112
-
1113
- kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
1114
- kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
1115
- kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
1116
- kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
1117
- }
1118
-
1119
- #endif
1120
-
1121
- }
1122
- }
1123
-
1124
- #endif // EIGEN_PACKET_MATH_HALF_CUDA_H