@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -21
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +235 -326
  6. package/eigen/Eigen/Eigenvalues +16 -14
  7. package/eigen/Eigen/Geometry +21 -24
  8. package/eigen/Eigen/Householder +9 -8
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -14
  11. package/eigen/Eigen/KLUSupport +43 -0
  12. package/eigen/Eigen/LU +16 -20
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -54
  15. package/eigen/Eigen/PaStiXSupport +23 -20
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -21
  18. package/eigen/Eigen/QtAlignedMalloc +5 -13
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -18
  21. package/eigen/Eigen/Sparse +1 -4
  22. package/eigen/Eigen/SparseCholesky +18 -23
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +12 -8
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
  37. package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
  42. package/eigen/Eigen/src/Core/Array.h +341 -294
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
  48. package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
  49. package/eigen/Eigen/src/Core/Block.h +375 -398
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
  53. package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
  59. package/eigen/Eigen/src/Core/DenseBase.h +632 -571
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
  61. package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +169 -210
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +172 -222
  67. package/eigen/Eigen/src/Core/EigenBase.h +75 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
  75. package/eigen/Eigen/src/Core/IO.h +147 -139
  76. package/eigen/Eigen/src/Core/IndexedView.h +321 -0
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +56 -66
  80. package/eigen/Eigen/src/Core/Map.h +124 -142
  81. package/eigen/Eigen/src/Core/MapBase.h +256 -281
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
  84. package/eigen/Eigen/src/Core/Matrix.h +491 -416
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
  86. package/eigen/Eigen/src/Core/NestByValue.h +66 -85
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -85
  88. package/eigen/Eigen/src/Core/NumTraits.h +235 -148
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
  92. package/eigen/Eigen/src/Core/Product.h +260 -139
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
  94. package/eigen/Eigen/src/Core/Random.h +161 -136
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +366 -336
  98. package/eigen/Eigen/src/Core/Ref.h +308 -209
  99. package/eigen/Eigen/src/Core/Replicate.h +94 -106
  100. package/eigen/Eigen/src/Core/Reshaped.h +398 -0
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +136 -145
  103. package/eigen/Eigen/src/Core/Select.h +70 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +97 -111
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
  109. package/eigen/Eigen/src/Core/SolverBase.h +138 -101
  110. package/eigen/Eigen/src/Core/StableNorm.h +156 -160
  111. package/eigen/Eigen/src/Core/StlIterators.h +619 -0
  112. package/eigen/Eigen/src/Core/Stride.h +91 -88
  113. package/eigen/Eigen/src/Core/Swap.h +70 -38
  114. package/eigen/Eigen/src/Core/Transpose.h +295 -273
  115. package/eigen/Eigen/src/Core/Transpositions.h +272 -317
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
  119. package/eigen/Eigen/src/Core/Visitor.h +480 -216
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
  217. package/eigen/Eigen/src/Core/util/Constants.h +314 -263
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
  226. package/eigen/Eigen/src/Core/util/Macros.h +939 -646
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
  229. package/eigen/Eigen/src/Core/util/Meta.h +618 -426
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
  264. package/eigen/Eigen/src/Geometry/Transform.h +896 -953
  265. package/eigen/Eigen/src/Geometry/Translation.h +100 -98
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
  269. package/eigen/Eigen/src/Householder/Householder.h +104 -122
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
  285. package/eigen/Eigen/src/LU/Determinant.h +60 -63
  286. package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/lib/LibEigen.d.ts +4 -0
  409. package/lib/LibEigen.js +14 -0
  410. package/lib/index.d.ts +1 -1
  411. package/lib/index.js +7 -3
  412. package/package.json +2 -10
  413. package/eigen/Eigen/CMakeLists.txt +0 -19
  414. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
  415. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
  416. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  417. package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
  418. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  419. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  420. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  421. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  422. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  423. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  424. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  425. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  426. package/eigen/Eigen/src/misc/lapack.h +0 -152
  427. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
  428. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
  429. package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
  430. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  431. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
  432. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  433. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
  434. package/lib/eigen.d.ts +0 -2
  435. package/lib/eigen.js +0 -15
@@ -0,0 +1,1091 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // This Source Code Form is subject to the terms of the Mozilla
5
+ // Public License v. 2.0. If a copy of the MPL was not distributed
6
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
7
+ //
8
+ // The conversion routines are Copyright (c) Fabian Giesen, 2016.
9
+ // The original license follows:
10
+ //
11
+ // Copyright (c) Fabian Giesen, 2016
12
+ // All rights reserved.
13
+ // Redistribution and use in source and binary forms, with or without
14
+ // modification, are permitted.
15
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19
+ // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ // Standard 16-bit float type, mostly useful for GPUs. Defines a new
28
+ // type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
29
+ // operator overloads such that it behaves basically as an arithmetic
30
+ // type. It will be quite slow on CPUs (so it is recommended to stay
31
+ // in fp32 for CPUs, except for simple parameter conversions, I/O
32
+ // to disk and the like), but fast on GPUs.
33
+
34
+ #ifndef EIGEN_HALF_H
35
+ #define EIGEN_HALF_H
36
+
37
+ // IWYU pragma: private
38
+ #include "../../InternalHeaderCheck.h"
39
+
40
+ // When compiling with GPU support, the "__half_raw" base class as well as
41
+ // some other routines are defined in the GPU compiler header files
42
+ // (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr.
43
+ // As a consequence, we get compile failures when compiling Eigen with
44
+ // GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
45
+ // Eigen with GPU support.
46
+ // Any functions that require `numext::bit_cast` may also not be constexpr,
47
+ // including any native types when setting via raw bit values.
48
+ #if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
49
+ #define _EIGEN_MAYBE_CONSTEXPR
50
+ #else
51
+ #define _EIGEN_MAYBE_CONSTEXPR constexpr
52
+ #endif
53
+
54
+ #define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD) \
55
+ template <> \
56
+ EIGEN_UNUSED EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
57
+ return float2half(METHOD<PACKET_F>(half2float(_x))); \
58
+ }
59
+
60
+ namespace Eigen {
61
+
62
+ struct half;
63
+
64
+ namespace half_impl {
65
+
66
+ // We want to use the __half_raw struct from the HIP header file only during the device compile phase.
67
+ // This is required because of a quirk in the way TensorFlow GPU builds are done.
68
+ // When compiling TensorFlow source code with GPU support, files that
69
+ // * contain GPU kernels (i.e. *.cu.cc files) are compiled via hipcc
70
+ // * do not contain GPU kernels ( i.e. *.cc files) are compiled via gcc (typically)
71
+ //
72
+ // Tensorflow uses the Eigen::half type as its FP16 type, and there are functions that
73
+ // * are defined in a file that gets compiled via hipcc AND
74
+ // * have Eigen::half as a pass-by-value argument AND
75
+ // * are called in a file that gets compiled via gcc
76
+ //
77
+ // In the scenario described above the caller and callee will see different versions
78
+ // of the Eigen::half base class __half_raw, and they will be compiled by different compilers
79
+ //
80
+ // There appears to be an ABI mismatch between gcc and clang (which is called by hipcc) that results in
81
+ // the callee getting corrupted values for the Eigen::half argument.
82
+ //
83
+ // Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
84
+ // this error, and hence the following convoluted #if condition
85
+ #if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
86
+
87
+ // Make our own __half_raw definition that is similar to CUDA's.
88
+ struct __half_raw {
89
+ struct construct_from_rep_tag {};
90
+ #if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
91
+ // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
92
+ // The element type for shared memory cannot have non-trivial constructors
93
+ // and hence the following special casing (which skips the zero-initialization).
94
+ // Note that this check gets done even in the host compilation phase, and
95
+ // hence the need for this
96
+ EIGEN_DEVICE_FUNC __half_raw() {}
97
+ #else
98
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw() : x(0) {}
99
+ #endif
100
+
101
+ #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
102
+ explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {}
103
+ EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, __fp16 rep) : x{rep} {}
104
+ __fp16 x;
105
+ #elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
106
+ explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<_Float16>(raw)) {}
107
+ EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, _Float16 rep) : x{rep} {}
108
+ _Float16 x;
109
+ #else
110
+ explicit EIGEN_DEVICE_FUNC constexpr __half_raw(numext::uint16_t raw) : x(raw) {}
111
+ EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, numext::uint16_t rep) : x{rep} {}
112
+ numext::uint16_t x;
113
+ #endif
114
+ };
115
+
116
+ #elif defined(EIGEN_HAS_HIP_FP16)
117
+ // HIP GPU compile phase: nothing to do here.
118
+ // HIP fp16 header file has a definition for __half_raw
119
+ #elif defined(EIGEN_HAS_CUDA_FP16)
120
+
121
+ // CUDA GPU compile phase.
122
+ #if EIGEN_CUDA_SDK_VER < 90000
123
+ // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
124
+ typedef __half __half_raw;
125
+ #endif // defined(EIGEN_HAS_CUDA_FP16)
126
+
127
+ #elif defined(SYCL_DEVICE_ONLY)
128
+ typedef cl::sycl::half __half_raw;
129
+ #endif
130
+
131
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
132
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
133
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
134
+
135
+ struct half_base : public __half_raw {
136
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
137
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
138
+
139
+ #if defined(EIGEN_HAS_GPU_FP16)
140
+ #if defined(EIGEN_HAS_HIP_FP16)
141
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
142
+ #elif defined(EIGEN_HAS_CUDA_FP16)
143
+ #if EIGEN_CUDA_SDK_VER >= 90000
144
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
145
+ #endif
146
+ #endif
147
+ #endif
148
+ };
149
+
150
+ } // namespace half_impl
151
+
152
+ // Class definition.
153
+ struct half : public half_impl::half_base {
154
+ // Writing this out as separate #if-else blocks to make the code easier to follow
155
+ // The same applies to most #if-else blocks in this file
156
+ #if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
157
+ // Use the same base class for the following two scenarios
158
+ // * when compiling without GPU support enabled
159
+ // * during host compile phase when compiling with GPU support enabled
160
+ typedef half_impl::__half_raw __half_raw;
161
+ #elif defined(EIGEN_HAS_HIP_FP16)
162
+ // Nothing to do here
163
+ // HIP fp16 header file has a definition for __half_raw
164
+ #elif defined(EIGEN_HAS_CUDA_FP16)
165
+ // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
166
+ // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within
167
+ // #if defined(EIGEN_HAS_CUDA_FP16) is needed
168
+ #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
169
+ typedef half_impl::__half_raw __half_raw;
170
+ #endif
171
+ #endif
172
+
173
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}
174
+
175
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
176
+
177
+ #if defined(EIGEN_HAS_GPU_FP16)
178
+ #if defined(EIGEN_HAS_HIP_FP16)
179
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
180
+ #elif defined(EIGEN_HAS_CUDA_FP16)
181
+ #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
182
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
183
+ #endif
184
+ #endif
185
+ #endif
186
+
187
+ #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
188
+ explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
189
+ : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {}
190
+ #elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
191
+ explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(_Float16 b)
192
+ : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {}
193
+ #endif
194
+
195
+ explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(bool b)
196
+ : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
197
+ template <class T>
198
+ explicit EIGEN_DEVICE_FUNC half(T val)
199
+ : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
200
+ explicit EIGEN_DEVICE_FUNC half(float f) : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
201
+
202
+ // Following the convention of numpy, converting between complex and
203
+ // float will lead to loss of imag value.
204
+ template <typename RealScalar>
205
+ explicit EIGEN_DEVICE_FUNC half(std::complex<RealScalar> c)
206
+ : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(c.real()))) {}
207
+
208
+ EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless.
209
+ return half_impl::half_to_float(*this);
210
+ }
211
+
212
+ #if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
213
+ EIGEN_DEVICE_FUNC operator __half() const {
214
+ ::__half_raw hr;
215
+ hr.x = x;
216
+ return __half(hr);
217
+ }
218
+ #endif
219
+ };
220
+
221
+ // TODO(majnemer): Get rid of this once we can rely on C++17 inline variables to
222
+ // solve the ODR issue.
223
+ namespace half_impl {
224
+ template <typename = void>
225
+ struct numeric_limits_half_impl {
226
+ static constexpr const bool is_specialized = true;
227
+ static constexpr const bool is_signed = true;
228
+ static constexpr const bool is_integer = false;
229
+ static constexpr const bool is_exact = false;
230
+ static constexpr const bool has_infinity = true;
231
+ static constexpr const bool has_quiet_NaN = true;
232
+ static constexpr const bool has_signaling_NaN = true;
233
+ EIGEN_DIAGNOSTICS(push)
234
+ EIGEN_DISABLE_DEPRECATED_WARNING
235
+ static constexpr const std::float_denorm_style has_denorm = std::denorm_present;
236
+ static constexpr const bool has_denorm_loss = false;
237
+ EIGEN_DIAGNOSTICS(pop)
238
+ static constexpr const std::float_round_style round_style = std::round_to_nearest;
239
+ static constexpr const bool is_iec559 = true;
240
+ // The C++ standard defines this as "true if the set of values representable
241
+ // by the type is finite." Half has finite precision.
242
+ static constexpr const bool is_bounded = true;
243
+ static constexpr const bool is_modulo = false;
244
+ static constexpr const int digits = 11;
245
+ static constexpr const int digits10 =
246
+ 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
247
+ static constexpr const int max_digits10 =
248
+ 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
249
+ static constexpr const int radix = std::numeric_limits<float>::radix;
250
+ static constexpr const int min_exponent = -13;
251
+ static constexpr const int min_exponent10 = -4;
252
+ static constexpr const int max_exponent = 16;
253
+ static constexpr const int max_exponent10 = 4;
254
+ static constexpr const bool traps = std::numeric_limits<float>::traps;
255
+ // IEEE754: "The implementer shall choose how tininess is detected, but shall
256
+ // detect tininess in the same way for all operations in radix two"
257
+ static constexpr const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
258
+
259
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
260
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
261
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
262
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); }
263
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); }
264
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
265
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
266
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
267
+ static _EIGEN_MAYBE_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); }
268
+ };
269
+
270
+ template <typename T>
271
+ constexpr const bool numeric_limits_half_impl<T>::is_specialized;
272
+ template <typename T>
273
+ constexpr const bool numeric_limits_half_impl<T>::is_signed;
274
+ template <typename T>
275
+ constexpr const bool numeric_limits_half_impl<T>::is_integer;
276
+ template <typename T>
277
+ constexpr const bool numeric_limits_half_impl<T>::is_exact;
278
+ template <typename T>
279
+ constexpr const bool numeric_limits_half_impl<T>::has_infinity;
280
+ template <typename T>
281
+ constexpr const bool numeric_limits_half_impl<T>::has_quiet_NaN;
282
+ template <typename T>
283
+ constexpr const bool numeric_limits_half_impl<T>::has_signaling_NaN;
284
+ EIGEN_DIAGNOSTICS(push)
285
+ EIGEN_DISABLE_DEPRECATED_WARNING
286
+ template <typename T>
287
+ constexpr const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm;
288
+ template <typename T>
289
+ constexpr const bool numeric_limits_half_impl<T>::has_denorm_loss;
290
+ EIGEN_DIAGNOSTICS(pop)
291
+ template <typename T>
292
+ constexpr const std::float_round_style numeric_limits_half_impl<T>::round_style;
293
+ template <typename T>
294
+ constexpr const bool numeric_limits_half_impl<T>::is_iec559;
295
+ template <typename T>
296
+ constexpr const bool numeric_limits_half_impl<T>::is_bounded;
297
+ template <typename T>
298
+ constexpr const bool numeric_limits_half_impl<T>::is_modulo;
299
+ template <typename T>
300
+ constexpr const int numeric_limits_half_impl<T>::digits;
301
+ template <typename T>
302
+ constexpr const int numeric_limits_half_impl<T>::digits10;
303
+ template <typename T>
304
+ constexpr const int numeric_limits_half_impl<T>::max_digits10;
305
+ template <typename T>
306
+ constexpr const int numeric_limits_half_impl<T>::radix;
307
+ template <typename T>
308
+ constexpr const int numeric_limits_half_impl<T>::min_exponent;
309
+ template <typename T>
310
+ constexpr const int numeric_limits_half_impl<T>::min_exponent10;
311
+ template <typename T>
312
+ constexpr const int numeric_limits_half_impl<T>::max_exponent;
313
+ template <typename T>
314
+ constexpr const int numeric_limits_half_impl<T>::max_exponent10;
315
+ template <typename T>
316
+ constexpr const bool numeric_limits_half_impl<T>::traps;
317
+ template <typename T>
318
+ constexpr const bool numeric_limits_half_impl<T>::tinyness_before;
319
+ } // end namespace half_impl
320
+ } // end namespace Eigen
321
+
322
+ namespace std {
323
+ // If std::numeric_limits<T> is specialized, should also specialize
324
+ // std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
325
+ // std::numeric_limits<const volatile T>
326
+ // https://stackoverflow.com/a/16519653/
327
+ template <>
328
+ class numeric_limits<Eigen::half> : public Eigen::half_impl::numeric_limits_half_impl<> {};
329
+ template <>
330
+ class numeric_limits<const Eigen::half> : public numeric_limits<Eigen::half> {};
331
+ template <>
332
+ class numeric_limits<volatile Eigen::half> : public numeric_limits<Eigen::half> {};
333
+ template <>
334
+ class numeric_limits<const volatile Eigen::half> : public numeric_limits<Eigen::half> {};
335
+ } // end namespace std
336
+
337
+ namespace Eigen {
338
+
339
+ namespace half_impl {
340
+
341
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
342
+ (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
343
+ // Note: We deliberately do *not* define this to 1 even if we have Arm's native
344
+ // fp16 type since GPU half types are rather different from native CPU half types.
345
+ #define EIGEN_HAS_NATIVE_GPU_FP16
346
+ #endif
347
+
348
+ // Intrinsics for native fp16 support. Note that on current hardware,
349
+ // these are no faster than fp32 arithmetic (you need to use the half2
350
+ // versions to get the ALU speed increased), but you do save the
351
+ // conversion steps back and forth.
352
+
353
+ #if defined(EIGEN_HAS_NATIVE_GPU_FP16)
354
+ EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
355
+ #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
356
+ return __hadd(::__half(a), ::__half(b));
357
+ #else
358
+ return __hadd(a, b);
359
+ #endif
360
+ }
361
+ EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
362
+ EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
363
+ EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
364
+ #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
365
+ return __hdiv(a, b);
366
+ #else
367
+ float num = __half2float(a);
368
+ float denom = __half2float(b);
369
+ return __float2half(num / denom);
370
+ #endif
371
+ }
372
+ EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
373
+ EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
374
+ a = a + b;
375
+ return a;
376
+ }
377
+ EIGEN_STRONG_INLINE __device__ half& operator*=(half& a, const half& b) {
378
+ a = a * b;
379
+ return a;
380
+ }
381
+ EIGEN_STRONG_INLINE __device__ half& operator-=(half& a, const half& b) {
382
+ a = a - b;
383
+ return a;
384
+ }
385
+ EIGEN_STRONG_INLINE __device__ half& operator/=(half& a, const half& b) {
386
+ a = a / b;
387
+ return a;
388
+ }
389
+ EIGEN_STRONG_INLINE __device__ bool operator==(const half& a, const half& b) { return __heq(a, b); }
390
+ EIGEN_STRONG_INLINE __device__ bool operator!=(const half& a, const half& b) { return __hne(a, b); }
391
+ EIGEN_STRONG_INLINE __device__ bool operator<(const half& a, const half& b) { return __hlt(a, b); }
392
+ EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) { return __hle(a, b); }
393
+ EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) { return __hgt(a, b); }
394
+ EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) { return __hge(a, b); }
395
+
396
+ #endif // EIGEN_HAS_NATIVE_GPU_FP16
397
+
398
+ #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
399
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); }
400
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(vmulh_f16(a.x, b.x)); }
401
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(vsubh_f16(a.x, b.x)); }
402
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(vdivh_f16(a.x, b.x)); }
403
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { return half(vnegh_f16(a.x)); }
404
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
405
+ a = half(vaddh_f16(a.x, b.x));
406
+ return a;
407
+ }
408
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
409
+ a = half(vmulh_f16(a.x, b.x));
410
+ return a;
411
+ }
412
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
413
+ a = half(vsubh_f16(a.x, b.x));
414
+ return a;
415
+ }
416
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
417
+ a = half(vdivh_f16(a.x, b.x));
418
+ return a;
419
+ }
420
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) { return vceqh_f16(a.x, b.x); }
421
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return !vceqh_f16(a.x, b.x); }
422
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return vclth_f16(a.x, b.x); }
423
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return vcleh_f16(a.x, b.x); }
424
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return vcgth_f16(a.x, b.x); }
425
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return vcgeh_f16(a.x, b.x); }
426
+
427
+ #elif defined(EIGEN_HAS_BUILTIN_FLOAT16) && !defined(EIGEN_GPU_COMPILE_PHASE)
428
+
429
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(a.x + b.x); }
430
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(a.x * b.x); }
431
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(a.x - b.x); }
432
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(a.x / b.x); }
433
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { return half(-a.x); }
434
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
435
+ a = a + b;
436
+ return a;
437
+ }
438
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
439
+ a = a * b;
440
+ return a;
441
+ }
442
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
443
+ a = a - b;
444
+ return a;
445
+ }
446
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
447
+ a = a / b;
448
+ return a;
449
+ }
450
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) { return a.x == b.x; }
451
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return a.x != b.x; }
452
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return a.x < b.x; }
453
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return a.x <= b.x; }
454
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return a.x > b.x; }
455
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return a.x >= b.x; }
456
+
457
+ // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
458
+ // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
459
+ // of the functions, while the latter can only deal with one of them.
460
+ #elif !defined(EIGEN_HAS_NATIVE_GPU_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
461
+
462
+ #if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
463
+ // We need to provide emulated *host-side* FP16 operators for clang.
464
+ #pragma push_macro("EIGEN_DEVICE_FUNC")
465
+ #undef EIGEN_DEVICE_FUNC
466
+ #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
467
+ #define EIGEN_DEVICE_FUNC __host__
468
+ #else // both host and device need emulated ops.
469
+ #define EIGEN_DEVICE_FUNC __host__ __device__
470
+ #endif
471
+ #endif
472
+
473
+ // Definitions for CPUs and older HIP+CUDA, mostly working through conversion
474
+ // to/from fp32.
475
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(float(a) + float(b)); }
476
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(float(a) * float(b)); }
477
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(float(a) - float(b)); }
478
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(float(a) / float(b)); }
479
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) {
480
+ half result;
481
+ result.x = a.x ^ 0x8000;
482
+ return result;
483
+ }
484
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
485
+ a = half(float(a) + float(b));
486
+ return a;
487
+ }
488
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
489
+ a = half(float(a) * float(b));
490
+ return a;
491
+ }
492
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
493
+ a = half(float(a) - float(b));
494
+ return a;
495
+ }
496
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
497
+ a = half(float(a) / float(b));
498
+ return a;
499
+ }
500
+
501
+ // Non-negative floating point numbers have a monotonic mapping to non-negative integers.
502
+ // This property allows floating point numbers to be reinterpreted as integers for comparisons, which is useful if there
503
+ // is no native floating point comparison operator. Floating point signedness is handled by the sign-magnitude
504
+ // representation, whereas integers typically use two's complement. Converting the bit pattern from sign-magnitude to
505
+ // two's complement allows the transformed bit patterns to be compared as signed integers. All edge cases (+/-0 and +/-
506
+ // infinity) are handled automatically, except NaN.
507
+ //
508
+ // fp16 uses 1 sign bit, 5 exponent bits, and 10 mantissa bits. The bit pattern conveys NaN when all the exponent
509
+ // bits (5) are set, and at least one mantissa bit is set. The sign bit is irrelevant for determining NaN. To check for
510
+ // NaN, clear the sign bit and check if the integral representation is greater than 0111110000000000 (0x7c00). To test
511
+ // for non-NaN, clear the sign bit and check if the integral representation is less than or equal to 0111110000000000 (0x7c00).
512
+
513
+ // convert sign-magnitude representation to two's complement
514
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int16_t mapToSigned(uint16_t a) {
515
+ constexpr uint16_t kAbsMask = (1 << 15) - 1;
516
+ // If the sign bit is set, clear the sign bit and return the (integer) negation. Otherwise, return the input.
517
+ return (a >> 15) ? -(a & kAbsMask) : a;
518
+ }
519
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool isOrdered(const half& a, const half& b) {
520
+ constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
521
+ constexpr uint16_t kAbsMask = (1 << 15) - 1;
522
+ return numext::maxi(a.x & kAbsMask, b.x & kAbsMask) <= kInf;
523
+ }
524
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) {
525
+ bool result = mapToSigned(a.x) == mapToSigned(b.x);
526
+ result &= isOrdered(a, b);
527
+ return result;
528
+ }
529
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return !(a == b); }
530
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) {
531
+ bool result = mapToSigned(a.x) < mapToSigned(b.x);
532
+ result &= isOrdered(a, b);
533
+ return result;
534
+ }
535
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) {
536
+ bool result = mapToSigned(a.x) <= mapToSigned(b.x);
537
+ result &= isOrdered(a, b);
538
+ return result;
539
+ }
540
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) {
541
+ bool result = mapToSigned(a.x) > mapToSigned(b.x);
542
+ result &= isOrdered(a, b);
543
+ return result;
544
+ }
545
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) {
546
+ bool result = mapToSigned(a.x) >= mapToSigned(b.x);
547
+ result &= isOrdered(a, b);
548
+ return result;
549
+ }
550
+
551
+ #if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
552
+ #pragma pop_macro("EIGEN_DEVICE_FUNC")
553
+ #endif
554
+
555
+ #endif // Emulate support for half floats
556
+
557
+ // Division by an index. Do it in full float precision to avoid accuracy
558
+ // issues in converting the denominator to half.
559
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, Index b) {
560
+ return half(static_cast<float>(a) / static_cast<float>(b));
561
+ }
562
+
563
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a) {
564
+ a += half(1);
565
+ return a;
566
+ }
567
+
568
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a) {
569
+ a -= half(1);
570
+ return a;
571
+ }
572
+
573
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a, int) {
574
+ half original_value = a;
575
+ ++a;
576
+ return original_value;
577
+ }
578
+
579
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a, int) {
580
+ half original_value = a;
581
+ --a;
582
+ return original_value;
583
+ }
584
+
585
+ // Conversion routines, including fallbacks for the host or older CUDA.
586
+ // Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
587
+ // these in hardware. If we need more performance on older/other CPUs, they are
588
+ // also possible to vectorize directly.
589
+
590
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
591
+ // We cannot simply do a "return __half_raw(x)" here, because __half_raw is a union type
592
+ // in the hip_fp16 header file, and that will trigger a compile error
593
+ // On the other hand, having anything but a return statement also triggers a compile error
594
+ // because this is a constexpr function.
595
+ // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
596
+ // of this catch-22 by having separate bodies for GPU / non GPU
597
+ #if defined(EIGEN_HAS_GPU_FP16)
598
+ __half_raw h;
599
+ h.x = x;
600
+ return h;
601
+ #else
602
+ return __half_raw(x);
603
+ #endif
604
+ }
605
+
606
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const __half_raw& h) {
607
+ // HIP/CUDA/Default have a member 'x' of type uint16_t.
608
+ // For ARM64 native half, the member 'x' is of type __fp16, so we need to bit-cast.
609
+ // For SYCL, cl::sycl::half is _Float16, so cast directly.
610
+ #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
611
+ return numext::bit_cast<numext::uint16_t>(h.x);
612
+ #elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
613
+ return numext::bit_cast<numext::uint16_t>(h.x);
614
+ #elif defined(SYCL_DEVICE_ONLY)
615
+ return numext::bit_cast<numext::uint16_t>(h);
616
+ #else
617
+ return h.x;
618
+ #endif
619
+ }
620
+
621
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
622
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
623
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
624
+ __half tmp_ff = __float2half(ff);
625
+ return *(__half_raw*)&tmp_ff;
626
+
627
+ #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
628
+ __half_raw h;
629
+ h.x = static_cast<__fp16>(ff);
630
+ return h;
631
+
632
+ #elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
633
+ __half_raw h;
634
+ h.x = static_cast<_Float16>(ff);
635
+ return h;
636
+
637
+ #elif defined(EIGEN_HAS_FP16_C)
638
+ __half_raw h;
639
+ #if EIGEN_COMP_MSVC
640
+ // MSVC does not have scalar instructions.
641
+ h.x = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0);
642
+ #else
643
+ h.x = _cvtss_sh(ff, 0);
644
+ #endif
645
+ return h;
646
+
647
+ #else
648
+ uint32_t f_bits = Eigen::numext::bit_cast<uint32_t>(ff);
649
+ const uint32_t f32infty_bits = {255 << 23};
650
+ const uint32_t f16max_bits = {(127 + 16) << 23};
651
+ const uint32_t denorm_magic_bits = {((127 - 15) + (23 - 10) + 1) << 23};
652
+ const uint32_t sign_mask = 0x80000000u;
653
+ __half_raw o;
654
+ o.x = static_cast<uint16_t>(0x0u);
655
+
656
+ const uint32_t sign = f_bits & sign_mask;
657
+ f_bits ^= sign;
658
+
659
+ // NOTE all the integer compares in this function can be safely
660
+ // compiled into signed compares since all operands are below
661
+ // 0x80000000. Important if you want fast straight SSE2 code
662
+ // (since there's no unsigned PCMPGTD).
663
+
664
+ if (f_bits >= f16max_bits) { // result is Inf or NaN (all exponent bits set)
665
+ o.x = (f_bits > f32infty_bits) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
666
+ } else { // (De)normalized number or zero
667
+ if (f_bits < (113 << 23)) { // resulting FP16 is subnormal or zero
668
+ // use a magic value to align our 10 mantissa bits at the bottom of
669
+ // the float. as long as FP addition is round-to-nearest-even this
670
+ // just works.
671
+ f_bits = Eigen::numext::bit_cast<uint32_t>(Eigen::numext::bit_cast<float>(f_bits) +
672
+ Eigen::numext::bit_cast<float>(denorm_magic_bits));
673
+
674
+ // and one integer subtract of the bias later, we have our final float!
675
+ o.x = static_cast<numext::uint16_t>(f_bits - denorm_magic_bits);
676
+ } else {
677
+ const uint32_t mant_odd = (f_bits >> 13) & 1; // resulting mantissa is odd
678
+
679
+ // update exponent, rounding bias part 1
680
+ // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
681
+ // without arithmetic overflow.
682
+ f_bits += 0xc8000fffU;
683
+ // rounding bias part 2
684
+ f_bits += mant_odd;
685
+ // take the bits!
686
+ o.x = static_cast<numext::uint16_t>(f_bits >> 13);
687
+ }
688
+ }
689
+
690
+ o.x |= static_cast<numext::uint16_t>(sign >> 16);
691
+ return o;
692
+ #endif
693
+ }
694
+
695
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
696
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
697
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
698
+ return __half2float(h);
699
+ #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
700
+ return static_cast<float>(h.x);
701
+ #elif defined(EIGEN_HAS_FP16_C)
702
+ #if EIGEN_COMP_MSVC
703
+ // MSVC does not have scalar instructions.
704
+ return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x)));
705
+ #else
706
+ return _cvtsh_ss(h.x);
707
+ #endif
708
+ #else
709
+ const float magic = Eigen::numext::bit_cast<float>(static_cast<uint32_t>(113 << 23));
710
+ const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
711
+ uint32_t o_bits = (h.x & 0x7fff) << 13; // exponent/mantissa bits
712
+ const uint32_t exp = shifted_exp & o_bits; // just the exponent
713
+ o_bits += (127 - 15) << 23; // exponent adjust
714
+
715
+ // handle exponent special cases
716
+ if (exp == shifted_exp) { // Inf/NaN?
717
+ o_bits += (128 - 16) << 23; // extra exp adjust
718
+ } else if (exp == 0) { // Zero/Denormal?
719
+ o_bits += 1 << 23; // extra exp adjust
720
+ // renormalize
721
+ o_bits = Eigen::numext::bit_cast<uint32_t>(Eigen::numext::bit_cast<float>(o_bits) - magic);
722
+ }
723
+
724
+ o_bits |= (h.x & 0x8000) << 16; // sign bit
725
+ return Eigen::numext::bit_cast<float>(o_bits);
726
+ #endif
727
+ }
728
+
729
+ // --- standard functions ---
730
+
731
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
732
+ #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
733
+ return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) == 0x7c00;
734
+ #else
735
+ return (a.x & 0x7fff) == 0x7c00;
736
+ #endif
737
+ }
738
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
739
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
740
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
741
+ return __hisnan(a);
742
+ #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
743
+ return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
744
+ #else
745
+ return (a.x & 0x7fff) > 0x7c00;
746
+ #endif
747
+ }
748
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) {
749
+ #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
750
+ return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) < 0x7c00;
751
+ #else
752
+ return (a.x & 0x7fff) < 0x7c00;
753
+ #endif
754
+ }
755
+
756
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
757
+ #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
758
+ return half(vabsh_f16(a.x));
759
+ #elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
760
+ half result;
761
+ result.x =
762
+ numext::bit_cast<_Float16>(static_cast<numext::uint16_t>(numext::bit_cast<numext::uint16_t>(a.x) & 0x7FFF));
763
+ return result;
764
+ #else
765
+ half result;
766
+ result.x = a.x & 0x7FFF;
767
+ return result;
768
+ #endif
769
+ }
770
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
771
+ #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
772
+ defined(EIGEN_HIP_DEVICE_COMPILE)
773
+ return half(hexp(a));
774
+ #else
775
+ return half(::expf(float(a)));
776
+ #endif
777
+ }
778
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
779
+ #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
780
+ defined(EIGEN_HIP_DEVICE_COMPILE)
781
+ return half(hexp2(a));
782
+ #else
783
+ return half(::exp2f(float(a)));
784
+ #endif
785
+ }
786
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
787
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
788
+ #if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
789
+ EIGEN_CUDA_ARCH >= 530) || \
790
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
791
+ return half(hlog(a));
792
+ #else
793
+ return half(::logf(float(a)));
794
+ #endif
795
+ }
796
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { return half(numext::log1p(float(a))); }
797
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); }
798
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
799
+ return half(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
800
+ }
801
+
802
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
803
+ #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
804
+ defined(EIGEN_HIP_DEVICE_COMPILE)
805
+ return half(hsqrt(a));
806
+ #else
807
+ return half(::sqrtf(float(a)));
808
+ #endif
809
+ }
810
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
811
+ return half(::powf(float(a), float(b)));
812
+ }
813
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan2(const half& a, const half& b) {
814
+ return half(::atan2f(float(a), float(b)));
815
+ }
816
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { return half(::sinf(float(a))); }
817
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { return half(::cosf(float(a))); }
818
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { return half(::tanf(float(a))); }
819
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); }
820
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) { return half(::asinf(float(a))); }
821
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::acosf(float(a))); }
822
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
823
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
824
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
825
+ #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
826
+ defined(EIGEN_HIP_DEVICE_COMPILE)
827
+ return half(hfloor(a));
828
+ #else
829
+ return half(::floorf(float(a)));
830
+ #endif
831
+ }
832
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
833
+ #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
834
+ defined(EIGEN_HIP_DEVICE_COMPILE)
835
+ return half(hceil(a));
836
+ #else
837
+ return half(::ceilf(float(a)));
838
+ #endif
839
+ }
840
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) { return half(::rintf(float(a))); }
841
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) { return half(::roundf(float(a))); }
842
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half trunc(const half& a) { return half(::truncf(float(a))); }
843
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) {
844
+ return half(::fmodf(float(a), float(b)));
845
+ }
846
+
847
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) { return b < a ? b : a; }
848
+
849
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) { return a < b ? b : a; }
850
+
851
+ EIGEN_DEVICE_FUNC inline half fma(const half& a, const half& b, const half& c) {
852
+ #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
853
+ return half(vfmah_f16(c.x, a.x, b.x));
854
+ #elif defined(EIGEN_VECTORIZE_AVX512FP16)
855
+ // Reduces to vfmadd213sh.
856
+ return half(_mm_cvtsh_h(_mm_fmadd_ph(_mm_set_sh(a.x), _mm_set_sh(b.x), _mm_set_sh(c.x))));
857
+ #else
858
+ // Emulate FMA via float.
859
+ return half(numext::fma(static_cast<float>(a), static_cast<float>(b), static_cast<float>(c)));
860
+ #endif
861
+ }
862
+
863
+ #ifndef EIGEN_NO_IO
864
+ EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const half& v) {
865
+ os << static_cast<float>(v);
866
+ return os;
867
+ }
868
+ #endif
869
+
870
+ } // end namespace half_impl
871
+
872
+ // import Eigen::half_impl::half into Eigen namespace
873
+ // using half_impl::half;
874
+
875
+ namespace internal {
876
+
877
+ template <>
878
+ struct is_arithmetic<half> {
879
+ enum { value = true };
880
+ };
881
+
882
+ template <>
883
+ struct random_impl<half> {
884
+ enum : int { MantissaBits = 10 };
885
+ using Impl = random_impl<float>;
886
+ static EIGEN_DEVICE_FUNC inline half run(const half& x, const half& y) {
887
+ float result = Impl::run(x, y, MantissaBits);
888
+ return half(result);
889
+ }
890
+ static EIGEN_DEVICE_FUNC inline half run() {
891
+ float result = Impl::run(MantissaBits);
892
+ return half(result);
893
+ }
894
+ };
895
+
896
+ } // end namespace internal
897
+
898
+ template <>
899
+ struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
900
+ enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false };
901
+
902
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
903
+ return half_impl::raw_uint16_to_half(0x0800);
904
+ }
905
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
906
+ return half_impl::raw_uint16_to_half(0x211f); // Eigen::half(1e-2f);
907
+ }
908
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
909
+ return half_impl::raw_uint16_to_half(0x7bff);
910
+ }
911
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() {
912
+ return half_impl::raw_uint16_to_half(0xfbff);
913
+ }
914
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() {
915
+ return half_impl::raw_uint16_to_half(0x7c00);
916
+ }
917
+ EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
918
+ return half_impl::raw_uint16_to_half(0x7e00);
919
+ }
920
+ };
921
+
922
+ } // end namespace Eigen
923
+
924
+ #undef _EIGEN_MAYBE_CONSTEXPR
925
+
926
+ namespace Eigen {
927
+ namespace numext {
928
+
929
+ #if defined(EIGEN_GPU_COMPILE_PHASE)
930
+
931
+ template <>
932
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::half& h) {
933
+ return (half_impl::isnan)(h);
934
+ }
935
+
936
+ template <>
937
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) {
938
+ return (half_impl::isinf)(h);
939
+ }
940
+
941
+ template <>
942
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) {
943
+ return (half_impl::isfinite)(h);
944
+ }
945
+
946
+ #endif
947
+
948
+ template <>
949
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bit_cast<Eigen::half, uint16_t>(const uint16_t& src) {
950
+ return Eigen::half(Eigen::half_impl::raw_uint16_to_half(src));
951
+ }
952
+
953
+ template <>
954
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::half>(const Eigen::half& src) {
955
+ return Eigen::half_impl::raw_half_as_uint16(src);
956
+ }
957
+
958
+ // Specialize multiply-add to match packet operations and reduce conversions to/from float.
959
+ template<>
960
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half madd<Eigen::half>(const Eigen::half& x, const Eigen::half& y, const Eigen::half& z) {
961
+ return Eigen::half(static_cast<float>(x) * static_cast<float>(y) + static_cast<float>(z));
962
+ }
963
+
964
+ } // namespace numext
965
+ } // namespace Eigen
966
+
967
+ // Add the missing shfl* intrinsics.
968
+ // The __shfl* functions are only valid on HIP or __CUDA_ARCH__ >= 300.
969
+ // CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
970
+ //
971
+ // HIP and CUDA prior to SDK 9.0 define
972
+ // __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
973
+ // CUDA since 9.0 deprecates those and instead defines
974
+ // __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
975
+ // with native support for __half and __nv_bfloat16
976
+ //
977
+ // Note that the following are __device__ - only functions.
978
+ #if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)
979
+
980
+ #if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
981
+
982
+ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
983
+ int width = warpSize) {
984
+ const __half h = var;
985
+ return static_cast<Eigen::half>(__shfl_sync(mask, h, srcLane, width));
986
+ }
987
+
988
+ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta,
989
+ int width = warpSize) {
990
+ const __half h = var;
991
+ return static_cast<Eigen::half>(__shfl_up_sync(mask, h, delta, width));
992
+ }
993
+
994
+ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta,
995
+ int width = warpSize) {
996
+ const __half h = var;
997
+ return static_cast<Eigen::half>(__shfl_down_sync(mask, h, delta, width));
998
+ }
999
+
1000
+ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask,
1001
+ int width = warpSize) {
1002
+ const __half h = var;
1003
+ return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
1004
+ }
1005
+
1006
+ #else // HIP or CUDA SDK < 9.0
1007
+
1008
+ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
1009
+ const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
1010
+ return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
1011
+ }
1012
+
1013
+ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) {
1014
+ const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
1015
+ return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
1016
+ }
1017
+
1018
+ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) {
1019
+ const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
1020
+ return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
1021
+ }
1022
+
1023
+ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) {
1024
+ const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
1025
+ return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
1026
+ }
1027
+
1028
+ #endif // HIP vs CUDA
1029
+ #endif // __shfl*
1030
+
1031
+ // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
1032
+ #if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
1033
+ EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
1034
+ return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
1035
+ }
1036
+ #endif // __ldg
1037
+
1038
+ #if EIGEN_HAS_STD_HASH
1039
+ namespace std {
1040
+ template <>
1041
+ struct hash<Eigen::half> {
1042
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
1043
+ return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
1044
+ }
1045
+ };
1046
+ } // end namespace std
1047
+ #endif
1048
+
1049
+ namespace Eigen {
1050
+ namespace internal {
1051
+
1052
+ template <>
1053
+ struct cast_impl<float, half> {
1054
+ EIGEN_DEVICE_FUNC static inline half run(const float& a) {
1055
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
1056
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
1057
+ return __float2half(a);
1058
+ #else
1059
+ return half(a);
1060
+ #endif
1061
+ }
1062
+ };
1063
+
1064
+ template <>
1065
+ struct cast_impl<int, half> {
1066
+ EIGEN_DEVICE_FUNC static inline half run(const int& a) {
1067
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
1068
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
1069
+ return __float2half(static_cast<float>(a));
1070
+ #else
1071
+ return half(static_cast<float>(a));
1072
+ #endif
1073
+ }
1074
+ };
1075
+
1076
+ template <>
1077
+ struct cast_impl<half, float> {
1078
+ EIGEN_DEVICE_FUNC static inline float run(const half& a) {
1079
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
1080
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
1081
+ return __half2float(a);
1082
+ #else
1083
+ return static_cast<float>(a);
1084
+ #endif
1085
+ }
1086
+ };
1087
+
1088
+ } // namespace internal
1089
+ } // namespace Eigen
1090
+
1091
+ #endif // EIGEN_HALF_H