npm - @smake/eigen - Versions diffs - 1.0.2 → 1.1.1 - Mend

@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (435) hide show

package/README.md +1 -1
package/eigen/Eigen/AccelerateSupport +52 -0
package/eigen/Eigen/Cholesky +18 -21
package/eigen/Eigen/CholmodSupport +28 -28
package/eigen/Eigen/Core +235 -326
package/eigen/Eigen/Eigenvalues +16 -14
package/eigen/Eigen/Geometry +21 -24
package/eigen/Eigen/Householder +9 -8
package/eigen/Eigen/IterativeLinearSolvers +8 -4
package/eigen/Eigen/Jacobi +14 -14
package/eigen/Eigen/KLUSupport +43 -0
package/eigen/Eigen/LU +16 -20
package/eigen/Eigen/MetisSupport +12 -12
package/eigen/Eigen/OrderingMethods +54 -54
package/eigen/Eigen/PaStiXSupport +23 -20
package/eigen/Eigen/PardisoSupport +17 -14
package/eigen/Eigen/QR +18 -21
package/eigen/Eigen/QtAlignedMalloc +5 -13
package/eigen/Eigen/SPQRSupport +21 -14
package/eigen/Eigen/SVD +23 -18
package/eigen/Eigen/Sparse +1 -4
package/eigen/Eigen/SparseCholesky +18 -23
package/eigen/Eigen/SparseCore +18 -17
package/eigen/Eigen/SparseLU +12 -8
package/eigen/Eigen/SparseQR +16 -14
package/eigen/Eigen/StdDeque +5 -2
package/eigen/Eigen/StdList +5 -2
package/eigen/Eigen/StdVector +5 -2
package/eigen/Eigen/SuperLUSupport +30 -24
package/eigen/Eigen/ThreadPool +80 -0
package/eigen/Eigen/UmfPackSupport +19 -17
package/eigen/Eigen/Version +14 -0
package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
package/eigen/Eigen/src/Core/Array.h +341 -294
package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
package/eigen/Eigen/src/Core/Assign.h +30 -40
package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
package/eigen/Eigen/src/Core/Block.h +375 -398
package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
package/eigen/Eigen/src/Core/DenseBase.h +632 -571
package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
package/eigen/Eigen/src/Core/Diagonal.h +169 -210
package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
package/eigen/Eigen/src/Core/Dot.h +172 -222
package/eigen/Eigen/src/Core/EigenBase.h +75 -85
package/eigen/Eigen/src/Core/Fill.h +138 -0
package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
package/eigen/Eigen/src/Core/IO.h +147 -139
package/eigen/Eigen/src/Core/IndexedView.h +321 -0
package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Core/Inverse.h +56 -66
package/eigen/Eigen/src/Core/Map.h +124 -142
package/eigen/Eigen/src/Core/MapBase.h +256 -281
package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
package/eigen/Eigen/src/Core/Matrix.h +491 -416
package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
package/eigen/Eigen/src/Core/NestByValue.h +66 -85
package/eigen/Eigen/src/Core/NoAlias.h +79 -85
package/eigen/Eigen/src/Core/NumTraits.h +235 -148
package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
package/eigen/Eigen/src/Core/Product.h +260 -139
package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
package/eigen/Eigen/src/Core/Random.h +161 -136
package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
package/eigen/Eigen/src/Core/RealView.h +250 -0
package/eigen/Eigen/src/Core/Redux.h +366 -336
package/eigen/Eigen/src/Core/Ref.h +308 -209
package/eigen/Eigen/src/Core/Replicate.h +94 -106
package/eigen/Eigen/src/Core/Reshaped.h +398 -0
package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
package/eigen/Eigen/src/Core/Reverse.h +136 -145
package/eigen/Eigen/src/Core/Select.h +70 -140
package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
package/eigen/Eigen/src/Core/Solve.h +97 -111
package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
package/eigen/Eigen/src/Core/SolverBase.h +138 -101
package/eigen/Eigen/src/Core/StableNorm.h +156 -160
package/eigen/Eigen/src/Core/StlIterators.h +619 -0
package/eigen/Eigen/src/Core/Stride.h +91 -88
package/eigen/Eigen/src/Core/Swap.h +70 -38
package/eigen/Eigen/src/Core/Transpose.h +295 -273
package/eigen/Eigen/src/Core/Transpositions.h +272 -317
package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
package/eigen/Eigen/src/Core/Visitor.h +480 -216
package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
package/eigen/Eigen/src/Core/util/Assert.h +158 -0
package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
package/eigen/Eigen/src/Core/util/Constants.h +314 -263
package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
package/eigen/Eigen/src/Core/util/Macros.h +939 -646
package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
package/eigen/Eigen/src/Core/util/Meta.h +618 -426
package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
package/eigen/Eigen/src/Geometry/Transform.h +896 -953
package/eigen/Eigen/src/Geometry/Translation.h +100 -98
package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
package/eigen/Eigen/src/Householder/Householder.h +104 -122
package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
package/eigen/Eigen/src/LU/Determinant.h +60 -63
package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
package/eigen/Eigen/src/StlSupport/details.h +48 -50
package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
package/eigen/Eigen/src/misc/Image.h +41 -43
package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
package/eigen/Eigen/src/misc/Kernel.h +39 -41
package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
package/eigen/Eigen/src/misc/blas.h +83 -426
package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
package/lib/LibEigen.d.ts +4 -0
package/lib/LibEigen.js +14 -0
package/lib/index.d.ts +1 -1
package/lib/index.js +7 -3
package/package.json +2 -10
package/eigen/Eigen/CMakeLists.txt +0 -19
package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
package/eigen/Eigen/src/misc/lapack.h +0 -152
package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
package/lib/eigen.d.ts +0 -2
package/lib/eigen.js +0 -15

package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h ADDED Viewed

@@ -0,0 +1,1712 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_PACKET_MATH_GPU_H
+#define EIGEN_PACKET_MATH_GPU_H
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+namespace Eigen {
+namespace internal {
+// Read-only data cached load available.
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
+#define EIGEN_GPU_HAS_LDG 1
+#endif
+// FP16 math available.
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
+#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
+#endif
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
+#endif
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 1
+#else
+#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 0
+#endif
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+template <>
+struct is_arithmetic<float4> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<double2> {
+  enum { value = true };
+};
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef float4 type;
+  typedef float4 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasDiv = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasIGamma = 1,
+    HasIGammaDerA = 1,
+    HasGammaSampleDerAlpha = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+    HasBlend = 0,
+    HasFloor = 1,
+    HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
+  };
+};
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef double2 type;
+  typedef double2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasDiv = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasIGamma = 1,
+    HasIGammaDerA = 1,
+    HasGammaSampleDerAlpha = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+    HasBlend = 0,
+  };
+};
+template <>
+struct unpacket_traits<float4> {
+  typedef float type;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef float4 half;
+};
+template <>
+struct unpacket_traits<double2> {
+  typedef double type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef double2 half;
+};
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
+  return make_float4(from, from, from, from);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
+  return make_double2(from, from);
+}
+#if EIGEN_HAS_GPU_DEVICE_FUNCTIONS
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) {
+  return __int_as_float(__float_as_int(a) & __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) & __double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, const float& b) {
+  return __int_as_float(__float_as_int(a) | __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) | __double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, const float& b) {
+  return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) ^ __double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, const float& b) {
+  return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) & ~__double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, const float& b) {
+  return __int_as_float(a == b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, const double& b) {
+  return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, const float& b) {
+  return __int_as_float(a < b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, const double& b) {
+  return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float le_mask(const float& a, const float& b) {
+  return __int_as_float(a <= b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double le_mask(const double& a, const double& b) {
+  return __longlong_as_double(a <= b ? 0xffffffffffffffffull : 0ull);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), bitwise_andnot(a.z, b.z),
+                     bitwise_andnot(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pandnot<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a, const float4& b) {
+  return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), eq_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a, const float4& b) {
+  return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), lt_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le<float4>(const float4& a, const float4& b) {
+  return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z), le_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_eq<double2>(const double2& a, const double2& b) {
+  return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_lt<double2>(const double2& a, const double2& b) {
+  return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(const double2& a, const double2& b) {
+  return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
+}
+#endif  // EIGEN_HAS_GPU_DEVICE_FUNCTIONS
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
+  return make_float4(a, a + 1, a + 2, a + 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
+  return make_double2(a, a + 1);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x + b.x, a.y + b.y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x - b.x, a.y - b.y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
+  return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
+  return make_double2(-a.x, -a.y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) {
+  return a;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) {
+  return a;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x * b.x, a.y * b.y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x / b.x, a.y / b.y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
+  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
+  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
+  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
+  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
+  return *reinterpret_cast<const float4*>(from);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
+  return *reinterpret_cast<const double2*>(from);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
+  return make_float4(from[0], from[1], from[2], from[3]);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
+  return make_double2(from[0], from[1]);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
+  return make_float4(from[0], from[0], from[1], from[1]);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
+  return make_double2(from[0], from[0]);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
+  *reinterpret_cast<float4*>(to) = from;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
+  *reinterpret_cast<double2*>(to) = from;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+  to[2] = from.z;
+  to[3] = from.w;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __ldg(reinterpret_cast<const float4*>(from));
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __ldg(reinterpret_cast<const double2*>(from));
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return make_double2(__ldg(from + 0), __ldg(from + 1));
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
+  return make_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
+  return make_double2(from[0 * stride], from[1 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
+  to[stride * 0] = from.x;
+  to[stride * 1] = from.y;
+  to[stride * 2] = from.z;
+  to[stride * 3] = from.w;
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
+  to[stride * 0] = from.x;
+  to[stride * 1] = from.y;
+}
+template <>
+EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template <>
+EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+template <>
+EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
+  return a.x + a.y + a.z + a.w;
+}
+template <>
+EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
+  return a.x + a.y;
+}
+template <>
+EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
+  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
+  return fmax(a.x, a.y);
+}
+template <>
+EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
+  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
+  return fmin(a.x, a.y);
+}
+template <>
+EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
+  return a.x * a.y * a.z * a.w;
+}
+template <>
+EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
+  return a.x * a.y;
+}
+template <>
+EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(fabs(a.x), fabs(a.y));
+}
+template <>
+EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
+  return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
+  return make_double2(floor(a.x), floor(a.y));
+}
+template <>
+EIGEN_DEVICE_FUNC inline float4 pceil<float4>(const float4& a) {
+  return make_float4(ceilf(a.x), ceilf(a.y), ceilf(a.z), ceilf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 pceil<double2>(const double2& a) {
+  return make_double2(ceil(a.x), ceil(a.y));
+}
+template <>
+EIGEN_DEVICE_FUNC inline float4 print<float4>(const float4& a) {
+  return make_float4(rintf(a.x), rintf(a.y), rintf(a.z), rintf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 print<double2>(const double2& a) {
+  return make_double2(rint(a.x), rint(a.y));
+}
+template <>
+EIGEN_DEVICE_FUNC inline float4 ptrunc<float4>(const float4& a) {
+  return make_float4(truncf(a.x), truncf(a.y), truncf(a.z), truncf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 ptrunc<double2>(const double2& a) {
+  return make_double2(trunc(a.x), trunc(a.y));
+}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4, 4>& kernel) {
+  float tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+  tmp = kernel.packet[0].z;
+  kernel.packet[0].z = kernel.packet[2].x;
+  kernel.packet[2].x = tmp;
+  tmp = kernel.packet[0].w;
+  kernel.packet[0].w = kernel.packet[3].x;
+  kernel.packet[3].x = tmp;
+  tmp = kernel.packet[1].z;
+  kernel.packet[1].z = kernel.packet[2].y;
+  kernel.packet[2].y = tmp;
+  tmp = kernel.packet[1].w;
+  kernel.packet[1].w = kernel.packet[3].y;
+  kernel.packet[3].y = tmp;
+  tmp = kernel.packet[2].w;
+  kernel.packet[2].w = kernel.packet[3].z;
+  kernel.packet[3].z = tmp;
+}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
+  double tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+}
+#endif  // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
+// on device. There is no benefit to using them on the host anyways, since they are
+// emulated.
+#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+typedef ulonglong2 Packet4h2;
+template <>
+struct unpacket_traits<Packet4h2> {
+  typedef Eigen::half type;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet4h2 half;
+};
+template <>
+struct is_arithmetic<Packet4h2> {
+  enum { value = true };
+};
+template <>
+struct unpacket_traits<half2> {
+  typedef Eigen::half type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef half2 half;
+};
+template <>
+struct is_arithmetic<half2> {
+  enum { value = true };
+};
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet4h2 type;
+  typedef Packet4h2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasExp = 1,
+    HasExpm1 = 1,
+    HasLog = 1,
+    HasLog1p = 1
+  };
+};
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+  return __half2half2(from);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pset1<Packet4h2>(const Eigen::half& from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = pset1<half2>(from);
+  p_alias[1] = pset1<half2>(from);
+  p_alias[2] = pset1<half2>(from);
+  p_alias[3] = pset1<half2>(from);
+  return r;
+}
+namespace {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
+  return *reinterpret_cast<const half2*>(from);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
+  return __halves2half2(from[0], from[0]);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) {
+  *reinterpret_cast<half2*>(to) = from;
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) {
+  to[0] = __low2half(from);
+  to[1] = __high2half(from);
+}
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  // Input is guaranteed to be properly aligned.
+  return __ldg(reinterpret_cast<const half2*>(from));
+#else
+  return __halves2half2(*(from + 0), *(from + 1));
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __halves2half2(__ldg(from + 0), __ldg(from + 1));
+#else
+  return __halves2half2(*(from + 0), *(from + 1));
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) {
+  return __halves2half2(from[0 * stride], from[1 * stride]);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) {
+  to[stride * 0] = __low2half(from);
+  to[stride * 1] = __high2half(from);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { return __low2half(a); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
+  return __halves2half2(result1, result2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  return pset1<half2>(true_half);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  return pset1<half2>(false_half);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& kernel) {
+  __half a1 = __low2half(kernel.packet[0]);
+  __half a2 = __high2half(kernel.packet[0]);
+  __half b1 = __low2half(kernel.packet[1]);
+  __half b2 = __high2half(kernel.packet[1]);
+  kernel.packet[0] = __halves2half2(a1, b1);
+  kernel.packet[1] = __halves2half2(a2, b2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+#else
+  float f = __half2float(a) + 1.0f;
+  return __halves2half2(a, __float2half(f));
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
+  half mask_low = __low2half(mask);
+  half mask_high = __high2half(mask);
+  half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
+  half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
+  return __halves2half2(result_low, result_high);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
+  half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
+  return __halves2half2(eq1, eq2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
+  half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
+  return __halves2half2(eq1, eq2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(const half2& a, const half2& b) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half eq1 = __half2float(a1) <= __half2float(b1) ? true_half : false_half;
+  half eq2 = __half2float(a2) <= __half2float(b2) ? true_half : false_half;
+  return __halves2half2(eq1, eq2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
+  return __halves2half2(result1, result2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
+  return __halves2half2(result1, result2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
+  return __halves2half2(result1, result2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
+  return __halves2half2(result1, result2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hsub2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 - b1;
+  float r2 = a2 - b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hneg2(a);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return __floats2half2_rn(-a1, -a2);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hfma2(a, b, c);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float c1 = __low2float(c);
+  float c2 = __high2float(c);
+  float r1 = a1 * b1 + c1;
+  float r2 = a2 * b2 + c2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __h2div(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(__float2half(a1 + a2));
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hgt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 > a2 ? __low2half(a) : __high2half(a);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hlt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 < a2 ? __low2half(a) : __high2half(a);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(__float2half(a1 * a2));
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = log1pf(a1);
+  float r2 = log1pf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = expm1f(a1);
+  float r2 = expm1f(a2);
+  return __floats2half2_rn(r1, r2);
+}
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
+#else
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = logf(a1);
+  float r2 = logf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = expf(a1);
+  float r2 = expf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = sqrtf(a1);
+  float r2 = sqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = rsqrtf(a1);
+  float r2 = rsqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+#endif
+}  // namespace
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pload<Packet4h2>(const Eigen::half* from) {
+  return *reinterpret_cast<const Packet4h2*>(from);
+}
+// unaligned load;
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploadu<Packet4h2>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = ploadu(from + 0);
+  p_alias[1] = ploadu(from + 2);
+  p_alias[2] = ploadu(from + 4);
+  p_alias[3] = ploadu(from + 6);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploaddup<Packet4h2>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = ploaddup(from + 0);
+  p_alias[1] = ploaddup(from + 1);
+  p_alias[2] = ploaddup(from + 2);
+  p_alias[3] = ploaddup(from + 3);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
+  *reinterpret_cast<Packet4h2*>(to) = from;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
+  const half2* from_alias = reinterpret_cast<const half2*>(&from);
+  pstoreu(to + 0, from_alias[0]);
+  pstoreu(to + 2, from_alias[1]);
+  pstoreu(to + 4, from_alias[2]);
+  pstoreu(to + 6, from_alias[3]);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  Packet4h2 r;
+  r = __ldg(reinterpret_cast<const Packet4h2*>(from));
+  return r;
+#else
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  r_alias[0] = ploadt_ro_aligned(from + 0);
+  r_alias[1] = ploadt_ro_aligned(from + 2);
+  r_alias[2] = ploadt_ro_aligned(from + 4);
+  r_alias[3] = ploadt_ro_aligned(from + 6);
+  return r;
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  r_alias[0] = ploadt_ro_unaligned(from + 0);
+  r_alias[1] = ploadt_ro_unaligned(from + 2);
+  r_alias[2] = ploadt_ro_unaligned(from + 4);
+  r_alias[3] = ploadt_ro_unaligned(from + 6);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
+  p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
+  p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
+  p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(Eigen::half* to, const Packet4h2& from,
+                                                                            Index stride) {
+  const half2* from_alias = reinterpret_cast<const half2*>(&from);
+  pscatter(to + stride * 0, from_alias[0], stride);
+  pscatter(to + stride * 2, from_alias[1], stride);
+  pscatter(to + stride * 4, from_alias[2], stride);
+  pscatter(to + stride * 6, from_alias[3], stride);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(const Packet4h2& a) {
+  return pfirst(*(reinterpret_cast<const half2*>(&a)));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  p_alias[0] = pabs(a_alias[0]);
+  p_alias[1] = pabs(a_alias[1]);
+  p_alias[2] = pabs(a_alias[2]);
+  p_alias[3] = pabs(a_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(const Packet4h2& /*a*/) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  return pset1<Packet4h2>(true_half);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) {
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  return pset1<Packet4h2>(false_half);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(double* d_row0, double* d_row1, double* d_row2,
+                                                             double* d_row3, double* d_row4, double* d_row5,
+                                                             double* d_row6, double* d_row7) {
+  double d_tmp;
+  d_tmp = d_row0[1];
+  d_row0[1] = d_row4[0];
+  d_row4[0] = d_tmp;
+  d_tmp = d_row1[1];
+  d_row1[1] = d_row5[0];
+  d_row5[0] = d_tmp;
+  d_tmp = d_row2[1];
+  d_row2[1] = d_row6[0];
+  d_row6[0] = d_tmp;
+  d_tmp = d_row3[1];
+  d_row3[1] = d_row7[0];
+  d_row7[0] = d_tmp;
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(half2* f_row0, half2* f_row1, half2* f_row2,
+                                                            half2* f_row3) {
+  half2 f_tmp;
+  f_tmp = f_row0[1];
+  f_row0[1] = f_row2[0];
+  f_row2[0] = f_tmp;
+  f_tmp = f_row1[1];
+  f_row1[1] = f_row3[0];
+  f_row3[0] = f_tmp;
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) {
+  __half a1 = __low2half(f0);
+  __half a2 = __high2half(f0);
+  __half b1 = __low2half(f1);
+  __half b2 = __high2half(f1);
+  f0 = __halves2half2(a1, b1);
+  f1 = __halves2half2(a2, b2);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4h2, 8>& kernel) {
+  double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
+  double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
+  double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
+  double* d_row3 = reinterpret_cast<double*>(&kernel.packet[3]);
+  double* d_row4 = reinterpret_cast<double*>(&kernel.packet[4]);
+  double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
+  double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
+  double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
+  ptranspose_double(d_row0, d_row1, d_row2, d_row3, d_row4, d_row5, d_row6, d_row7);
+  half2* f_row0 = reinterpret_cast<half2*>(d_row0);
+  half2* f_row1 = reinterpret_cast<half2*>(d_row1);
+  half2* f_row2 = reinterpret_cast<half2*>(d_row2);
+  half2* f_row3 = reinterpret_cast<half2*>(d_row3);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+  f_row0 = reinterpret_cast<half2*>(d_row0 + 1);
+  f_row1 = reinterpret_cast<half2*>(d_row1 + 1);
+  f_row2 = reinterpret_cast<half2*>(d_row2 + 1);
+  f_row3 = reinterpret_cast<half2*>(d_row3 + 1);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+  f_row0 = reinterpret_cast<half2*>(d_row4);
+  f_row1 = reinterpret_cast<half2*>(d_row5);
+  f_row2 = reinterpret_cast<half2*>(d_row6);
+  f_row3 = reinterpret_cast<half2*>(d_row7);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+  f_row0 = reinterpret_cast<half2*>(d_row4 + 1);
+  f_row1 = reinterpret_cast<half2*>(d_row5 + 1);
+  f_row2 = reinterpret_cast<half2*>(d_row6 + 1);
+  f_row3 = reinterpret_cast<half2*>(d_row7 + 1);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::half& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
+  p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), __hadd(a, __float2half(3.0f)));
+  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
+  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
+  return r;
+#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  half2 b = pset1<half2>(a);
+  half2 c;
+  half2 half_offset0 = __halves2half2(__float2half(0.0f), __float2half(2.0f));
+  half2 half_offset1 = __halves2half2(__float2half(4.0f), __float2half(6.0f));
+  c = __hadd2(b, half_offset0);
+  r_alias[0] = plset(__low2half(c));
+  r_alias[1] = plset(__high2half(c));
+  c = __hadd2(b, half_offset1);
+  r_alias[2] = plset(__low2half(c));
+  r_alias[3] = plset(__high2half(c));
+  return r;
+#else
+  float f = __half2float(a);
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
+  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
+  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
+  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
+  return r;
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
+                                                                   const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
+  r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
+  r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
+  r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
+  r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
+  r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
+  r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_lt<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pcmp_lt(a_alias[0], b_alias[0]);
+  r_alias[1] = pcmp_lt(a_alias[1], b_alias[1]);
+  r_alias[2] = pcmp_lt(a_alias[2], b_alias[2]);
+  r_alias[3] = pcmp_lt(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_le<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pcmp_le(a_alias[0], b_alias[0]);
+  r_alias[1] = pcmp_le(a_alias[1], b_alias[1]);
+  r_alias[2] = pcmp_le(a_alias[2], b_alias[2]);
+  r_alias[3] = pcmp_le(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pand(a_alias[0], b_alias[0]);
+  r_alias[1] = pand(a_alias[1], b_alias[1]);
+  r_alias[2] = pand(a_alias[2], b_alias[2]);
+  r_alias[3] = pand(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = por(a_alias[0], b_alias[0]);
+  r_alias[1] = por(a_alias[1], b_alias[1]);
+  r_alias[2] = por(a_alias[2], b_alias[2]);
+  r_alias[3] = por(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pxor(a_alias[0], b_alias[0]);
+  r_alias[1] = pxor(a_alias[1], b_alias[1]);
+  r_alias[2] = pxor(a_alias[2], b_alias[2]);
+  r_alias[3] = pxor(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pandnot(a_alias[0], b_alias[0]);
+  r_alias[1] = pandnot(a_alias[1], b_alias[1]);
+  r_alias[2] = pandnot(a_alias[2], b_alias[2]);
+  r_alias[3] = pandnot(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = padd(a_alias[0], b_alias[0]);
+  r_alias[1] = padd(a_alias[1], b_alias[1]);
+  r_alias[2] = padd(a_alias[2], b_alias[2]);
+  r_alias[3] = padd(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = psub(a_alias[0], b_alias[0]);
+  r_alias[1] = psub(a_alias[1], b_alias[1]);
+  r_alias[2] = psub(a_alias[2], b_alias[2]);
+  r_alias[3] = psub(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pnegate(a_alias[0]);
+  r_alias[1] = pnegate(a_alias[1]);
+  r_alias[2] = pnegate(a_alias[2]);
+  r_alias[3] = pnegate(a_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
+  return a;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmul(a_alias[0], b_alias[0]);
+  r_alias[1] = pmul(a_alias[1], b_alias[1]);
+  r_alias[2] = pmul(a_alias[2], b_alias[2]);
+  r_alias[3] = pmul(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(const Packet4h2& a, const Packet4h2& b,
+                                                                 const Packet4h2& c) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  const half2* c_alias = reinterpret_cast<const half2*>(&c);
+  r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
+  r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
+  r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
+  r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pdiv(a_alias[0], b_alias[0]);
+  r_alias[1] = pdiv(a_alias[1], b_alias[1]);
+  r_alias[2] = pdiv(a_alias[2], b_alias[2]);
+  r_alias[3] = pdiv(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmin(a_alias[0], b_alias[0]);
+  r_alias[1] = pmin(a_alias[1], b_alias[1]);
+  r_alias[2] = pmin(a_alias[2], b_alias[2]);
+  r_alias[3] = pmin(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmax(a_alias[0], b_alias[0]);
+  r_alias[1] = pmax(a_alias[1], b_alias[1]);
+  r_alias[2] = pmax(a_alias[2], b_alias[2]);
+  r_alias[3] = pmax(a_alias[3], b_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  return predux(a_alias[0]) + predux(a_alias[1]) + predux(a_alias[2]) + predux(a_alias[3]);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1]));
+  half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
+  __half first = predux_max(m0);
+  __half second = predux_max(m1);
+#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  return (__hgt(first, second) ? first : second);
+#else
+  float ffirst = __half2float(first);
+  float fsecond = __half2float(second);
+  return (ffirst > fsecond) ? first : second;
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1]));
+  half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
+  __half first = predux_min(m0);
+  __half second = predux_min(m1);
+#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  return (__hlt(first, second) ? first : second);
+#else
+  float ffirst = __half2float(first);
+  float fsecond = __half2float(second);
+  return (ffirst < fsecond) ? first : second;
+#endif
+}
+// likely overflow/underflow
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), pmul(a_alias[2], a_alias[3])));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog1p<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = plog1p(a_alias[0]);
+  r_alias[1] = plog1p(a_alias[1]);
+  r_alias[2] = plog1p(a_alias[2]);
+  r_alias[3] = plog1p(a_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexpm1<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pexpm1(a_alias[0]);
+  r_alias[1] = pexpm1(a_alias[1]);
+  r_alias[2] = pexpm1(a_alias[2]);
+  r_alias[3] = pexpm1(a_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = plog(a_alias[0]);
+  r_alias[1] = plog(a_alias[1]);
+  r_alias[2] = plog(a_alias[2]);
+  r_alias[3] = plog(a_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pexp(a_alias[0]);
+  r_alias[1] = pexp(a_alias[1]);
+  r_alias[2] = pexp(a_alias[2]);
+  r_alias[3] = pexp(a_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = psqrt(a_alias[0]);
+  r_alias[1] = psqrt(a_alias[1]);
+  r_alias[2] = psqrt(a_alias[2]);
+  r_alias[3] = psqrt(a_alias[3]);
+  return r;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = prsqrt(a_alias[0]);
+  r_alias[1] = prsqrt(a_alias[1]);
+  r_alias[2] = prsqrt(a_alias[2]);
+  r_alias[3] = prsqrt(a_alias[3]);
+  return r;
+}
+// The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
+// the implementation of GPU half reduction.
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __h2div(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+#endif  // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+#undef EIGEN_GPU_HAS_LDG
+#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
+#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
+}  // end namespace internal
+}  // end namespace Eigen
+#endif  // EIGEN_PACKET_MATH_GPU_H