@smake/eigen 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (287) hide show
  1. package/README.md +1 -1
  2. package/eigen/COPYING.APACHE +203 -0
  3. package/eigen/COPYING.BSD +26 -0
  4. package/eigen/COPYING.GPL +674 -0
  5. package/eigen/COPYING.LGPL +502 -0
  6. package/eigen/COPYING.MINPACK +51 -0
  7. package/eigen/COPYING.MPL2 +373 -0
  8. package/eigen/COPYING.README +18 -0
  9. package/eigen/Eigen/Cholesky +0 -1
  10. package/eigen/Eigen/Core +108 -266
  11. package/eigen/Eigen/Eigenvalues +0 -1
  12. package/eigen/Eigen/Geometry +3 -6
  13. package/eigen/Eigen/Householder +0 -1
  14. package/eigen/Eigen/Jacobi +0 -1
  15. package/eigen/Eigen/KLUSupport +41 -0
  16. package/eigen/Eigen/LU +2 -5
  17. package/eigen/Eigen/OrderingMethods +0 -3
  18. package/eigen/Eigen/PaStiXSupport +1 -0
  19. package/eigen/Eigen/PardisoSupport +0 -0
  20. package/eigen/Eigen/QR +0 -1
  21. package/eigen/Eigen/QtAlignedMalloc +0 -1
  22. package/eigen/Eigen/SVD +0 -1
  23. package/eigen/Eigen/Sparse +0 -2
  24. package/eigen/Eigen/SparseCholesky +0 -8
  25. package/eigen/Eigen/SparseLU +4 -0
  26. package/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
  27. package/eigen/Eigen/src/Cholesky/LLT.h +39 -23
  28. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
  29. package/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
  30. package/eigen/Eigen/src/Core/Array.h +99 -11
  31. package/eigen/Eigen/src/Core/ArrayBase.h +1 -1
  32. package/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
  33. package/eigen/Eigen/src/Core/Assign.h +1 -1
  34. package/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
  35. package/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
  36. package/eigen/Eigen/src/Core/BandMatrix.h +16 -16
  37. package/eigen/Eigen/src/Core/Block.h +56 -60
  38. package/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
  39. package/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
  40. package/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
  41. package/eigen/Eigen/src/Core/CoreIterators.h +5 -0
  42. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
  43. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
  44. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
  45. package/eigen/Eigen/src/Core/CwiseUnaryView.h +12 -10
  46. package/eigen/Eigen/src/Core/DenseBase.h +128 -39
  47. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
  48. package/eigen/Eigen/src/Core/DenseStorage.h +150 -68
  49. package/eigen/Eigen/src/Core/Diagonal.h +21 -23
  50. package/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
  51. package/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
  52. package/eigen/Eigen/src/Core/Dot.h +10 -10
  53. package/eigen/Eigen/src/Core/EigenBase.h +10 -9
  54. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
  55. package/eigen/Eigen/src/Core/Fuzzy.h +3 -3
  56. package/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
  57. package/eigen/Eigen/src/Core/GenericPacketMath.h +597 -147
  58. package/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
  59. package/eigen/Eigen/src/Core/IO.h +40 -7
  60. package/eigen/Eigen/src/Core/IndexedView.h +237 -0
  61. package/eigen/Eigen/src/Core/Inverse.h +9 -10
  62. package/eigen/Eigen/src/Core/Map.h +7 -7
  63. package/eigen/Eigen/src/Core/MapBase.h +5 -3
  64. package/eigen/Eigen/src/Core/MathFunctions.h +756 -120
  65. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
  66. package/eigen/Eigen/src/Core/Matrix.h +131 -25
  67. package/eigen/Eigen/src/Core/MatrixBase.h +19 -2
  68. package/eigen/Eigen/src/Core/NestByValue.h +25 -50
  69. package/eigen/Eigen/src/Core/NoAlias.h +4 -3
  70. package/eigen/Eigen/src/Core/NumTraits.h +107 -20
  71. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  72. package/eigen/Eigen/src/Core/PermutationMatrix.h +3 -3
  73. package/eigen/Eigen/src/Core/PlainObjectBase.h +145 -54
  74. package/eigen/Eigen/src/Core/Product.h +30 -25
  75. package/eigen/Eigen/src/Core/ProductEvaluators.h +183 -142
  76. package/eigen/Eigen/src/Core/Random.h +37 -1
  77. package/eigen/Eigen/src/Core/Redux.h +180 -170
  78. package/eigen/Eigen/src/Core/Ref.h +118 -21
  79. package/eigen/Eigen/src/Core/Replicate.h +8 -8
  80. package/eigen/Eigen/src/Core/Reshaped.h +454 -0
  81. package/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
  82. package/eigen/Eigen/src/Core/Reverse.h +18 -12
  83. package/eigen/Eigen/src/Core/Select.h +8 -6
  84. package/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
  85. package/eigen/Eigen/src/Core/Solve.h +14 -14
  86. package/eigen/Eigen/src/Core/SolveTriangular.h +13 -13
  87. package/eigen/Eigen/src/Core/SolverBase.h +41 -3
  88. package/eigen/Eigen/src/Core/StableNorm.h +100 -70
  89. package/eigen/Eigen/src/Core/StlIterators.h +463 -0
  90. package/eigen/Eigen/src/Core/Stride.h +9 -4
  91. package/eigen/Eigen/src/Core/Swap.h +5 -4
  92. package/eigen/Eigen/src/Core/Transpose.h +86 -27
  93. package/eigen/Eigen/src/Core/Transpositions.h +26 -8
  94. package/eigen/Eigen/src/Core/TriangularMatrix.h +88 -72
  95. package/eigen/Eigen/src/Core/VectorBlock.h +5 -5
  96. package/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
  97. package/eigen/Eigen/src/Core/Visitor.h +137 -29
  98. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
  99. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
  100. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
  101. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
  102. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  103. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +186 -213
  104. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1250 -252
  105. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  106. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
  107. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
  108. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  109. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  110. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  111. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
  112. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
  113. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  114. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
  115. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  116. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  117. package/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
  118. package/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
  119. package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  120. package/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
  121. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  122. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  123. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  124. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  125. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  126. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  127. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
  128. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  129. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
  130. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
  131. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  132. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
  133. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
  134. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
  135. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
  136. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  137. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  138. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  139. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  140. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  141. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  142. package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  143. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  144. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
  145. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
  146. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
  147. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
  148. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
  149. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
  150. package/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
  151. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +354 -15
  152. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1073 -585
  153. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +29 -7
  154. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +4 -4
  155. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +1 -1
  156. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
  157. package/eigen/Eigen/src/Core/products/Parallelizer.h +23 -9
  158. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +23 -6
  159. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
  160. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +2 -2
  161. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
  162. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +3 -3
  163. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +5 -3
  164. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
  165. package/eigen/Eigen/src/Core/util/BlasUtil.h +208 -124
  166. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  167. package/eigen/Eigen/src/Core/util/Constants.h +25 -9
  168. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +14 -2
  169. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +28 -4
  170. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  171. package/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
  172. package/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
  173. package/eigen/Eigen/src/Core/util/Macros.h +661 -250
  174. package/eigen/Eigen/src/Core/util/Memory.h +222 -52
  175. package/eigen/Eigen/src/Core/util/Meta.h +349 -105
  176. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  177. package/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
  178. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  179. package/eigen/Eigen/src/Core/util/XprHelper.h +48 -30
  180. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
  181. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +1 -1
  182. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
  183. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
  184. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
  185. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
  186. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
  187. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +10 -5
  188. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +75 -42
  189. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
  190. package/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
  191. package/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
  192. package/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
  193. package/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
  194. package/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
  195. package/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
  196. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
  197. package/eigen/Eigen/src/Geometry/Quaternion.h +52 -14
  198. package/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
  199. package/eigen/Eigen/src/Geometry/Scaling.h +22 -4
  200. package/eigen/Eigen/src/Geometry/Transform.h +86 -65
  201. package/eigen/Eigen/src/Geometry/Translation.h +6 -6
  202. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  203. package/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
  204. package/eigen/Eigen/src/Householder/Householder.h +8 -4
  205. package/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
  206. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
  207. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
  208. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
  209. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
  210. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
  211. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
  212. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
  213. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
  214. package/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
  215. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  216. package/eigen/Eigen/src/LU/Determinant.h +35 -19
  217. package/eigen/Eigen/src/LU/FullPivLU.h +29 -43
  218. package/eigen/Eigen/src/LU/InverseImpl.h +25 -8
  219. package/eigen/Eigen/src/LU/PartialPivLU.h +67 -57
  220. package/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
  221. package/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
  222. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
  223. package/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
  224. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
  225. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +10 -9
  226. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
  227. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
  228. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
  229. package/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
  230. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
  231. package/eigen/Eigen/src/SVD/BDCSVD.h +137 -48
  232. package/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
  233. package/eigen/Eigen/src/SVD/SVDBase.h +82 -21
  234. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
  235. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +16 -8
  236. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +11 -36
  237. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
  238. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
  239. package/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
  240. package/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
  241. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
  242. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
  243. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +2 -2
  244. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
  245. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +124 -10
  246. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
  247. package/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
  248. package/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
  249. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +3 -0
  250. package/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
  251. package/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
  252. package/eigen/Eigen/src/SparseLU/SparseLU.h +160 -10
  253. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
  254. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
  255. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
  256. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
  257. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
  258. package/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
  259. package/eigen/Eigen/src/StlSupport/StdDeque.h +2 -14
  260. package/eigen/Eigen/src/StlSupport/StdList.h +2 -2
  261. package/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
  262. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
  263. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
  264. package/eigen/Eigen/src/misc/lapacke.h +5 -4
  265. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +27 -1
  266. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
  267. package/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
  268. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
  269. package/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  270. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
  271. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
  272. package/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
  273. package/eigen/README.md +5 -0
  274. package/lib/LibEigen.d.ts +4 -0
  275. package/lib/LibEigen.js +14 -0
  276. package/lib/index.d.ts +1 -1
  277. package/lib/index.js +7 -3
  278. package/package.json +2 -10
  279. package/eigen/Eigen/CMakeLists.txt +0 -19
  280. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  281. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  282. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  283. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  284. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  285. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  286. package/lib/eigen.d.ts +0 -2
  287. package/lib/eigen.js +0 -15
@@ -1,1124 +0,0 @@
1
- // This file is part of Eigen, a lightweight C++ template library
2
- // for linear algebra.
3
- //
4
- // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
- //
6
- // This Source Code Form is subject to the terms of the Mozilla
7
- // Public License v. 2.0. If a copy of the MPL was not distributed
8
- // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
-
10
- #ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
11
- #define EIGEN_PACKET_MATH_HALF_CUDA_H
12
-
13
-
14
- namespace Eigen {
15
- namespace internal {
16
-
17
- // Most of the following operations require arch >= 3.0
18
- #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
19
-
20
- template<> struct is_arithmetic<half2> { enum { value = true }; };
21
-
22
- template<> struct packet_traits<Eigen::half> : default_packet_traits
23
- {
24
- typedef half2 type;
25
- typedef half2 half;
26
- enum {
27
- Vectorizable = 1,
28
- AlignedOnScalar = 1,
29
- size=2,
30
- HasHalfPacket = 0,
31
- HasAdd = 1,
32
- HasMul = 1,
33
- HasDiv = 1,
34
- HasSqrt = 1,
35
- HasRsqrt = 1,
36
- HasExp = 1,
37
- HasLog = 1,
38
- HasLog1p = 1
39
- };
40
- };
41
-
42
- template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
43
-
44
- template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
45
- return __half2half2(from);
46
- }
47
-
48
- template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
49
- return *reinterpret_cast<const half2*>(from);
50
- }
51
-
52
- template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
53
- return __halves2half2(from[0], from[1]);
54
- }
55
-
56
- template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
57
- return __halves2half2(from[0], from[0]);
58
- }
59
-
60
- template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
61
- *reinterpret_cast<half2*>(to) = from;
62
- }
63
-
64
- template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
65
- to[0] = __low2half(from);
66
- to[1] = __high2half(from);
67
- }
68
-
69
- template<>
70
- __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
71
- #if __CUDA_ARCH__ >= 350
72
- return __ldg((const half2*)from);
73
- #else
74
- return __halves2half2(*(from+0), *(from+1));
75
- #endif
76
- }
77
-
78
- template<>
79
- __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
80
- #if __CUDA_ARCH__ >= 350
81
- return __halves2half2(__ldg(from+0), __ldg(from+1));
82
- #else
83
- return __halves2half2(*(from+0), *(from+1));
84
- #endif
85
- }
86
-
87
- template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
88
- return __halves2half2(from[0*stride], from[1*stride]);
89
- }
90
-
91
- template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
92
- to[stride*0] = __low2half(from);
93
- to[stride*1] = __high2half(from);
94
- }
95
-
96
- template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
97
- return __low2half(a);
98
- }
99
-
100
- template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
101
- half2 result;
102
- unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
103
- *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
104
- return result;
105
- }
106
-
107
-
108
- __device__ EIGEN_STRONG_INLINE void
109
- ptranspose(PacketBlock<half2,2>& kernel) {
110
- __half a1 = __low2half(kernel.packet[0]);
111
- __half a2 = __high2half(kernel.packet[0]);
112
- __half b1 = __low2half(kernel.packet[1]);
113
- __half b2 = __high2half(kernel.packet[1]);
114
- kernel.packet[0] = __halves2half2(a1, b1);
115
- kernel.packet[1] = __halves2half2(a2, b2);
116
- }
117
-
118
- template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
119
- #if __CUDA_ARCH__ >= 530
120
- return __halves2half2(a, __hadd(a, __float2half(1.0f)));
121
- #else
122
- float f = __half2float(a) + 1.0f;
123
- return __halves2half2(a, __float2half(f));
124
- #endif
125
- }
126
-
127
- template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
128
- #if __CUDA_ARCH__ >= 530
129
- return __hadd2(a, b);
130
- #else
131
- float a1 = __low2float(a);
132
- float a2 = __high2float(a);
133
- float b1 = __low2float(b);
134
- float b2 = __high2float(b);
135
- float r1 = a1 + b1;
136
- float r2 = a2 + b2;
137
- return __floats2half2_rn(r1, r2);
138
- #endif
139
- }
140
-
141
- template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
142
- #if __CUDA_ARCH__ >= 530
143
- return __hsub2(a, b);
144
- #else
145
- float a1 = __low2float(a);
146
- float a2 = __high2float(a);
147
- float b1 = __low2float(b);
148
- float b2 = __high2float(b);
149
- float r1 = a1 - b1;
150
- float r2 = a2 - b2;
151
- return __floats2half2_rn(r1, r2);
152
- #endif
153
- }
154
-
155
- template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
156
- #if __CUDA_ARCH__ >= 530
157
- return __hneg2(a);
158
- #else
159
- float a1 = __low2float(a);
160
- float a2 = __high2float(a);
161
- return __floats2half2_rn(-a1, -a2);
162
- #endif
163
- }
164
-
165
- template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
166
-
167
- template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
168
- #if __CUDA_ARCH__ >= 530
169
- return __hmul2(a, b);
170
- #else
171
- float a1 = __low2float(a);
172
- float a2 = __high2float(a);
173
- float b1 = __low2float(b);
174
- float b2 = __high2float(b);
175
- float r1 = a1 * b1;
176
- float r2 = a2 * b2;
177
- return __floats2half2_rn(r1, r2);
178
- #endif
179
- }
180
-
181
- template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
182
- #if __CUDA_ARCH__ >= 530
183
- return __hfma2(a, b, c);
184
- #else
185
- float a1 = __low2float(a);
186
- float a2 = __high2float(a);
187
- float b1 = __low2float(b);
188
- float b2 = __high2float(b);
189
- float c1 = __low2float(c);
190
- float c2 = __high2float(c);
191
- float r1 = a1 * b1 + c1;
192
- float r2 = a2 * b2 + c2;
193
- return __floats2half2_rn(r1, r2);
194
- #endif
195
- }
196
-
197
- template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
198
- float a1 = __low2float(a);
199
- float a2 = __high2float(a);
200
- float b1 = __low2float(b);
201
- float b2 = __high2float(b);
202
- float r1 = a1 / b1;
203
- float r2 = a2 / b2;
204
- return __floats2half2_rn(r1, r2);
205
- }
206
-
207
- template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
208
- float a1 = __low2float(a);
209
- float a2 = __high2float(a);
210
- float b1 = __low2float(b);
211
- float b2 = __high2float(b);
212
- __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
213
- __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
214
- return __halves2half2(r1, r2);
215
- }
216
-
217
- template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
218
- float a1 = __low2float(a);
219
- float a2 = __high2float(a);
220
- float b1 = __low2float(b);
221
- float b2 = __high2float(b);
222
- __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
223
- __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
224
- return __halves2half2(r1, r2);
225
- }
226
-
227
- template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
228
- #if __CUDA_ARCH__ >= 530
229
- return __hadd(__low2half(a), __high2half(a));
230
- #else
231
- float a1 = __low2float(a);
232
- float a2 = __high2float(a);
233
- return Eigen::half(__float2half_rn(a1 + a2));
234
- #endif
235
- }
236
-
237
- template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
238
- #if __CUDA_ARCH__ >= 530
239
- __half first = __low2half(a);
240
- __half second = __high2half(a);
241
- return __hgt(first, second) ? first : second;
242
- #else
243
- float a1 = __low2float(a);
244
- float a2 = __high2float(a);
245
- return a1 > a2 ? __low2half(a) : __high2half(a);
246
- #endif
247
- }
248
-
249
- template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
250
- #if __CUDA_ARCH__ >= 530
251
- __half first = __low2half(a);
252
- __half second = __high2half(a);
253
- return __hlt(first, second) ? first : second;
254
- #else
255
- float a1 = __low2float(a);
256
- float a2 = __high2float(a);
257
- return a1 < a2 ? __low2half(a) : __high2half(a);
258
- #endif
259
- }
260
-
261
- template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
262
- #if __CUDA_ARCH__ >= 530
263
- return __hmul(__low2half(a), __high2half(a));
264
- #else
265
- float a1 = __low2float(a);
266
- float a2 = __high2float(a);
267
- return Eigen::half(__float2half_rn(a1 * a2));
268
- #endif
269
- }
270
-
271
- template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
272
- float a1 = __low2float(a);
273
- float a2 = __high2float(a);
274
- float r1 = log1pf(a1);
275
- float r2 = log1pf(a2);
276
- return __floats2half2_rn(r1, r2);
277
- }
278
-
279
- #if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
280
-
281
- template<> __device__ EIGEN_STRONG_INLINE
282
- half2 plog<half2>(const half2& a) {
283
- return h2log(a);
284
- }
285
-
286
- template<> __device__ EIGEN_STRONG_INLINE
287
- half2 pexp<half2>(const half2& a) {
288
- return h2exp(a);
289
- }
290
-
291
- template<> __device__ EIGEN_STRONG_INLINE
292
- half2 psqrt<half2>(const half2& a) {
293
- return h2sqrt(a);
294
- }
295
-
296
- template<> __device__ EIGEN_STRONG_INLINE
297
- half2 prsqrt<half2>(const half2& a) {
298
- return h2rsqrt(a);
299
- }
300
-
301
- #else
302
-
303
- template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
304
- float a1 = __low2float(a);
305
- float a2 = __high2float(a);
306
- float r1 = logf(a1);
307
- float r2 = logf(a2);
308
- return __floats2half2_rn(r1, r2);
309
- }
310
-
311
- template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
312
- float a1 = __low2float(a);
313
- float a2 = __high2float(a);
314
- float r1 = expf(a1);
315
- float r2 = expf(a2);
316
- return __floats2half2_rn(r1, r2);
317
- }
318
-
319
- template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
320
- float a1 = __low2float(a);
321
- float a2 = __high2float(a);
322
- float r1 = sqrtf(a1);
323
- float r2 = sqrtf(a2);
324
- return __floats2half2_rn(r1, r2);
325
- }
326
-
327
- template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
328
- float a1 = __low2float(a);
329
- float a2 = __high2float(a);
330
- float r1 = rsqrtf(a1);
331
- float r2 = rsqrtf(a2);
332
- return __floats2half2_rn(r1, r2);
333
- }
334
-
335
- #endif
336
-
337
- #elif defined EIGEN_VECTORIZE_AVX512
338
-
339
- typedef struct {
340
- __m256i x;
341
- } Packet16h;
342
-
343
-
344
- template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
345
-
346
- template <>
347
- struct packet_traits<half> : default_packet_traits {
348
- typedef Packet16h type;
349
- // There is no half-size packet for Packet16h.
350
- typedef Packet16h half;
351
- enum {
352
- Vectorizable = 1,
353
- AlignedOnScalar = 1,
354
- size = 16,
355
- HasHalfPacket = 0,
356
- HasAdd = 0,
357
- HasSub = 0,
358
- HasMul = 0,
359
- HasNegate = 0,
360
- HasAbs = 0,
361
- HasAbs2 = 0,
362
- HasMin = 0,
363
- HasMax = 0,
364
- HasConj = 0,
365
- HasSetLinear = 0,
366
- HasDiv = 0,
367
- HasSqrt = 0,
368
- HasRsqrt = 0,
369
- HasExp = 0,
370
- HasLog = 0,
371
- HasBlend = 0
372
- };
373
- };
374
-
375
-
376
- template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
377
-
378
- template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
379
- Packet16h result;
380
- result.x = _mm256_set1_epi16(from.x);
381
- return result;
382
- }
383
-
384
- template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
385
- return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0)));
386
- }
387
-
388
- template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
389
- Packet16h result;
390
- result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
391
- return result;
392
- }
393
-
394
- template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
395
- Packet16h result;
396
- result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
397
- return result;
398
- }
399
-
400
- template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
401
- _mm256_store_si256((__m256i*)to, from.x);
402
- }
403
-
404
- template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
405
- _mm256_storeu_si256((__m256i*)to, from.x);
406
- }
407
-
408
- template<> EIGEN_STRONG_INLINE Packet16h
409
- ploadquad(const Eigen::half* from) {
410
- Packet16h result;
411
- unsigned short a = from[0].x;
412
- unsigned short b = from[1].x;
413
- unsigned short c = from[2].x;
414
- unsigned short d = from[3].x;
415
- result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
416
- return result;
417
- }
418
-
419
- EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
420
- #ifdef EIGEN_HAS_FP16_C
421
- return _mm512_cvtph_ps(a.x);
422
- #else
423
- EIGEN_ALIGN64 half aux[16];
424
- pstore(aux, a);
425
- float f0(aux[0]);
426
- float f1(aux[1]);
427
- float f2(aux[2]);
428
- float f3(aux[3]);
429
- float f4(aux[4]);
430
- float f5(aux[5]);
431
- float f6(aux[6]);
432
- float f7(aux[7]);
433
- float f8(aux[8]);
434
- float f9(aux[9]);
435
- float fa(aux[10]);
436
- float fb(aux[11]);
437
- float fc(aux[12]);
438
- float fd(aux[13]);
439
- float fe(aux[14]);
440
- float ff(aux[15]);
441
-
442
- return _mm512_set_ps(
443
- ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
444
- #endif
445
- }
446
-
447
- EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
448
- #ifdef EIGEN_HAS_FP16_C
449
- Packet16h result;
450
- result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
451
- return result;
452
- #else
453
- EIGEN_ALIGN64 float aux[16];
454
- pstore(aux, a);
455
- half h0(aux[0]);
456
- half h1(aux[1]);
457
- half h2(aux[2]);
458
- half h3(aux[3]);
459
- half h4(aux[4]);
460
- half h5(aux[5]);
461
- half h6(aux[6]);
462
- half h7(aux[7]);
463
- half h8(aux[8]);
464
- half h9(aux[9]);
465
- half ha(aux[10]);
466
- half hb(aux[11]);
467
- half hc(aux[12]);
468
- half hd(aux[13]);
469
- half he(aux[14]);
470
- half hf(aux[15]);
471
-
472
- Packet16h result;
473
- result.x = _mm256_set_epi16(
474
- hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
475
- h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
476
- return result;
477
- #endif
478
- }
479
-
480
- template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
481
- Packet16f af = half2float(a);
482
- Packet16f bf = half2float(b);
483
- Packet16f rf = padd(af, bf);
484
- return float2half(rf);
485
- }
486
-
487
- template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
488
- Packet16f af = half2float(a);
489
- Packet16f bf = half2float(b);
490
- Packet16f rf = pmul(af, bf);
491
- return float2half(rf);
492
- }
493
-
494
- template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
495
- Packet16f from_float = half2float(from);
496
- return half(predux(from_float));
497
- }
498
-
499
- template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
500
- {
501
- Packet16h result;
502
- result.x = _mm256_set_epi16(
503
- from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
504
- from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
505
- from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
506
- from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
507
- return result;
508
- }
509
-
510
- template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
511
- {
512
- EIGEN_ALIGN64 half aux[16];
513
- pstore(aux, from);
514
- to[stride*0].x = aux[0].x;
515
- to[stride*1].x = aux[1].x;
516
- to[stride*2].x = aux[2].x;
517
- to[stride*3].x = aux[3].x;
518
- to[stride*4].x = aux[4].x;
519
- to[stride*5].x = aux[5].x;
520
- to[stride*6].x = aux[6].x;
521
- to[stride*7].x = aux[7].x;
522
- to[stride*8].x = aux[8].x;
523
- to[stride*9].x = aux[9].x;
524
- to[stride*10].x = aux[10].x;
525
- to[stride*11].x = aux[11].x;
526
- to[stride*12].x = aux[12].x;
527
- to[stride*13].x = aux[13].x;
528
- to[stride*14].x = aux[14].x;
529
- to[stride*15].x = aux[15].x;
530
- }
531
-
532
- EIGEN_STRONG_INLINE void
533
- ptranspose(PacketBlock<Packet16h,16>& kernel) {
534
- __m256i a = kernel.packet[0].x;
535
- __m256i b = kernel.packet[1].x;
536
- __m256i c = kernel.packet[2].x;
537
- __m256i d = kernel.packet[3].x;
538
- __m256i e = kernel.packet[4].x;
539
- __m256i f = kernel.packet[5].x;
540
- __m256i g = kernel.packet[6].x;
541
- __m256i h = kernel.packet[7].x;
542
- __m256i i = kernel.packet[8].x;
543
- __m256i j = kernel.packet[9].x;
544
- __m256i k = kernel.packet[10].x;
545
- __m256i l = kernel.packet[11].x;
546
- __m256i m = kernel.packet[12].x;
547
- __m256i n = kernel.packet[13].x;
548
- __m256i o = kernel.packet[14].x;
549
- __m256i p = kernel.packet[15].x;
550
-
551
- __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
552
- __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
553
- __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
554
- __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
555
- __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
556
- __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
557
- __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
558
- __m256i op_07 = _mm256_unpacklo_epi16(o, p);
559
-
560
- __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
561
- __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
562
- __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
563
- __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
564
- __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
565
- __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
566
- __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
567
- __m256i op_8f = _mm256_unpackhi_epi16(o, p);
568
-
569
- __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
570
- __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
571
- __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
572
- __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
573
- __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
574
- __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
575
- __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
576
- __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
577
-
578
- __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
579
- __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
580
- __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
581
- __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
582
- __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
583
- __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
584
- __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
585
- __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
586
-
587
- __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
588
- __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
589
- __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
590
- __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
591
- __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
592
- __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
593
- __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
594
- __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
595
- __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
596
- __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
597
- __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
598
- __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
599
- __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
600
- __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
601
- __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
602
- __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
603
-
604
- // NOTE: no unpacklo/hi instr in this case, so using permute instr.
605
- __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
606
- __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
607
- __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
608
- __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
609
- __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
610
- __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
611
- __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
612
- __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
613
- __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
614
- __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
615
- __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
616
- __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
617
- __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
618
- __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
619
- __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
620
- __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
621
-
622
- kernel.packet[0].x = a_p_0;
623
- kernel.packet[1].x = a_p_1;
624
- kernel.packet[2].x = a_p_2;
625
- kernel.packet[3].x = a_p_3;
626
- kernel.packet[4].x = a_p_4;
627
- kernel.packet[5].x = a_p_5;
628
- kernel.packet[6].x = a_p_6;
629
- kernel.packet[7].x = a_p_7;
630
- kernel.packet[8].x = a_p_8;
631
- kernel.packet[9].x = a_p_9;
632
- kernel.packet[10].x = a_p_a;
633
- kernel.packet[11].x = a_p_b;
634
- kernel.packet[12].x = a_p_c;
635
- kernel.packet[13].x = a_p_d;
636
- kernel.packet[14].x = a_p_e;
637
- kernel.packet[15].x = a_p_f;
638
- }
639
-
640
- EIGEN_STRONG_INLINE void
641
- ptranspose(PacketBlock<Packet16h,8>& kernel) {
642
- EIGEN_ALIGN64 half in[8][16];
643
- pstore<half>(in[0], kernel.packet[0]);
644
- pstore<half>(in[1], kernel.packet[1]);
645
- pstore<half>(in[2], kernel.packet[2]);
646
- pstore<half>(in[3], kernel.packet[3]);
647
- pstore<half>(in[4], kernel.packet[4]);
648
- pstore<half>(in[5], kernel.packet[5]);
649
- pstore<half>(in[6], kernel.packet[6]);
650
- pstore<half>(in[7], kernel.packet[7]);
651
-
652
- EIGEN_ALIGN64 half out[8][16];
653
-
654
- for (int i = 0; i < 8; ++i) {
655
- for (int j = 0; j < 8; ++j) {
656
- out[i][j] = in[j][2*i];
657
- }
658
- for (int j = 0; j < 8; ++j) {
659
- out[i][j+8] = in[j][2*i+1];
660
- }
661
- }
662
-
663
- kernel.packet[0] = pload<Packet16h>(out[0]);
664
- kernel.packet[1] = pload<Packet16h>(out[1]);
665
- kernel.packet[2] = pload<Packet16h>(out[2]);
666
- kernel.packet[3] = pload<Packet16h>(out[3]);
667
- kernel.packet[4] = pload<Packet16h>(out[4]);
668
- kernel.packet[5] = pload<Packet16h>(out[5]);
669
- kernel.packet[6] = pload<Packet16h>(out[6]);
670
- kernel.packet[7] = pload<Packet16h>(out[7]);
671
- }
672
-
673
- EIGEN_STRONG_INLINE void
674
- ptranspose(PacketBlock<Packet16h,4>& kernel) {
675
- EIGEN_ALIGN64 half in[4][16];
676
- pstore<half>(in[0], kernel.packet[0]);
677
- pstore<half>(in[1], kernel.packet[1]);
678
- pstore<half>(in[2], kernel.packet[2]);
679
- pstore<half>(in[3], kernel.packet[3]);
680
-
681
- EIGEN_ALIGN64 half out[4][16];
682
-
683
- for (int i = 0; i < 4; ++i) {
684
- for (int j = 0; j < 4; ++j) {
685
- out[i][j] = in[j][4*i];
686
- }
687
- for (int j = 0; j < 4; ++j) {
688
- out[i][j+4] = in[j][4*i+1];
689
- }
690
- for (int j = 0; j < 4; ++j) {
691
- out[i][j+8] = in[j][4*i+2];
692
- }
693
- for (int j = 0; j < 4; ++j) {
694
- out[i][j+12] = in[j][4*i+3];
695
- }
696
- }
697
-
698
- kernel.packet[0] = pload<Packet16h>(out[0]);
699
- kernel.packet[1] = pload<Packet16h>(out[1]);
700
- kernel.packet[2] = pload<Packet16h>(out[2]);
701
- kernel.packet[3] = pload<Packet16h>(out[3]);
702
- }
703
-
704
-
705
- #elif defined EIGEN_VECTORIZE_AVX
706
-
707
- typedef struct {
708
- __m128i x;
709
- } Packet8h;
710
-
711
-
712
- template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
713
-
714
- template <>
715
- struct packet_traits<Eigen::half> : default_packet_traits {
716
- typedef Packet8h type;
717
- // There is no half-size packet for Packet8h.
718
- typedef Packet8h half;
719
- enum {
720
- Vectorizable = 1,
721
- AlignedOnScalar = 1,
722
- size = 8,
723
- HasHalfPacket = 0,
724
- HasAdd = 0,
725
- HasSub = 0,
726
- HasMul = 0,
727
- HasNegate = 0,
728
- HasAbs = 0,
729
- HasAbs2 = 0,
730
- HasMin = 0,
731
- HasMax = 0,
732
- HasConj = 0,
733
- HasSetLinear = 0,
734
- HasDiv = 0,
735
- HasSqrt = 0,
736
- HasRsqrt = 0,
737
- HasExp = 0,
738
- HasLog = 0,
739
- HasBlend = 0
740
- };
741
- };
742
-
743
-
744
- template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
745
-
746
- template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
747
- Packet8h result;
748
- result.x = _mm_set1_epi16(from.x);
749
- return result;
750
- }
751
-
752
- template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
753
- return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
754
- }
755
-
756
- template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
757
- Packet8h result;
758
- result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from));
759
- return result;
760
- }
761
-
762
- template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
763
- Packet8h result;
764
- result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
765
- return result;
766
- }
767
-
768
- template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
769
- _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
770
- }
771
-
772
- template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
773
- _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
774
- }
775
-
776
- template<> EIGEN_STRONG_INLINE Packet8h
777
- ploadquad<Packet8h>(const Eigen::half* from) {
778
- Packet8h result;
779
- unsigned short a = from[0].x;
780
- unsigned short b = from[1].x;
781
- result.x = _mm_set_epi16(b, b, b, b, a, a, a, a);
782
- return result;
783
- }
784
-
785
- EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
786
- #ifdef EIGEN_HAS_FP16_C
787
- return _mm256_cvtph_ps(a.x);
788
- #else
789
- EIGEN_ALIGN32 Eigen::half aux[8];
790
- pstore(aux, a);
791
- float f0(aux[0]);
792
- float f1(aux[1]);
793
- float f2(aux[2]);
794
- float f3(aux[3]);
795
- float f4(aux[4]);
796
- float f5(aux[5]);
797
- float f6(aux[6]);
798
- float f7(aux[7]);
799
-
800
- return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
801
- #endif
802
- }
803
-
804
- EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
805
- #ifdef EIGEN_HAS_FP16_C
806
- Packet8h result;
807
- result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
808
- return result;
809
- #else
810
- EIGEN_ALIGN32 float aux[8];
811
- pstore(aux, a);
812
- Eigen::half h0(aux[0]);
813
- Eigen::half h1(aux[1]);
814
- Eigen::half h2(aux[2]);
815
- Eigen::half h3(aux[3]);
816
- Eigen::half h4(aux[4]);
817
- Eigen::half h5(aux[5]);
818
- Eigen::half h6(aux[6]);
819
- Eigen::half h7(aux[7]);
820
-
821
- Packet8h result;
822
- result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
823
- return result;
824
- #endif
825
- }
826
-
827
- template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
828
-
829
- template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
830
- Packet8f af = half2float(a);
831
- Packet8f bf = half2float(b);
832
- Packet8f rf = padd(af, bf);
833
- return float2half(rf);
834
- }
835
-
836
- template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
837
- Packet8f af = half2float(a);
838
- Packet8f bf = half2float(b);
839
- Packet8f rf = pmul(af, bf);
840
- return float2half(rf);
841
- }
842
-
843
- template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
844
- {
845
- Packet8h result;
846
- result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
847
- return result;
848
- }
849
-
850
- template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
851
- {
852
- EIGEN_ALIGN32 Eigen::half aux[8];
853
- pstore(aux, from);
854
- to[stride*0].x = aux[0].x;
855
- to[stride*1].x = aux[1].x;
856
- to[stride*2].x = aux[2].x;
857
- to[stride*3].x = aux[3].x;
858
- to[stride*4].x = aux[4].x;
859
- to[stride*5].x = aux[5].x;
860
- to[stride*6].x = aux[6].x;
861
- to[stride*7].x = aux[7].x;
862
- }
863
-
864
- template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
865
- Packet8f af = half2float(a);
866
- float reduced = predux<Packet8f>(af);
867
- return Eigen::half(reduced);
868
- }
869
-
870
- template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
871
- Packet8f af = half2float(a);
872
- float reduced = predux_max<Packet8f>(af);
873
- return Eigen::half(reduced);
874
- }
875
-
876
- template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
877
- Packet8f af = half2float(a);
878
- float reduced = predux_min<Packet8f>(af);
879
- return Eigen::half(reduced);
880
- }
881
-
882
- template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
883
- Packet8f af = half2float(a);
884
- float reduced = predux_mul<Packet8f>(af);
885
- return Eigen::half(reduced);
886
- }
887
-
888
- EIGEN_STRONG_INLINE void
889
- ptranspose(PacketBlock<Packet8h,8>& kernel) {
890
- __m128i a = kernel.packet[0].x;
891
- __m128i b = kernel.packet[1].x;
892
- __m128i c = kernel.packet[2].x;
893
- __m128i d = kernel.packet[3].x;
894
- __m128i e = kernel.packet[4].x;
895
- __m128i f = kernel.packet[5].x;
896
- __m128i g = kernel.packet[6].x;
897
- __m128i h = kernel.packet[7].x;
898
-
899
- __m128i a03b03 = _mm_unpacklo_epi16(a, b);
900
- __m128i c03d03 = _mm_unpacklo_epi16(c, d);
901
- __m128i e03f03 = _mm_unpacklo_epi16(e, f);
902
- __m128i g03h03 = _mm_unpacklo_epi16(g, h);
903
- __m128i a47b47 = _mm_unpackhi_epi16(a, b);
904
- __m128i c47d47 = _mm_unpackhi_epi16(c, d);
905
- __m128i e47f47 = _mm_unpackhi_epi16(e, f);
906
- __m128i g47h47 = _mm_unpackhi_epi16(g, h);
907
-
908
- __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
909
- __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
910
- __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
911
- __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
912
- __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
913
- __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
914
- __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
915
- __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
916
-
917
- __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
918
- __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
919
- __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
920
- __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
921
- __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
922
- __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
923
- __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
924
- __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
925
-
926
- kernel.packet[0].x = a0b0c0d0e0f0g0h0;
927
- kernel.packet[1].x = a1b1c1d1e1f1g1h1;
928
- kernel.packet[2].x = a2b2c2d2e2f2g2h2;
929
- kernel.packet[3].x = a3b3c3d3e3f3g3h3;
930
- kernel.packet[4].x = a4b4c4d4e4f4g4h4;
931
- kernel.packet[5].x = a5b5c5d5e5f5g5h5;
932
- kernel.packet[6].x = a6b6c6d6e6f6g6h6;
933
- kernel.packet[7].x = a7b7c7d7e7f7g7h7;
934
- }
935
-
936
- EIGEN_STRONG_INLINE void
937
- ptranspose(PacketBlock<Packet8h,4>& kernel) {
938
- EIGEN_ALIGN32 Eigen::half in[4][8];
939
- pstore<Eigen::half>(in[0], kernel.packet[0]);
940
- pstore<Eigen::half>(in[1], kernel.packet[1]);
941
- pstore<Eigen::half>(in[2], kernel.packet[2]);
942
- pstore<Eigen::half>(in[3], kernel.packet[3]);
943
-
944
- EIGEN_ALIGN32 Eigen::half out[4][8];
945
-
946
- for (int i = 0; i < 4; ++i) {
947
- for (int j = 0; j < 4; ++j) {
948
- out[i][j] = in[j][2*i];
949
- }
950
- for (int j = 0; j < 4; ++j) {
951
- out[i][j+4] = in[j][2*i+1];
952
- }
953
- }
954
-
955
- kernel.packet[0] = pload<Packet8h>(out[0]);
956
- kernel.packet[1] = pload<Packet8h>(out[1]);
957
- kernel.packet[2] = pload<Packet8h>(out[2]);
958
- kernel.packet[3] = pload<Packet8h>(out[3]);
959
- }
960
-
961
-
962
- // Disable the following code since it's broken on too many platforms / compilers.
963
- //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
964
- #elif 0
965
-
966
- typedef struct {
967
- __m64 x;
968
- } Packet4h;
969
-
970
-
971
- template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
972
-
973
- template <>
974
- struct packet_traits<Eigen::half> : default_packet_traits {
975
- typedef Packet4h type;
976
- // There is no half-size packet for Packet4h.
977
- typedef Packet4h half;
978
- enum {
979
- Vectorizable = 1,
980
- AlignedOnScalar = 1,
981
- size = 4,
982
- HasHalfPacket = 0,
983
- HasAdd = 0,
984
- HasSub = 0,
985
- HasMul = 0,
986
- HasNegate = 0,
987
- HasAbs = 0,
988
- HasAbs2 = 0,
989
- HasMin = 0,
990
- HasMax = 0,
991
- HasConj = 0,
992
- HasSetLinear = 0,
993
- HasDiv = 0,
994
- HasSqrt = 0,
995
- HasRsqrt = 0,
996
- HasExp = 0,
997
- HasLog = 0,
998
- HasBlend = 0
999
- };
1000
- };
1001
-
1002
-
1003
- template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
1004
-
1005
- template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
1006
- Packet4h result;
1007
- result.x = _mm_set1_pi16(from.x);
1008
- return result;
1009
- }
1010
-
1011
- template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
1012
- return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
1013
- }
1014
-
1015
- template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
1016
-
1017
- template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
1018
- __int64_t a64 = _mm_cvtm64_si64(a.x);
1019
- __int64_t b64 = _mm_cvtm64_si64(b.x);
1020
-
1021
- Eigen::half h[4];
1022
-
1023
- Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1024
- Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1025
- h[0] = ha + hb;
1026
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1027
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1028
- h[1] = ha + hb;
1029
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1030
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1031
- h[2] = ha + hb;
1032
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1033
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1034
- h[3] = ha + hb;
1035
- Packet4h result;
1036
- result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1037
- return result;
1038
- }
1039
-
1040
- template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
1041
- __int64_t a64 = _mm_cvtm64_si64(a.x);
1042
- __int64_t b64 = _mm_cvtm64_si64(b.x);
1043
-
1044
- Eigen::half h[4];
1045
-
1046
- Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1047
- Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1048
- h[0] = ha * hb;
1049
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1050
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1051
- h[1] = ha * hb;
1052
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1053
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1054
- h[2] = ha * hb;
1055
- ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1056
- hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1057
- h[3] = ha * hb;
1058
- Packet4h result;
1059
- result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1060
- return result;
1061
- }
1062
-
1063
- template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
1064
- Packet4h result;
1065
- result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1066
- return result;
1067
- }
1068
-
1069
- template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
1070
- Packet4h result;
1071
- result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1072
- return result;
1073
- }
1074
-
1075
- template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1076
- __int64_t r = _mm_cvtm64_si64(from.x);
1077
- *(reinterpret_cast<__int64_t*>(to)) = r;
1078
- }
1079
-
1080
- template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1081
- __int64_t r = _mm_cvtm64_si64(from.x);
1082
- *(reinterpret_cast<__int64_t*>(to)) = r;
1083
- }
1084
-
1085
- template<> EIGEN_STRONG_INLINE Packet4h
1086
- ploadquad<Packet4h>(const Eigen::half* from) {
1087
- return pset1<Packet4h>(*from);
1088
- }
1089
-
1090
- template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
1091
- {
1092
- Packet4h result;
1093
- result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
1094
- return result;
1095
- }
1096
-
1097
- template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
1098
- {
1099
- __int64_t a = _mm_cvtm64_si64(from.x);
1100
- to[stride*0].x = static_cast<unsigned short>(a);
1101
- to[stride*1].x = static_cast<unsigned short>(a >> 16);
1102
- to[stride*2].x = static_cast<unsigned short>(a >> 32);
1103
- to[stride*3].x = static_cast<unsigned short>(a >> 48);
1104
- }
1105
-
1106
- EIGEN_STRONG_INLINE void
1107
- ptranspose(PacketBlock<Packet4h,4>& kernel) {
1108
- __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
1109
- __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
1110
- __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
1111
- __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
1112
-
1113
- kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
1114
- kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
1115
- kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
1116
- kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
1117
- }
1118
-
1119
- #endif
1120
-
1121
- }
1122
- }
1123
-
1124
- #endif // EIGEN_PACKET_MATH_HALF_CUDA_H