@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -21
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +235 -326
  6. package/eigen/Eigen/Eigenvalues +16 -14
  7. package/eigen/Eigen/Geometry +21 -24
  8. package/eigen/Eigen/Householder +9 -8
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -14
  11. package/eigen/Eigen/KLUSupport +43 -0
  12. package/eigen/Eigen/LU +16 -20
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -54
  15. package/eigen/Eigen/PaStiXSupport +23 -20
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -21
  18. package/eigen/Eigen/QtAlignedMalloc +5 -13
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -18
  21. package/eigen/Eigen/Sparse +1 -4
  22. package/eigen/Eigen/SparseCholesky +18 -23
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +12 -8
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
  37. package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
  42. package/eigen/Eigen/src/Core/Array.h +341 -294
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
  48. package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
  49. package/eigen/Eigen/src/Core/Block.h +375 -398
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
  53. package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
  59. package/eigen/Eigen/src/Core/DenseBase.h +632 -571
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
  61. package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +169 -210
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +172 -222
  67. package/eigen/Eigen/src/Core/EigenBase.h +75 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
  75. package/eigen/Eigen/src/Core/IO.h +147 -139
  76. package/eigen/Eigen/src/Core/IndexedView.h +321 -0
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +56 -66
  80. package/eigen/Eigen/src/Core/Map.h +124 -142
  81. package/eigen/Eigen/src/Core/MapBase.h +256 -281
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
  84. package/eigen/Eigen/src/Core/Matrix.h +491 -416
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
  86. package/eigen/Eigen/src/Core/NestByValue.h +66 -85
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -85
  88. package/eigen/Eigen/src/Core/NumTraits.h +235 -148
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
  92. package/eigen/Eigen/src/Core/Product.h +260 -139
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
  94. package/eigen/Eigen/src/Core/Random.h +161 -136
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +366 -336
  98. package/eigen/Eigen/src/Core/Ref.h +308 -209
  99. package/eigen/Eigen/src/Core/Replicate.h +94 -106
  100. package/eigen/Eigen/src/Core/Reshaped.h +398 -0
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +136 -145
  103. package/eigen/Eigen/src/Core/Select.h +70 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +97 -111
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
  109. package/eigen/Eigen/src/Core/SolverBase.h +138 -101
  110. package/eigen/Eigen/src/Core/StableNorm.h +156 -160
  111. package/eigen/Eigen/src/Core/StlIterators.h +619 -0
  112. package/eigen/Eigen/src/Core/Stride.h +91 -88
  113. package/eigen/Eigen/src/Core/Swap.h +70 -38
  114. package/eigen/Eigen/src/Core/Transpose.h +295 -273
  115. package/eigen/Eigen/src/Core/Transpositions.h +272 -317
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
  119. package/eigen/Eigen/src/Core/Visitor.h +480 -216
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
  217. package/eigen/Eigen/src/Core/util/Constants.h +314 -263
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
  226. package/eigen/Eigen/src/Core/util/Macros.h +939 -646
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
  229. package/eigen/Eigen/src/Core/util/Meta.h +618 -426
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
  264. package/eigen/Eigen/src/Geometry/Transform.h +896 -953
  265. package/eigen/Eigen/src/Geometry/Translation.h +100 -98
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
  269. package/eigen/Eigen/src/Householder/Householder.h +104 -122
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
  285. package/eigen/Eigen/src/LU/Determinant.h +60 -63
  286. package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/lib/LibEigen.d.ts +4 -0
  409. package/lib/LibEigen.js +14 -0
  410. package/lib/index.d.ts +1 -1
  411. package/lib/index.js +7 -3
  412. package/package.json +2 -10
  413. package/eigen/Eigen/CMakeLists.txt +0 -19
  414. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
  415. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
  416. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  417. package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
  418. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  419. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  420. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  421. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  422. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  423. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  424. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  425. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  426. package/eigen/Eigen/src/misc/lapack.h +0 -152
  427. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
  428. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
  429. package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
  430. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  431. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
  432. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  433. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
  434. package/lib/eigen.d.ts +0 -2
  435. package/lib/eigen.js +0 -15
@@ -0,0 +1,1088 @@
1
+
2
+ #ifndef EIGEN_HVX_PACKET_MATH_H
3
+ #define EIGEN_HVX_PACKET_MATH_H
4
+
5
+ // Only support 128B HVX now.
6
+ // Floating-point operations are supported only since V68.
7
+ #if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
8
+
9
+ // All the floating-point operations do not support IEEE standard.
10
+ // From HVX document:
11
+ // There is no concept of infinity or NaN. QFloat saturates to maximum
12
+ // exponent with maximum positive or minimum negative significand.
13
+
14
+ #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
15
+ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
16
+ #endif
17
+
18
+ namespace Eigen {
19
+ namespace internal {
20
+
21
+ // HVX utilities.
22
+
23
+ template <int D>
24
+ EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) {
25
+ HVX_Vector v;
26
+ #if EIGEN_COMP_CLANG
27
+ // Use inlined assembly for aligned vmem load on unaligned memory.
28
+ // Use type cast to HVX_Vector* may mess up with compiler data alignment.
29
+ __asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory");
30
+ #else
31
+ void* aligned_mem =
32
+ reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
33
+ memcpy(&v, aligned_mem, __HVX_LENGTH__);
34
+ #endif
35
+ return v;
36
+ }
37
+
38
+ template <typename T>
39
+ EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) {
40
+ HVX_Vector v;
41
+ memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__);
42
+ return v;
43
+ }
44
+
45
+ template <typename T>
46
+ EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) {
47
+ HVX_Vector v;
48
+ memcpy(&v, mem, __HVX_LENGTH__);
49
+ return v;
50
+ }
51
+
52
+ template <size_t Size, size_t Alignment, typename T>
53
+ EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) {
54
+ #if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
55
+ // Fast partial vector load through aligned vmem load.
56
+ // The load may past end of array but is aligned to prevent memory fault.
57
+ HVX_Vector v0 = HVX_vmem<0>(mem);
58
+ HVX_Vector v1 = v0;
59
+ uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
60
+ EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) {
61
+ // Data size less than alignment will never cross multiple aligned vectors.
62
+ v1 = v0;
63
+ }
64
+ else {
65
+ uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
66
+ if (left_off + Size * sizeof(T) > __HVX_LENGTH__) {
67
+ v1 = HVX_vmem<1>(mem);
68
+ } else {
69
+ v1 = v0;
70
+ }
71
+ }
72
+ return Q6_V_valign_VVR(v1, v0, mem_addr);
73
+ #else
74
+ HVX_Vector v;
75
+ memcpy(&v, mem, Size * sizeof(T));
76
+ return v;
77
+ #endif
78
+ }
79
+
80
+ template <typename T>
81
+ EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) {
82
+ memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__);
83
+ }
84
+
85
+ template <typename T>
86
+ EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) {
87
+ memcpy(mem, &v, __HVX_LENGTH__);
88
+ }
89
+
90
+ template <size_t Size, size_t Alignment, typename T>
91
+ EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) {
92
+ uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
93
+ HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
94
+ uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
95
+ uintptr_t right_off = left_off + Size * sizeof(T);
96
+
97
+ HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
98
+ HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);
99
+
100
+ EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) {
101
+ if (right_off > __HVX_LENGTH__) {
102
+ Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value);
103
+ qr = Q6_Q_vcmp_eq_VbVb(value, value);
104
+ }
105
+ }
106
+
107
+ ql_not = Q6_Q_or_QQn(ql_not, qr);
108
+ Q6_vmem_QnRIV(ql_not, mem, value);
109
+ }
110
+
111
+ // Packet definitions.
112
// Tag selecting which HVX packet flavour an HVXPacket<> instantiation
// represents.
enum class HVXPacketSize {
  Full = 0,     // tag used by Packet32f
  Half = 1,     // tag used by Packet16f
  Quarter = 2,  // tag used by Packet8f
};
117
+
118
+ // Hexagon compiler uses same HVX_Vector to represent all HVX vector types.
119
+ // Wrap different vector type (float32, int32, etc) to different class with
120
+ // explicit constructor and casting back-and-force to HVX_Vector.
121
+ template <HVXPacketSize T>
122
+ class HVXPacket {
123
+ public:
124
+ HVXPacket() = default;
125
+ static HVXPacket Create(HVX_Vector v) { return HVXPacket(v); }
126
+ HVX_Vector Get() const { return m_val; }
127
+
128
+ private:
129
+ explicit HVXPacket(HVX_Vector v) : m_val(v) {}
130
+ HVX_Vector m_val = Q6_V_vzero();
131
+ };
132
+
133
+ typedef HVXPacket<HVXPacketSize::Full> Packet32f;
134
+ typedef HVXPacket<HVXPacketSize::Half> Packet16f;
135
+ typedef HVXPacket<HVXPacketSize::Quarter> Packet8f;
136
+
137
+ // Packet traits.
138
+ template <>
139
+ struct packet_traits<float> : default_packet_traits {
140
+ typedef Packet32f type;
141
+ typedef Packet16f half;
142
+ enum {
143
+ Vectorizable = 1,
144
+ AlignedOnScalar = 1,
145
+ size = 32,
146
+
147
+ HasCmp = 1,
148
+ HasAdd = 1,
149
+ HasSub = 1,
150
+ HasShift = 0,
151
+ HasMul = 1,
152
+ HasNegate = 1,
153
+ HasAbs = 1,
154
+ HasArg = 0,
155
+ HasAbs2 = 0,
156
+ HasAbsDiff = 0,
157
+ HasMin = 1,
158
+ HasMax = 1,
159
+ HasConj = 0,
160
+ HasSetLinear = 0,
161
+ HasBlend = 0,
162
+
163
+ HasDiv = 0,
164
+
165
+ HasSin = 0,
166
+ HasCos = 0,
167
+ HasACos = 0,
168
+ HasASin = 0,
169
+ HasATan = 0,
170
+ HasATanh = 0,
171
+ HasLog = 0,
172
+ HasExp = 0,
173
+ HasSqrt = 0,
174
+ HasRsqrt = 0,
175
+ HasTanh = 0,
176
+ HasErf = 0,
177
+ HasBessel = 0,
178
+ HasNdtri = 0
179
+ };
180
+ };
181
+
182
+ template <>
183
+ struct unpacket_traits<Packet32f> {
184
+ typedef float type;
185
+ typedef Packet16f half;
186
+ enum {
187
+ size = 32,
188
+ alignment = Aligned128,
189
+ vectorizable = true,
190
+ masked_load_available = false,
191
+ masked_store_available = false
192
+ };
193
+ };
194
+
195
+ template <>
196
+ struct unpacket_traits<Packet16f> {
197
+ typedef float type;
198
+ typedef Packet8f half;
199
+ enum {
200
+ size = 16,
201
+ // Many code assume alignment on packet size instead of following trait
202
+ // So we do not use Aligned128 to optimize aligned load/store,
203
+ alignment = Aligned64,
204
+ vectorizable = true,
205
+ masked_load_available = false,
206
+ masked_store_available = false
207
+ };
208
+ };
209
+
210
+ template <>
211
+ struct unpacket_traits<Packet8f> {
212
+ typedef float type;
213
+ typedef Packet8f half;
214
+ enum {
215
+ size = 8,
216
+ // Many code assume alignment on packet size instead of following trait
217
+ // So we do not use Aligned128 to optimize aligned load/store,
218
+ alignment = Aligned32,
219
+ vectorizable = true,
220
+ masked_load_available = false,
221
+ masked_store_available = false
222
+ };
223
+ };
224
+
225
+ // float32 operations.
226
+ template <HVXPacketSize T>
227
+ EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) {
228
+ return HVXPacket<T>::Create(Q6_V_vzero());
229
+ }
230
+ template <>
231
+ EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(const Packet32f&) {
232
+ return pzero_hvx(Packet32f());
233
+ }
234
+ template <>
235
+ EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(const Packet16f&) {
236
+ return pzero_hvx(Packet16f());
237
+ }
238
+ template <>
239
+ EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
240
+ return pzero_hvx(Packet8f());
241
+ }
242
+
243
+ template <HVXPacketSize T>
244
+ EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
245
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
246
+ return unpacket_traits<HVXPacket<T>>::half::Create(
247
+ Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
248
+ }
249
+ template <>
250
+ EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
251
+ return predux_half_dowto4_hvx(a);
252
+ }
253
+ template <>
254
+ EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
255
+ return predux_half_dowto4_hvx(a);
256
+ }
257
+
258
+ template <HVXPacketSize T>
259
+ EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) {
260
+ union {
261
+ float f;
262
+ int32_t i;
263
+ } u;
264
+ u.f = from;
265
+ return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
266
+ }
267
+ template <>
268
+ EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
269
+ return pset1_hvx<HVXPacketSize::Full>(from);
270
+ }
271
+ template <>
272
+ EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
273
+ return pset1_hvx<HVXPacketSize::Half>(from);
274
+ }
275
+ template <>
276
+ EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
277
+ return pset1_hvx<HVXPacketSize::Quarter>(from);
278
+ }
279
+
280
+ template <>
281
+ EIGEN_STRONG_INLINE Packet32f pload<Packet32f>(const float* from) {
282
+ return Packet32f::Create(HVX_load(from));
283
+ }
284
+ template <>
285
+ EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
286
+ return Packet16f::Create(
287
+ HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
288
+ }
289
+ template <>
290
+ EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
291
+ return Packet8f::Create(
292
+ HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
293
+ }
294
+
295
+ template <>
296
+ EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) {
297
+ return Packet32f::Create(HVX_loadu(from));
298
+ }
299
+ template <>
300
+ EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
301
+ return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from));
302
+ }
303
+ template <>
304
+ EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
305
+ return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from));
306
+ }
307
+
308
+ template <>
309
+ EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) {
310
+ HVX_store(to, from.Get());
311
+ }
312
+ template <>
313
+ EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
314
+ HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
315
+ }
316
+ template <>
317
+ EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
318
+ HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
319
+ }
320
+
321
+ template <>
322
+ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) {
323
+ HVX_storeu(to, from.Get());
324
+ }
325
+ template <>
326
+ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
327
+ HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get());
328
+ }
329
+ template <>
330
+ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
331
+ HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get());
332
+ }
333
+
334
+ template <HVXPacketSize T>
335
+ EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
336
+ return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
337
+ }
338
+ template <>
339
+ EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
340
+ return pmul_hvx(a, b);
341
+ }
342
+ template <>
343
+ EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
344
+ return pmul_hvx(a, b);
345
+ }
346
+ template <>
347
+ EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
348
+ return pmul_hvx(a, b);
349
+ }
350
+
351
+ template <HVXPacketSize T>
352
+ EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
353
+ return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
354
+ }
355
+ template <>
356
+ EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
357
+ return padd_hvx(a, b);
358
+ }
359
+ template <>
360
+ EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
361
+ return padd_hvx(a, b);
362
+ }
363
+ template <>
364
+ EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
365
+ return padd_hvx(a, b);
366
+ }
367
+
368
+ template <HVXPacketSize T>
369
+ EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
370
+ return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
371
+ }
372
+ template <>
373
+ EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
374
+ return psub_hvx(a, b);
375
+ }
376
+ template <>
377
+ EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
378
+ return psub_hvx(a, b);
379
+ }
380
+ template <>
381
+ EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
382
+ return psub_hvx(a, b);
383
+ }
384
+
385
+ template <HVXPacketSize T>
386
+ EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) {
387
+ return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
388
+ }
389
+ template <>
390
+ EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) {
391
+ return pnegate_hvx(a);
392
+ }
393
+ template <>
394
+ EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
395
+ return pnegate_hvx(a);
396
+ }
397
+ template <>
398
+ EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
399
+ return pnegate_hvx(a);
400
+ }
401
+
402
+ template <HVXPacketSize T>
403
+ EIGEN_STRONG_INLINE HVXPacket<T> ptrue_hvx(const HVXPacket<T>& a) {
404
+ return HVXPacket<T>::Create(Q6_V_vsplat_R(0x3f800000));
405
+ }
406
+ template <>
407
+ EIGEN_STRONG_INLINE Packet32f ptrue(const Packet32f& a) {
408
+ return ptrue_hvx(a);
409
+ }
410
+ template <>
411
+ EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) {
412
+ return ptrue_hvx(a);
413
+ }
414
+ template <>
415
+ EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) {
416
+ return ptrue_hvx(a);
417
+ }
418
+
419
+ template <HVXPacketSize T>
420
+ EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
421
+ HVX_Vector v_true = ptrue(a).Get();
422
+ HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
423
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
424
+ }
425
+ template <>
426
+ EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
427
+ return pcmp_le_hvx(a, b);
428
+ }
429
+ template <>
430
+ EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
431
+ return pcmp_le_hvx(a, b);
432
+ }
433
+ template <>
434
+ EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
435
+ return pcmp_le_hvx(a, b);
436
+ }
437
+
438
+ template <HVXPacketSize T>
439
+ EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
440
+ HVX_Vector v_true = ptrue(a).Get();
441
+ HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
442
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
443
+ }
444
+ template <>
445
+ EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
446
+ return pcmp_eq_hvx(a, b);
447
+ }
448
+ template <>
449
+ EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
450
+ return pcmp_eq_hvx(a, b);
451
+ }
452
+ template <>
453
+ EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
454
+ return pcmp_eq_hvx(a, b);
455
+ }
456
+
457
+ template <HVXPacketSize T>
458
+ EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
459
+ HVX_Vector v_true = ptrue(a).Get();
460
+ HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
461
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
462
+ }
463
+ template <>
464
+ EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) {
465
+ return pcmp_lt_hvx(a, b);
466
+ }
467
+ template <>
468
+ EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
469
+ return pcmp_lt_hvx(a, b);
470
+ }
471
+ template <>
472
+ EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
473
+ return pcmp_lt_hvx(a, b);
474
+ }
475
+
476
+ template <HVXPacketSize T>
477
+ EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
478
+ HVX_Vector v_true = ptrue(a).Get();
479
+ HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
480
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
481
+ }
482
+ template <>
483
+ EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
484
+ return pcmp_lt_or_nan_hvx(a, b);
485
+ }
486
+ template <>
487
+ EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
488
+ return pcmp_lt_or_nan_hvx(a, b);
489
+ }
490
+ template <>
491
+ EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
492
+ return pcmp_lt_or_nan_hvx(a, b);
493
+ }
494
+
495
+ template <HVXPacketSize T>
496
+ EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) {
497
+ return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
498
+ }
499
+ template <>
500
+ EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) {
501
+ return pabs_hvx(a);
502
+ }
503
+ template <>
504
+ EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
505
+ return pabs_hvx(a);
506
+ }
507
+ template <>
508
+ EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
509
+ return pabs_hvx(a);
510
+ }
511
+
512
+ template <HVXPacketSize T>
513
+ EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) {
514
+ union {
515
+ float array[1];
516
+ HVX_Vector vector;
517
+ } HVX_and_array;
518
+ HVX_and_array.vector = a.Get();
519
+ return HVX_and_array.array[0];
520
+ }
521
+ template <>
522
+ EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) {
523
+ return pfirst_hvx(a);
524
+ }
525
+ template <>
526
+ EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) {
527
+ return pfirst_hvx(a);
528
+ }
529
+ template <>
530
+ EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) {
531
+ return pfirst_hvx(a);
532
+ }
533
+
534
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
535
+ // Shuffle the 32-bit lanes.
536
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
537
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
538
+
539
+ // Shuffle the 64-bit lanes.
540
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
541
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
542
+ kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
543
+ kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
544
+ kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
545
+ kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
546
+ }
547
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
548
+ // Shuffle the 32-bit lanes.
549
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
550
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
551
+
552
+ // Shuffle the 64-bit lanes.
553
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
554
+
555
+ kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
556
+ kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
557
+ kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
558
+ kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
559
+ }
560
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
561
+ // Shuffle the 32-bit lanes.
562
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
563
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
564
+
565
+ // Shuffle the 64-bit lanes.
566
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
567
+
568
+ kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
569
+ kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32));
570
+ kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
571
+ kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96));
572
+ }
573
+
574
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
575
+ // Shuffle the 32-bit lanes.
576
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
577
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
578
+ HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
579
+ HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
580
+
581
+ // Shuffle the 64-bit lanes.
582
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
583
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
584
+
585
+ // Shuffle the 128-bit lanes.
586
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
587
+
588
+ kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
589
+ kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32));
590
+ kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64));
591
+ kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96));
592
+ kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
593
+ kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32));
594
+ kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64));
595
+ kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96));
596
+ }
597
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
598
+ // Shuffle the 32-bit lanes.
599
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
600
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
601
+ HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
602
+ HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
603
+ HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
604
+ HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
605
+ HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
606
+ HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
607
+
608
+ // Shuffle the 64-bit lanes.
609
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
610
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
611
+ HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
612
+ HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
613
+
614
+ // Shuffle the 128-bit lanes.
615
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
616
+ v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
617
+ v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16);
618
+ v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16);
619
+
620
+ // Shuffle the 256-bit lanes.
621
+ v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
622
+ v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
623
+ v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
624
+ v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
625
+
626
+ kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
627
+ kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
628
+ kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
629
+ kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
630
+ kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
631
+ kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64));
632
+ kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
633
+ kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64));
634
+ kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4));
635
+ kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64));
636
+ kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4));
637
+ kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64));
638
+ kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6));
639
+ kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64));
640
+ kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6));
641
+ kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64));
642
+ }
643
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
644
+ // Shuffle the 32-bit lanes.
645
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
646
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
647
+ HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
648
+ HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
649
+ HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
650
+ HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
651
+ HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
652
+ HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
653
+ HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
654
+ HVX_VectorPair v_0_19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
655
+ HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
656
+ HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
657
+ HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
658
+ HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
659
+ HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
660
+ HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);
661
+
662
+ // Shuffle the 64-bit lanes.
663
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
664
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
665
+ HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
666
+ HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), HEXAGON_HVX_GET_V1(v_0_5_4), -8);
667
+ HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
668
+ HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_9_8), -8);
669
+ HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
670
+ HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_13_12), -8);
671
+ HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), HEXAGON_HVX_GET_V0(v_0_17_16), -8);
672
+ HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), HEXAGON_HVX_GET_V1(v_0_17_16), -8);
673
+ HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), HEXAGON_HVX_GET_V0(v_0_21_20), -8);
674
+ HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), HEXAGON_HVX_GET_V1(v_0_21_20), -8);
675
+ HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_25_24), -8);
676
+ HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_25_24), -8);
677
+ HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_29_28), -8);
678
+ HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_29_28), -8);
679
+
680
+ // Shuffle the 128-bit lanes.
681
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
682
+ v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
683
+ v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_3_2), -16);
684
+ v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_3_2), -16);
685
+ v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), HEXAGON_HVX_GET_V0(v_1_9_8), -16);
686
+ v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), HEXAGON_HVX_GET_V1(v_1_9_8), -16);
687
+ v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), HEXAGON_HVX_GET_V0(v_1_11_10), -16);
688
+ v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), HEXAGON_HVX_GET_V1(v_1_11_10), -16);
689
+ v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_17_16), -16);
690
+ v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_17_16), -16);
691
+ v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_19_18), -16);
692
+ v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_19_18), -16);
693
+ v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_25_24), -16);
694
+ v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_25_24), -16);
695
+ v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_27_26), -16);
696
+ v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_27_26), -16);
697
+
698
+ // Shuffle the 256-bit lanes.
699
+ v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
700
+ v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
701
+ v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
702
+ v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
703
+ v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), HEXAGON_HVX_GET_V0(v_0_5_4), -32);
704
+ v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), HEXAGON_HVX_GET_V1(v_0_5_4), -32);
705
+ v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_7_6), -32);
706
+ v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_7_6), -32);
707
+ v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), HEXAGON_HVX_GET_V0(v_0_17_16), -32);
708
+ v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), HEXAGON_HVX_GET_V1(v_0_17_16), -32);
709
+ v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_19_18), -32);
710
+ v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_19_18), -32);
711
+ v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), HEXAGON_HVX_GET_V0(v_0_21_20), -32);
712
+ v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), HEXAGON_HVX_GET_V1(v_0_21_20), -32);
713
+ v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_23_22), -32);
714
+ v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_23_22), -32);
715
+
716
+ // Shuffle the 512-bit lanes.
717
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), HEXAGON_HVX_GET_V0(v_1_1_0), -64);
718
+ v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), HEXAGON_HVX_GET_V1(v_1_1_0), -64);
719
+ v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), HEXAGON_HVX_GET_V0(v_1_3_2), -64);
720
+ v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), HEXAGON_HVX_GET_V1(v_1_3_2), -64);
721
+ v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_5_4), -64);
722
+ v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_5_4), -64);
723
+ v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_7_6), -64);
724
+ v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_7_6), -64);
725
+ v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), HEXAGON_HVX_GET_V0(v_1_9_8), -64);
726
+ v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), HEXAGON_HVX_GET_V1(v_1_9_8), -64);
727
+ v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), HEXAGON_HVX_GET_V0(v_1_11_10), -64);
728
+ v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), HEXAGON_HVX_GET_V1(v_1_11_10), -64);
729
+ v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_13_12), -64);
730
+ v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_13_12), -64);
731
+ v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_15_14), -64);
732
+ v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_15_14), -64);
733
+
734
+ kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
735
+ kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
736
+ kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_3_2));
737
+ kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_3_2));
738
+ kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_5_4));
739
+ kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_5_4));
740
+ kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_7_6));
741
+ kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_7_6));
742
+ kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_9_8));
743
+ kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_9_8));
744
+ kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_11_10));
745
+ kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_11_10));
746
+ kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_13_12));
747
+ kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_13_12));
748
+ kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_15_14));
749
+ kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_15_14));
750
+ kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_17_16));
751
+ kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_17_16));
752
+ kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_19_18));
753
+ kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_19_18));
754
+ kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_21_20));
755
+ kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_21_20));
756
+ kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_23_22));
757
+ kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_23_22));
758
+ kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_25_24));
759
+ kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_25_24));
760
+ kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_27_26));
761
+ kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_27_26));
762
+ kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_29_28));
763
+ kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_29_28));
764
+ kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_31_30));
765
+ kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
766
+ }
767
+
768
+ template <HVXPacketSize T>
769
+ EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
770
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
771
+ HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
772
+ for (int i = 2; i < packet_size; i <<= 1) {
773
+ vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
774
+ }
775
+ return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
776
+ }
777
+ template <>
778
+ EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
779
+ return predux_hvx(a);
780
+ }
781
+ template <>
782
+ EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
783
+ return predux_hvx(a);
784
+ }
785
+ template <>
786
+ EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
787
+ return predux_hvx(a);
788
+ }
789
+
790
+ template <HVXPacketSize T>
791
+ EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) {
792
+ constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
793
+ HVX_Vector load = HVX_load_partial<size, 0>(from);
794
+ HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
795
+ return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
796
+ }
797
+ template <>
798
+ EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) {
799
+ return ploaddup_hvx<HVXPacketSize::Full>(from);
800
+ }
801
+ template <>
802
+ EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) {
803
+ return ploaddup_hvx<HVXPacketSize::Half>(from);
804
+ }
805
+ template <>
806
+ EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) {
807
+ return ploaddup_hvx<HVXPacketSize::Quarter>(from);
808
+ }
809
+
810
+ template <HVXPacketSize T>
811
+ EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) {
812
+ constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
813
+ HVX_Vector load = HVX_load_partial<size, 0>(from);
814
+ HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
815
+ HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
816
+ return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
817
+ }
818
+ template <>
819
+ EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
820
+ return ploadquad_hvx<HVXPacketSize::Full>(from);
821
+ }
822
+ template <>
823
+ EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) {
824
+ return ploadquad_hvx<HVXPacketSize::Half>(from);
825
+ }
826
+ template <>
827
+ EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) {
828
+ return ploadquad_hvx<HVXPacketSize::Quarter>(from);
829
+ }
830
+
831
+ template <>
832
+ EIGEN_STRONG_INLINE Packet32f preverse(const Packet32f& a) {
833
+ HVX_Vector delta = Q6_Vb_vsplat_R(0x7c);
834
+ return Packet32f::Create(Q6_V_vdelta_VV(a.Get(), delta));
835
+ }
836
+
837
+ template <>
838
+ EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
839
+ HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
840
+ return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
841
+ }
842
+
843
+ template <>
844
+ EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
845
+ HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
846
+ return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
847
+ }
848
+
849
+ template <HVXPacketSize T>
850
+ EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
851
+ return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
852
+ }
853
+ template <>
854
+ EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
855
+ return pmin_hvx(a, b);
856
+ }
857
+ template <>
858
+ EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) {
859
+ return pmin_hvx(a, b);
860
+ }
861
+ template <>
862
+ EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) {
863
+ return pmin_hvx(a, b);
864
+ }
865
+
866
+ template <HVXPacketSize T>
867
+ EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
868
+ return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
869
+ }
870
+ template <>
871
+ EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) {
872
+ return pmax_hvx(a, b);
873
+ }
874
+ template <>
875
+ EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) {
876
+ return pmax_hvx(a, b);
877
+ }
878
+ template <>
879
+ EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) {
880
+ return pmax_hvx(a, b);
881
+ }
882
+
883
+ template <HVXPacketSize T>
884
+ EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
885
+ return HVXPacket<T>::Create(a.Get() & b.Get());
886
+ }
887
+ template <>
888
+ EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) {
889
+ return pand_hvx(a, b);
890
+ }
891
+ template <>
892
+ EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) {
893
+ return pand_hvx(a, b);
894
+ }
895
+ template <>
896
+ EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) {
897
+ return pand_hvx(a, b);
898
+ }
899
+
900
+ template <HVXPacketSize T>
901
+ EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
902
+ return HVXPacket<T>::Create(a.Get() | b.Get());
903
+ }
904
+ template <>
905
+ EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) {
906
+ return por_hvx(a, b);
907
+ }
908
+ template <>
909
+ EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) {
910
+ return por_hvx(a, b);
911
+ }
912
+ template <>
913
+ EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) {
914
+ return por_hvx(a, b);
915
+ }
916
+
917
+ template <HVXPacketSize T>
918
+ EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
919
+ return HVXPacket<T>::Create(a.Get() ^ b.Get());
920
+ }
921
+ template <>
922
+ EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) {
923
+ return pxor_hvx(a, b);
924
+ }
925
+ template <>
926
+ EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) {
927
+ return pxor_hvx(a, b);
928
+ }
929
+ template <>
930
+ EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) {
931
+ return pxor_hvx(a, b);
932
+ }
933
+
934
+ template <HVXPacketSize T>
935
+ EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) {
936
+ return HVXPacket<T>::Create(~a.Get());
937
+ }
938
+ template <>
939
+ EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) {
940
+ return pnot_hvx(a);
941
+ }
942
+ template <>
943
+ EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) {
944
+ return pnot_hvx(a);
945
+ }
946
+ template <>
947
+ EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) {
948
+ return pnot_hvx(a);
949
+ }
950
+
951
+ template <HVXPacketSize T>
952
+ EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) {
953
+ HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
954
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
955
+ }
956
+ template <>
957
+ EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
958
+ return pselect_hvx(mask, a, b);
959
+ }
960
+ template <>
961
+ EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
962
+ return pselect_hvx(mask, a, b);
963
+ }
964
+ template <>
965
+ EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
966
+ return pselect_hvx(mask, a, b);
967
+ }
968
+
969
+ template <HVXPacketSize T, typename Op>
970
+ EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) {
971
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
972
+ HVXPacket<T> vredux = a;
973
+ for (int i = 1; i < packet_size; i <<= 1) {
974
+ vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float))));
975
+ }
976
+ return pfirst(vredux);
977
+ }
978
+
979
+ template <>
980
+ EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) {
981
+ return predux_generic(a, pmax<Packet32f>);
982
+ }
983
+ template <>
984
+ EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
985
+ return predux_generic(a, pmax<Packet16f>);
986
+ }
987
+ template <>
988
+ EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
989
+ return predux_generic(a, pmax<Packet8f>);
990
+ }
991
+
992
+ template <>
993
+ EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) {
994
+ return predux_generic(a, pmin<Packet32f>);
995
+ }
996
+ template <>
997
+ EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
998
+ return predux_generic(a, pmin<Packet16f>);
999
+ }
1000
+ template <>
1001
+ EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
1002
+ return predux_generic(a, pmin<Packet8f>);
1003
+ }
1004
+
1005
+ template <>
1006
+ EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) {
1007
+ return predux_generic(a, por<Packet32f>) != 0.0f;
1008
+ }
1009
+ template <>
1010
+ EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
1011
+ return predux_generic(a, por<Packet16f>) != 0.0f;
1012
+ }
1013
+ template <>
1014
+ EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
1015
+ return predux_generic(a, por<Packet8f>) != 0.0f;
1016
+ }
1017
+
1018
+ static const float index_vsf[32]
1019
+ __attribute__((aligned(__HVX_LENGTH__))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1020
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
1021
+
1022
+ template <HVXPacketSize T>
1023
+ EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) {
1024
+ return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
1025
+ }
1026
+ template <>
1027
+ EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
1028
+ return plset_hvx<HVXPacketSize::Full>(a);
1029
+ }
1030
+ template <>
1031
+ EIGEN_STRONG_INLINE Packet16f plset(const float& a) {
1032
+ return plset_hvx<HVXPacketSize::Half>(a);
1033
+ }
1034
+ template <>
1035
+ EIGEN_STRONG_INLINE Packet8f plset(const float& a) {
1036
+ return plset_hvx<HVXPacketSize::Quarter>(a);
1037
+ }
1038
+
1039
+ template <HVXPacketSize T>
1040
+ EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) {
1041
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
1042
+ float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
1043
+ pstore<float>(elements, from);
1044
+ for (Index i = 0; i < packet_size; ++i) {
1045
+ to[i * stride] = elements[i];
1046
+ }
1047
+ }
1048
+ template <>
1049
+ EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) {
1050
+ pscatter_hvx(to, from, stride);
1051
+ }
1052
+ template <>
1053
+ EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
1054
+ pscatter_hvx(to, from, stride);
1055
+ }
1056
+ template <>
1057
+ EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
1058
+ pscatter_hvx(to, from, stride);
1059
+ }
1060
+
1061
+ template <HVXPacketSize T>
1062
+ EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) {
1063
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
1064
+ float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
1065
+ for (Index i = 0; i < packet_size; i++) {
1066
+ elements[i] = from[i * stride];
1067
+ }
1068
+ return pload<HVXPacket<T>>(elements);
1069
+ }
1070
+ template <>
1071
+ EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) {
1072
+ return pgather_hvx<HVXPacketSize::Full>(from, stride);
1073
+ }
1074
+ template <>
1075
+ EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
1076
+ return pgather_hvx<HVXPacketSize::Half>(from, stride);
1077
+ }
1078
+ template <>
1079
+ EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
1080
+ return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
1081
+ }
1082
+
1083
+ } // end namespace internal
1084
+ } // end namespace Eigen
1085
+
1086
+ #endif // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
1087
+
1088
+ #endif // EIGEN_HVX_PACKET_MATH_H