@smake/eigen 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -20
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +187 -120
  6. package/eigen/Eigen/Eigenvalues +16 -13
  7. package/eigen/Eigen/Geometry +18 -18
  8. package/eigen/Eigen/Householder +9 -7
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -13
  11. package/eigen/Eigen/KLUSupport +23 -21
  12. package/eigen/Eigen/LU +15 -16
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -51
  15. package/eigen/Eigen/PaStiXSupport +23 -21
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -20
  18. package/eigen/Eigen/QtAlignedMalloc +5 -12
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -17
  21. package/eigen/Eigen/Sparse +1 -2
  22. package/eigen/Eigen/SparseCholesky +18 -15
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +9 -9
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
  37. package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
  42. package/eigen/Eigen/src/Core/Array.h +329 -370
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
  48. package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
  49. package/eigen/Eigen/src/Core/Block.h +371 -390
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
  53. package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
  59. package/eigen/Eigen/src/Core/DenseBase.h +630 -658
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
  61. package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +168 -207
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +167 -217
  67. package/eigen/Eigen/src/Core/EigenBase.h +74 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
  75. package/eigen/Eigen/src/Core/IO.h +131 -156
  76. package/eigen/Eigen/src/Core/IndexedView.h +209 -125
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +50 -59
  80. package/eigen/Eigen/src/Core/Map.h +123 -141
  81. package/eigen/Eigen/src/Core/MapBase.h +255 -282
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
  84. package/eigen/Eigen/src/Core/Matrix.h +463 -494
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
  86. package/eigen/Eigen/src/Core/NestByValue.h +58 -52
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -86
  88. package/eigen/Eigen/src/Core/NumTraits.h +206 -206
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
  92. package/eigen/Eigen/src/Core/Product.h +246 -130
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
  94. package/eigen/Eigen/src/Core/Random.h +153 -164
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +334 -314
  98. package/eigen/Eigen/src/Core/Ref.h +259 -257
  99. package/eigen/Eigen/src/Core/Replicate.h +92 -104
  100. package/eigen/Eigen/src/Core/Reshaped.h +215 -271
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +133 -148
  103. package/eigen/Eigen/src/Core/Select.h +68 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +88 -102
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
  109. package/eigen/Eigen/src/Core/SolverBase.h +132 -133
  110. package/eigen/Eigen/src/Core/StableNorm.h +113 -147
  111. package/eigen/Eigen/src/Core/StlIterators.h +404 -248
  112. package/eigen/Eigen/src/Core/Stride.h +90 -92
  113. package/eigen/Eigen/src/Core/Swap.h +70 -39
  114. package/eigen/Eigen/src/Core/Transpose.h +258 -295
  115. package/eigen/Eigen/src/Core/Transpositions.h +270 -333
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
  119. package/eigen/Eigen/src/Core/Visitor.h +464 -308
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
  217. package/eigen/Eigen/src/Core/util/Constants.h +297 -262
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
  226. package/eigen/Eigen/src/Core/util/Macros.h +655 -773
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +970 -748
  229. package/eigen/Eigen/src/Core/util/Meta.h +581 -633
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
  264. package/eigen/Eigen/src/Geometry/Transform.h +858 -936
  265. package/eigen/Eigen/src/Geometry/Translation.h +94 -92
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
  269. package/eigen/Eigen/src/Householder/Householder.h +102 -124
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
  285. package/eigen/Eigen/src/LU/Determinant.h +50 -69
  286. package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/package.json +1 -1
  409. package/eigen/COPYING.APACHE +0 -203
  410. package/eigen/COPYING.BSD +0 -26
  411. package/eigen/COPYING.GPL +0 -674
  412. package/eigen/COPYING.LGPL +0 -502
  413. package/eigen/COPYING.MINPACK +0 -51
  414. package/eigen/COPYING.MPL2 +0 -373
  415. package/eigen/COPYING.README +0 -18
  416. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
  417. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
  418. package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
  419. package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
  420. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  421. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  422. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  423. package/eigen/Eigen/src/misc/lapack.h +0 -152
  424. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
  425. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
  426. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  427. package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
  428. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  429. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
  430. package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
  431. package/eigen/README.md +0 -5
@@ -0,0 +1,1088 @@
1
+
2
+ #ifndef EIGEN_HVX_PACKET_MATH_H
3
+ #define EIGEN_HVX_PACKET_MATH_H
4
+
5
+ // Only 128-byte HVX is supported for now.
6
+ // Floating-point operations are supported only since V68.
7
+ #if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
8
+
9
+ // None of the floating-point operations conform to the IEEE standard.
10
+ // From HVX document:
11
+ // There is no concept of infinity or NaN. QFloat saturates to maximum
12
+ // exponent with maximum positive or minimum negative significand.
13
+
14
+ #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
15
+ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
16
+ #endif
17
+
18
+ namespace Eigen {
19
+ namespace internal {
20
+
21
+ // HVX utilities.
22
+ // Loads the full HVX vector located D vectors after the __HVX_LENGTH__-aligned address at or below m.
23
+ template <int D>
24
+ EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) {
25
+ HVX_Vector v;
26
+ #if EIGEN_COMP_CLANG
27
+ // Use inline assembly to issue an aligned vmem load on a possibly unaligned address.
28
+ // Casting to HVX_Vector* instead may interfere with the compiler's data-alignment assumptions.
29
+ __asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory");
30
+ #else
31
+ void* aligned_mem =  // round m down to a vector boundary, then advance by D whole vectors
32
+ reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
33
+ memcpy(&v, aligned_mem, __HVX_LENGTH__);
34
+ #endif
35
+ return v;
36
+ }
37
+ // Copies one full vector (__HVX_LENGTH__ bytes) from mem, viewing mem as a const HVX_Vector*; used by pload<Packet32f>.
38
+ template <typename T>
39
+ EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) {
40
+ HVX_Vector v;
41
+ memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__);
42
+ return v;
43
+ }
44
+ // Copies one full vector (__HVX_LENGTH__ bytes) from mem via plain memcpy, without the HVX_Vector* cast; used by ploadu<Packet32f>.
45
+ template <typename T>
46
+ EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) {
47
+ HVX_Vector v;
48
+ memcpy(&v, mem, __HVX_LENGTH__);
49
+ return v;
50
+ }
51
+ // Loads Size elements of T starting at mem; result bytes beyond Size * sizeof(T) are not meaningful.
52
+ template <size_t Size, size_t Alignment, typename T>
53
+ EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) {
54
+ #if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
55
+ // Fast partial vector load through aligned vmem loads.
56
+ // The load may read past the end of the array, but it is vector-aligned to prevent a memory fault.
57
+ HVX_Vector v0 = HVX_vmem<0>(mem);
58
+ HVX_Vector v1 = v0;
59
+ uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
60
+ EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) {
61
+ // Data no larger than its alignment never crosses into a second aligned vector.
62
+ v1 = v0;
63
+ }
64
+ else {
65
+ uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);  // byte offset of mem inside its aligned vector
66
+ if (left_off + Size * sizeof(T) > __HVX_LENGTH__) {  // the data spills into the next aligned vector
67
+ v1 = HVX_vmem<1>(mem);
68
+ } else {
69
+ v1 = v0;
70
+ }
71
+ }
72
+ return Q6_V_valign_VVR(v1, v0, mem_addr);  // NOTE(review): presumably shifts v1:v0 so the byte at mem becomes byte 0 - confirm against the HVX reference
73
+ #else
74
+ HVX_Vector v;
75
+ memcpy(&v, mem, Size * sizeof(T));  // only the first Size * sizeof(T) bytes of v are written
76
+ return v;
77
+ #endif
78
+ }
79
+ // Copies one full vector (__HVX_LENGTH__ bytes) from v to mem, viewing mem as an HVX_Vector*; used by pstore for Packet32f.
80
+ template <typename T>
81
+ EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) {
82
+ memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__);
83
+ }
84
+ // Copies one full vector (__HVX_LENGTH__ bytes) from v to mem via plain memcpy, without the HVX_Vector* cast; used by pstoreu for Packet32f.
85
+ template <typename T>
86
+ EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) {
87
+ memcpy(mem, &v, __HVX_LENGTH__);
88
+ }
89
+ // Stores Size elements of T from v to mem with predicated vmem stores; presumably only bytes [mem, mem + Size * sizeof(T)) are written.
90
+ template <size_t Size, size_t Alignment, typename T>
91
+ EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) {
92
+ uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
93
+ HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
94
+ uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);  // byte offset of mem inside its aligned vector
95
+ uintptr_t right_off = left_off + Size * sizeof(T);  // one past the last data byte, relative to that aligned vector
96
+ // NOTE(review): ql_not presumably marks the bytes before the start offset and qr the bytes before the end offset - confirm vsetq/vsetq2 semantics.
97
+ HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
98
+ HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);
99
+ // Only data larger than its alignment can straddle two aligned vectors; if it does, write the spill-over part first.
100
+ EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) {
101
+ if (right_off > __HVX_LENGTH__) {
102
+ Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value);
103
+ qr = Q6_Q_vcmp_eq_VbVb(value, value);  // a value always equals itself, so this yields an all-true predicate
104
+ }
105
+ }
106
+ // Combine both predicates and issue the store for the first aligned vector, predicated on the negated mask.
107
+ ql_not = Q6_Q_or_QQn(ql_not, qr);
108
+ Q6_vmem_QnRIV(ql_not, mem, value);
109
+ }
110
+
111
+ // Packet definitions.
112
+ enum class HVXPacketSize {  // tag that distinguishes the HVXPacket instantiations
113
+ Full,  // used by Packet32f
114
+ Half,  // used by Packet16f
115
+ Quarter,  // used by Packet8f
116
+ };
117
+
118
+ // The Hexagon compiler uses the same HVX_Vector type to represent all HVX vector types.
119
+ // Wrap each vector type (float32, int32, etc.) in a distinct class with an
120
+ // explicit constructor and explicit conversion back and forth to HVX_Vector.
121
+ template <HVXPacketSize T>
122
+ class HVXPacket {
123
+ public:
124
+ HVXPacket() = default;  // m_val takes its default member initializer (a zero vector)
125
+ static HVXPacket Create(HVX_Vector v) { return HVXPacket(v); }  // the only public way to wrap a raw vector
126
+ HVX_Vector Get() const { return m_val; }  // returns the wrapped raw vector
127
+
128
+ private:
129
+ explicit HVXPacket(HVX_Vector v) : m_val(v) {}  // private, so callers must go through Create()
130
+ HVX_Vector m_val = Q6_V_vzero();
131
+ };
132
+ // Float packet types; per unpacket_traits they hold 32, 16 and 8 floats, each wrapping one HVX_Vector.
133
+ typedef HVXPacket<HVXPacketSize::Full> Packet32f;
134
+ typedef HVXPacket<HVXPacketSize::Half> Packet16f;
135
+ typedef HVXPacket<HVXPacketSize::Quarter> Packet8f;
136
+
137
+ // Packet traits: float uses Packet32f as its packet type and Packet16f as its half packet.
138
+ template <>
139
+ struct packet_traits<float> : default_packet_traits {
140
+ typedef Packet32f type;
141
+ typedef Packet16f half;
142
+ enum {
143
+ Vectorizable = 1,
144
+ AlignedOnScalar = 1,
145
+ size = 32,  // floats per Packet32f
146
+ // Basic arithmetic and comparison capabilities (1 = advertised, 0 = not advertised).
147
+ HasCmp = 1,
148
+ HasAdd = 1,
149
+ HasSub = 1,
150
+ HasShift = 0,
151
+ HasMul = 1,
152
+ HasNegate = 1,
153
+ HasAbs = 1,
154
+ HasArg = 0,
155
+ HasAbs2 = 0,
156
+ HasAbsDiff = 0,
157
+ HasMin = 1,
158
+ HasMax = 1,
159
+ HasConj = 0,
160
+ HasSetLinear = 0,
161
+ HasBlend = 0,
162
+ // Division is not advertised.
163
+ HasDiv = 0,
164
+ // No transcendental or special functions are advertised.
165
+ HasSin = 0,
166
+ HasCos = 0,
167
+ HasACos = 0,
168
+ HasASin = 0,
169
+ HasATan = 0,
170
+ HasATanh = 0,
171
+ HasLog = 0,
172
+ HasExp = 0,
173
+ HasSqrt = 0,
174
+ HasRsqrt = 0,
175
+ HasTanh = 0,
176
+ HasErf = 0,
177
+ HasBessel = 0,
178
+ HasNdtri = 0
179
+ };
180
+ };
181
+ // Traits of the full packet: 32 floats, 128-byte alignment, no masked load/store.
182
+ template <>
183
+ struct unpacket_traits<Packet32f> {
184
+ typedef float type;
185
+ typedef Packet16f half;
186
+ enum {
187
+ size = 32,
188
+ alignment = Aligned128,
189
+ vectorizable = true,
190
+ masked_load_available = false,
191
+ masked_store_available = false
192
+ };
193
+ };
194
+ // Traits of the half packet: 16 floats, 64-byte alignment, no masked load/store.
195
+ template <>
196
+ struct unpacket_traits<Packet16f> {
197
+ typedef float type;
198
+ typedef Packet8f half;
199
+ enum {
200
+ size = 16,
201
+ // Much code assumes alignment equal to the packet size instead of consulting this trait,
202
+ // so Aligned128 is not used here, even though it would optimize aligned load/store.
203
+ alignment = Aligned64,
204
+ vectorizable = true,
205
+ masked_load_available = false,
206
+ masked_store_available = false
207
+ };
208
+ };
209
+ // Traits of the quarter packet: 8 floats, 32-byte alignment, no masked load/store.
210
+ template <>
211
+ struct unpacket_traits<Packet8f> {
212
+ typedef float type;
213
+ typedef Packet8f half;  // half maps back to Packet8f itself
214
+ enum {
215
+ size = 8,
216
+ // Much code assumes alignment equal to the packet size instead of consulting this trait,
217
+ // so Aligned128 is not used here, even though it would optimize aligned load/store.
218
+ alignment = Aligned32,
219
+ vectorizable = true,
220
+ masked_load_available = false,
221
+ masked_store_available = false
222
+ };
223
+ };
224
+
225
+ // float32 operations.
226
+ template <HVXPacketSize T>  // shared pzero implementation: the argument is ignored and a zero vector is returned
227
+ EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) {
228
+ return HVXPacket<T>::Create(Q6_V_vzero());
229
+ }
230
+ template <>
231
+ EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(const Packet32f&) {
232
+ return pzero_hvx(Packet32f());  // the default-constructed packet only selects the template instantiation
233
+ }
234
+ template <>
235
+ EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(const Packet16f&) {
236
+ return pzero_hvx(Packet16f());
237
+ }
238
+ template <>
239
+ EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
240
+ return pzero_hvx(Packet8f());
241
+ }
242
+ // Adds the packet to a copy of itself rotated by half the packet's byte size and returns the sum as the half-size packet type.
243
+ template <HVXPacketSize T>
244
+ EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
245
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
246
+ return unpacket_traits<HVXPacket<T>>::half::Create(
247
+ Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
248
+ }
249
+ template <>
250
+ EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
251
+ return predux_half_dowto4_hvx(a);
252
+ }
253
+ template <>
254
+ EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
255
+ return predux_half_dowto4_hvx(a);
256
+ }
257
+ // Broadcasts the 32-bit pattern of `from` to every 32-bit lane of the vector, for all three packet sizes.
258
+ template <HVXPacketSize T>
259
+ EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) {
260
+ union {  // reinterprets the float's bits as int32_t, the type passed to the splat intrinsic
261
+ float f;
262
+ int32_t i;
263
+ } u;
264
+ u.f = from;
265
+ return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
266
+ }
267
+ template <>
268
+ EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
269
+ return pset1_hvx<HVXPacketSize::Full>(from);
270
+ }
271
+ template <>
272
+ EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
273
+ return pset1_hvx<HVXPacketSize::Half>(from);
274
+ }
275
+ template <>
276
+ EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
277
+ return pset1_hvx<HVXPacketSize::Quarter>(from);
278
+ }
279
+ // Aligned loads: the full packet copies a whole vector; the smaller packets do a partial load using the alignment from their traits.
280
+ template <>
281
+ EIGEN_STRONG_INLINE Packet32f pload<Packet32f>(const float* from) {
282
+ return Packet32f::Create(HVX_load(from));
283
+ }
284
+ template <>
285
+ EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
286
+ return Packet16f::Create(
287
+ HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
288
+ }
289
+ template <>
290
+ EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
291
+ return Packet8f::Create(
292
+ HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
293
+ }
294
+ // Unaligned loads: the smaller packets pass Alignment = 0, so the partial load always takes its cross-vector check path.
295
+ template <>
296
+ EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) {
297
+ return Packet32f::Create(HVX_loadu(from));
298
+ }
299
+ template <>
300
+ EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
301
+ return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from));
302
+ }
303
+ template <>
304
+ EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
305
+ return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from));
306
+ }
307
+ // Aligned stores: the full packet writes a whole vector; the smaller packets do a partial store using the alignment from their traits.
308
+ template <>
309
+ EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) {
310
+ HVX_store(to, from.Get());
311
+ }
312
+ template <>
313
+ EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
314
+ HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
315
+ }
316
+ template <>
317
+ EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
318
+ HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
319
+ }
320
+ // Unaligned stores: the smaller packets pass Alignment = 0, so the partial store always checks for a cross-vector write.
321
+ template <>
322
+ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) {
323
+ HVX_storeu(to, from.Get());
324
+ }
325
+ template <>
326
+ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
327
+ HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get());
328
+ }
329
+ template <>
330
+ EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
331
+ HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get());
332
+ }
333
+ // Lane-wise float multiply: the product is computed as qf32 and converted back to sf (formats as named by the intrinsics).
334
+ template <HVXPacketSize T>
335
+ EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
336
+ return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
337
+ }
338
+ template <>
339
+ EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
340
+ return pmul_hvx(a, b);
341
+ }
342
+ template <>
343
+ EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
344
+ return pmul_hvx(a, b);
345
+ }
346
+ template <>
347
+ EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
348
+ return pmul_hvx(a, b);
349
+ }
350
+ // Lane-wise float add: the sum is computed as qf32 and converted back to sf (formats as named by the intrinsics).
351
+ template <HVXPacketSize T>
352
+ EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
353
+ return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
354
+ }
355
+ template <>
356
+ EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
357
+ return padd_hvx(a, b);
358
+ }
359
+ template <>
360
+ EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
361
+ return padd_hvx(a, b);
362
+ }
363
+ template <>
364
+ EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
365
+ return padd_hvx(a, b);
366
+ }
367
+ // Lane-wise float subtract (a - b): the difference is computed as qf32 and converted back to sf (formats as named by the intrinsics).
368
+ template <HVXPacketSize T>
369
+ EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
370
+ return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
371
+ }
372
+ template <>
373
+ EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
374
+ return psub_hvx(a, b);
375
+ }
376
+ template <>
377
+ EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
378
+ return psub_hvx(a, b);
379
+ }
380
+ template <>
381
+ EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
382
+ return psub_hvx(a, b);
383
+ }
384
+ // Negates every lane by XOR-ing bit 31 (mask 0x80000000, the float sign bit) of each 32-bit word.
385
+ template <HVXPacketSize T>
386
+ EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) {
387
+ return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
388
+ }
389
+ template <>
390
+ EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) {
391
+ return pnegate_hvx(a);
392
+ }
393
+ template <>
394
+ EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
395
+ return pnegate_hvx(a);
396
+ }
397
+ template <>
398
+ EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
399
+ return pnegate_hvx(a);
400
+ }
401
+ // "True" packet: every 32-bit lane holds 0x3f800000 (the bit pattern of 1.0f), not all-ones; the argument is unused.
402
+ template <HVXPacketSize T>
403
+ EIGEN_STRONG_INLINE HVXPacket<T> ptrue_hvx(const HVXPacket<T>& a) {
404
+ return HVXPacket<T>::Create(Q6_V_vsplat_R(0x3f800000));
405
+ }
406
+ template <>
407
+ EIGEN_STRONG_INLINE Packet32f ptrue(const Packet32f& a) {
408
+ return ptrue_hvx(a);
409
+ }
410
+ template <>
411
+ EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) {
412
+ return ptrue_hvx(a);
413
+ }
414
+ template <>
415
+ EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) {
416
+ return ptrue_hvx(a);
417
+ }
418
+ // a <= b per lane, computed as the negation of (a > b): lanes where a > b get zero, all other lanes get the ptrue value.
419
+ template <HVXPacketSize T>
420
+ EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
421
+ HVX_Vector v_true = ptrue(a).Get();
422
+ HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
423
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));  // note the swapped operands: zero where pred holds
424
+ }
425
+ template <>
426
+ EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
427
+ return pcmp_le_hvx(a, b);
428
+ }
429
+ template <>
430
+ EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
431
+ return pcmp_le_hvx(a, b);
432
+ }
433
+ template <>
434
+ EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
435
+ return pcmp_le_hvx(a, b);
436
+ }
437
+ // a == b per lane: lanes that compare equal get the ptrue value, all other lanes get zero.
438
+ template <HVXPacketSize T>
439
+ EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
440
+ HVX_Vector v_true = ptrue(a).Get();
441
+ HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());  // NOTE(review): word (Vw) compare of the raw bits, so +0.0f and -0.0f would differ - confirm this is intended
442
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
443
+ }
444
+ template <>
445
+ EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
446
+ return pcmp_eq_hvx(a, b);
447
+ }
448
+ template <>
449
+ EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
450
+ return pcmp_eq_hvx(a, b);
451
+ }
452
+ template <>
453
+ EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
454
+ return pcmp_eq_hvx(a, b);
455
+ }
456
+ // a < b per lane, evaluated as (b > a): matching lanes get the ptrue value, all other lanes get zero.
457
+ template <HVXPacketSize T>
458
+ EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
459
+ HVX_Vector v_true = ptrue(a).Get();
460
+ HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
461
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
462
+ }
463
+ template <>
464
+ EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) {
465
+ return pcmp_lt_hvx(a, b);
466
+ }
467
+ template <>
468
+ EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
469
+ return pcmp_lt_hvx(a, b);
470
+ }
471
+ template <>
472
+ EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
473
+ return pcmp_lt_hvx(a, b);
474
+ }
475
+ // Same body as pcmp_lt_hvx: per the note at the top of this file HVX floats have no NaN, so no separate NaN handling is done.
476
+ template <HVXPacketSize T>
477
+ EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
478
+ HVX_Vector v_true = ptrue(a).Get();
479
+ HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
480
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
481
+ }
482
+ template <>
483
+ EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
484
+ return pcmp_lt_or_nan_hvx(a, b);
485
+ }
486
+ template <>
487
+ EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
488
+ return pcmp_lt_or_nan_hvx(a, b);
489
+ }
490
+ template <>
491
+ EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
492
+ return pcmp_lt_or_nan_hvx(a, b);
493
+ }
494
+ // Absolute value per lane: AND with 0x7FFFFFFF clears bit 31 (the float sign bit) of every 32-bit word.
495
+ template <HVXPacketSize T>
496
+ EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) {
497
+ return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
498
+ }
499
+ template <>
500
+ EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) {
501
+ return pabs_hvx(a);
502
+ }
503
+ template <>
504
+ EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
505
+ return pabs_hvx(a);
506
+ }
507
+ template <>
508
+ EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
509
+ return pabs_hvx(a);
510
+ }
511
+ // Returns the first float of the packet, read from the first sizeof(float) bytes of the vector through a union.
512
+ template <HVXPacketSize T>
513
+ EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) {
514
+ union {  // overlays a one-float array on the vector's storage
515
+ float array[1];
516
+ HVX_Vector vector;
517
+ } HVX_and_array;
518
+ HVX_and_array.vector = a.Get();
519
+ return HVX_and_array.array[0];
520
+ }
521
+ template <>
522
+ EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) {
523
+ return pfirst_hvx(a);
524
+ }
525
+ template <>
526
+ EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) {
527
+ return pfirst_hvx(a);
528
+ }
529
+ template <>
530
+ EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) {
531
+ return pfirst_hvx(a);
532
+ }
533
+ // In-place transpose of a block of 4 Packet32f using two rounds of vshuff (32-bit, then 64-bit granularity).
534
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
535
+ // Shuffle the 32-bit lanes.
536
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
537
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
538
+
539
+ // Shuffle the 64-bit lanes.
540
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
541
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
542
+ kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));  // both halves of each shuffled pair become output packets
543
+ kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
544
+ kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
545
+ kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
546
+ }
547
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {  // in-place transpose of a block of 4 Packet16f
548
+ // Shuffle the 32-bit lanes.
549
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
550
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
551
+
552
+ // Shuffle the 64-bit lanes.
553
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
554
+ // Each half of the pair yields two outputs: the vector as-is, and the vector valign-ed with itself by 64 bytes.
555
+ kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
556
+ kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
557
+ kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
558
+ kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
559
+ }
560
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) {  // in-place transpose of a block of 4 Packet8f
561
+ // Shuffle the 32-bit lanes.
562
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
563
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
564
+
565
+ // Shuffle the 64-bit lanes.
566
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
567
+ // All four outputs come from the low vector of the pair, valign-ed with itself by 0, 32, 64 and 96 bytes.
568
+ kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
569
+ kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32));
570
+ kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
571
+ kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96));
572
+ }
573
+ // In-place transpose of a block of 8 Packet8f using three rounds of vshuff (32-, 64- and 128-bit granularity).
574
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
575
+ // Shuffle the 32-bit lanes.
576
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
577
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
578
+ HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
579
+ HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
580
+
581
+ // Shuffle the 64-bit lanes.
582
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
583
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
584
+
585
+ // Shuffle the 128-bit lanes.
586
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);  // reuses v_0_1_0 as the final pair
587
+ // Each vector of the final pair yields four outputs, valign-ed with itself by 0, 32, 64 and 96 bytes.
588
+ kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
589
+ kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32));
590
+ kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64));
591
+ kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96));
592
+ kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
593
+ kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32));
594
+ kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64));
595
+ kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96));
596
+ }
597
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {  // in-place transpose of a block of 16 Packet16f
598
+ // Shuffle the 32-bit lanes.
599
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
600
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
601
+ HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
602
+ HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
603
+ HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
604
+ HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
605
+ HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
606
+ HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
607
+
608
+ // Shuffle the 64-bit lanes.
609
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
610
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
611
+ HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
612
+ HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
613
+
614
+ // Shuffle the 128-bit lanes.
615
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);  // the v_0_* pairs are reused as scratch from here on
616
+ v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
617
+ v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16);
618
+ v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16);
619
+
620
+ // Shuffle the 256-bit lanes.
621
+ v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
622
+ v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
623
+ v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
624
+ v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
625
+ // Each vector of the final pairs yields two outputs: the vector as-is, and the vector valign-ed with itself by 64 bytes.
626
+ kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
627
+ kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
628
+ kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
629
+ kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
630
+ kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
631
+ kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64));
632
+ kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
633
+ kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64));
634
+ kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4));
635
+ kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64));
636
+ kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4));
637
+ kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64));
638
+ kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6));
639
+ kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64));
640
+ kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6));
641
+ kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64));
642
+ }
643
+ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
644
+ // Shuffle the 32-bit lanes.
645
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
646
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
647
+ HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
648
+ HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
649
+ HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
650
+ HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
651
+ HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
652
+ HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
653
+ HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
654
+ HVX_VectorPair v_0_19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
655
+ HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
656
+ HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
657
+ HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
658
+ HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
659
+ HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
660
+ HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);
661
+
662
+ // Shuffle the 64-bit lanes.
663
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
664
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
665
+ HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
666
+ HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), HEXAGON_HVX_GET_V1(v_0_5_4), -8);
667
+ HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
668
+ HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_9_8), -8);
669
+ HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
670
+ HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_13_12), -8);
671
+ HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), HEXAGON_HVX_GET_V0(v_0_17_16), -8);
672
+ HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), HEXAGON_HVX_GET_V1(v_0_17_16), -8);
673
+ HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), HEXAGON_HVX_GET_V0(v_0_21_20), -8);
674
+ HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), HEXAGON_HVX_GET_V1(v_0_21_20), -8);
675
+ HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_25_24), -8);
676
+ HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_25_24), -8);
677
+ HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_29_28), -8);
678
+ HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_29_28), -8);
679
+
680
+ // Shuffle the 128-bit lanes.
681
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
682
+ v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
683
+ v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_3_2), -16);
684
+ v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_3_2), -16);
685
+ v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), HEXAGON_HVX_GET_V0(v_1_9_8), -16);
686
+ v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), HEXAGON_HVX_GET_V1(v_1_9_8), -16);
687
+ v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), HEXAGON_HVX_GET_V0(v_1_11_10), -16);
688
+ v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), HEXAGON_HVX_GET_V1(v_1_11_10), -16);
689
+ v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_17_16), -16);
690
+ v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_17_16), -16);
691
+ v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_19_18), -16);
692
+ v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_19_18), -16);
693
+ v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_25_24), -16);
694
+ v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_25_24), -16);
695
+ v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_27_26), -16);
696
+ v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_27_26), -16);
697
+
698
+ // Shuffle the 256-bit lanes.
699
+ v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
700
+ v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
701
+ v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
702
+ v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
703
+ v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), HEXAGON_HVX_GET_V0(v_0_5_4), -32);
704
+ v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), HEXAGON_HVX_GET_V1(v_0_5_4), -32);
705
+ v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_7_6), -32);
706
+ v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_7_6), -32);
707
+ v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), HEXAGON_HVX_GET_V0(v_0_17_16), -32);
708
+ v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), HEXAGON_HVX_GET_V1(v_0_17_16), -32);
709
+ v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_19_18), -32);
710
+ v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_19_18), -32);
711
+ v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), HEXAGON_HVX_GET_V0(v_0_21_20), -32);
712
+ v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), HEXAGON_HVX_GET_V1(v_0_21_20), -32);
713
+ v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_23_22), -32);
714
+ v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_23_22), -32);
715
+
716
+ // Shuffle the 512-bit lanes.
717
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), HEXAGON_HVX_GET_V0(v_1_1_0), -64);
718
+ v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), HEXAGON_HVX_GET_V1(v_1_1_0), -64);
719
+ v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), HEXAGON_HVX_GET_V0(v_1_3_2), -64);
720
+ v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), HEXAGON_HVX_GET_V1(v_1_3_2), -64);
721
+ v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_5_4), -64);
722
+ v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_5_4), -64);
723
+ v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_7_6), -64);
724
+ v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_7_6), -64);
725
+ v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), HEXAGON_HVX_GET_V0(v_1_9_8), -64);
726
+ v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), HEXAGON_HVX_GET_V1(v_1_9_8), -64);
727
+ v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), HEXAGON_HVX_GET_V0(v_1_11_10), -64);
728
+ v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), HEXAGON_HVX_GET_V1(v_1_11_10), -64);
729
+ v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_13_12), -64);
730
+ v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_13_12), -64);
731
+ v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_15_14), -64);
732
+ v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_15_14), -64);
733
+
734
+ kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
735
+ kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
736
+ kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_3_2));
737
+ kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_3_2));
738
+ kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_5_4));
739
+ kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_5_4));
740
+ kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_7_6));
741
+ kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_7_6));
742
+ kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_9_8));
743
+ kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_9_8));
744
+ kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_11_10));
745
+ kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_11_10));
746
+ kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_13_12));
747
+ kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_13_12));
748
+ kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_15_14));
749
+ kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_15_14));
750
+ kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_17_16));
751
+ kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_17_16));
752
+ kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_19_18));
753
+ kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_19_18));
754
+ kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_21_20));
755
+ kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_21_20));
756
+ kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_23_22));
757
+ kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_23_22));
758
+ kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_25_24));
759
+ kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_25_24));
760
+ kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_27_26));
761
+ kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_27_26));
762
+ kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_29_28));
763
+ kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_29_28));
764
+ kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_31_30));
765
+ kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
766
+ }
767
+
768
// Sums the float lanes of an HVX packet and returns the total as a scalar.
// The first step adds the input to a copy of itself passed through Q6_V_vror_VR with a
// byte count of sizeof(float); the loop repeats this with the distance doubling
// (2, 4, ... floats) for as long as it stays below the packet size.
// The running sum is kept in whatever format the *_Vqf32_* adds produce and is converted
// with Q6_Vsf_equals_Vqf32 just before pfirst() extracts the result.
// NOTE(review): vror is presumably a byte-wise vector rotate, which would make this a
// log2(size)-step reduction whose total ends up in lane 0 -- confirm against the HVX reference.
+ template <HVXPacketSize T>
769
+ EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
770
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
771
// First step reads the packet directly (VsfVsf variant); later steps operate on the
// previous vsum (Vqf32Vqf32 variant), which is why the loop starts at i = 2.
+ HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
772
+ for (int i = 2; i < packet_size; i <<= 1) {
773
+ vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
774
+ }
775
+ return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
776
+ }
777
// predux specializations for Packet32f, Packet16f and Packet8f.
// Each one simply forwards to predux_hvx, which deduces the packet size from its argument.
+ template <>
778
+ EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
779
+ return predux_hvx(a);
780
+ }
781
+ template <>
782
+ EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
783
+ return predux_hvx(a);
784
+ }
785
+ template <>
786
+ EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
787
+ return predux_hvx(a);
788
+ }
789
+
790
// Builds a packet from half a packet's worth of floats read at `from`.
// HVX_load_partial is instantiated with size = (packet size / 2); the loaded vector is
// then shuffled with itself via Q6_W_vshuff_VVR(load, load, -4) and the V0 half of the
// resulting pair is returned.
// NOTE(review): presumably the -4 shuffle interleaves 4-byte elements, giving
// {from[0], from[0], from[1], from[1], ...} -- confirm against the HVX reference.
+ template <HVXPacketSize T>
791
+ EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) {
792
+ constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
793
+ HVX_Vector load = HVX_load_partial<size, 0>(from);
794
+ HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
795
+ return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
796
+ }
797
// ploaddup specializations for Packet32f, Packet16f and Packet8f.
// Each forwards to ploaddup_hvx with the Full, Half or Quarter packet-size tag respectively.
+ template <>
798
+ EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) {
799
+ return ploaddup_hvx<HVXPacketSize::Full>(from);
800
+ }
801
+ template <>
802
+ EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) {
803
+ return ploaddup_hvx<HVXPacketSize::Half>(from);
804
+ }
805
+ template <>
806
+ EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) {
807
+ return ploaddup_hvx<HVXPacketSize::Quarter>(from);
808
+ }
809
+
810
// Builds a packet from a quarter of a packet's worth of floats read at `from`.
// HVX_load_partial is instantiated with size = (packet size / 4). The loaded vector is
// shuffled with itself using -4, then the V0 half of that result is shuffled with itself
// using -8, and the V0 half of the second pair is returned.
// NOTE(review): presumably the two shuffles duplicate each float and then each 8-byte
// pair, giving every source float four times in a row -- confirm against the HVX reference.
+ template <HVXPacketSize T>
811
+ EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) {
812
+ constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
813
+ HVX_Vector load = HVX_load_partial<size, 0>(from);
814
+ HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
815
+ HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
816
+ return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
817
+ }
818
// ploadquad specializations for Packet32f, Packet16f and Packet8f.
// Each forwards to ploadquad_hvx with the Full, Half or Quarter packet-size tag respectively.
+ template <>
819
+ EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
820
+ return ploadquad_hvx<HVXPacketSize::Full>(from);
821
+ }
822
+ template <>
823
+ EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) {
824
+ return ploadquad_hvx<HVXPacketSize::Half>(from);
825
+ }
826
+ template <>
827
+ EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) {
828
+ return ploadquad_hvx<HVXPacketSize::Quarter>(from);
829
+ }
830
+
831
// preverse for Packet32f: permutes the packet with Q6_V_vdelta_VV, using a control
// vector in which every byte is 0x7c (124 = 31 * 4).
// NOTE(review): presumably this control value makes vdelta reverse the order of the
// 4-byte lanes across 32 floats -- confirm against the HVX vdelta documentation.
+ template <>
832
+ EIGEN_STRONG_INLINE Packet32f preverse(const Packet32f& a) {
833
+ HVX_Vector delta = Q6_Vb_vsplat_R(0x7c);
834
+ return Packet32f::Create(Q6_V_vdelta_VV(a.Get(), delta));
835
+ }
836
+
837
// preverse for Packet16f: permutes the packet with Q6_V_vdelta_VV, using a control
// vector in which every byte is 0x3c (60 = 15 * 4).
// NOTE(review): presumably this reverses the 4-byte lanes within a 16-float group --
// confirm against the HVX vdelta documentation.
+ template <>
838
+ EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
839
+ HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
840
+ return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
841
+ }
842
+
843
// preverse for Packet8f: permutes the packet with Q6_V_vdelta_VV, using a control
// vector in which every byte is 0x1c (28 = 7 * 4).
// NOTE(review): presumably this reverses the 4-byte lanes within an 8-float group --
// confirm against the HVX vdelta documentation.
+ template <>
844
+ EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
845
+ HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
846
+ return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
847
+ }
848
+
849
// Lane-wise minimum of two float packets, computed by Q6_Vsf_vmin_VsfVsf on the raw
// vectors returned by Get() and re-wrapped with Create().
// NOTE(review): NaN handling is whatever the intrinsic provides; it is not visible here.
+ template <HVXPacketSize T>
850
+ EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
851
+ return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
852
+ }
853
// pmin specializations for Packet32f, Packet16f and Packet8f; each forwards to pmin_hvx.
+ template <>
854
+ EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
855
+ return pmin_hvx(a, b);
856
+ }
857
+ template <>
858
+ EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) {
859
+ return pmin_hvx(a, b);
860
+ }
861
+ template <>
862
+ EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) {
863
+ return pmin_hvx(a, b);
864
+ }
865
+
866
// Lane-wise maximum of two float packets, computed by Q6_Vsf_vmax_VsfVsf on the raw
// vectors returned by Get() and re-wrapped with Create().
// NOTE(review): NaN handling is whatever the intrinsic provides; it is not visible here.
+ template <HVXPacketSize T>
867
+ EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
868
+ return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
869
+ }
870
// pmax specializations for Packet32f, Packet16f and Packet8f; each forwards to pmax_hvx.
+ template <>
871
+ EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) {
872
+ return pmax_hvx(a, b);
873
+ }
874
+ template <>
875
+ EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) {
876
+ return pmax_hvx(a, b);
877
+ }
878
+ template <>
879
+ EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) {
880
+ return pmax_hvx(a, b);
881
+ }
882
+
883
// Bitwise AND of two packets: applies operator& to the raw vectors returned by Get()
// and wraps the result with Create(). The float lanes are treated purely as bit patterns.
+ template <HVXPacketSize T>
884
+ EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
885
+ return HVXPacket<T>::Create(a.Get() & b.Get());
886
+ }
887
// pand specializations for Packet32f, Packet16f and Packet8f; each forwards to pand_hvx.
+ template <>
888
+ EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) {
889
+ return pand_hvx(a, b);
890
+ }
891
+ template <>
892
+ EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) {
893
+ return pand_hvx(a, b);
894
+ }
895
+ template <>
896
+ EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) {
897
+ return pand_hvx(a, b);
898
+ }
899
+
900
// Bitwise OR of two packets: applies operator| to the raw vectors returned by Get()
// and wraps the result with Create(). The float lanes are treated purely as bit patterns.
+ template <HVXPacketSize T>
901
+ EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
902
+ return HVXPacket<T>::Create(a.Get() | b.Get());
903
+ }
904
// por specializations for Packet32f, Packet16f and Packet8f; each forwards to por_hvx.
+ template <>
905
+ EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) {
906
+ return por_hvx(a, b);
907
+ }
908
+ template <>
909
+ EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) {
910
+ return por_hvx(a, b);
911
+ }
912
+ template <>
913
+ EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) {
914
+ return por_hvx(a, b);
915
+ }
916
+
917
// Bitwise XOR of two packets: applies operator^ to the raw vectors returned by Get()
// and wraps the result with Create(). The float lanes are treated purely as bit patterns.
+ template <HVXPacketSize T>
918
+ EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
919
+ return HVXPacket<T>::Create(a.Get() ^ b.Get());
920
+ }
921
// pxor specializations for Packet32f, Packet16f and Packet8f; each forwards to pxor_hvx.
+ template <>
922
+ EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) {
923
+ return pxor_hvx(a, b);
924
+ }
925
+ template <>
926
+ EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) {
927
+ return pxor_hvx(a, b);
928
+ }
929
+ template <>
930
+ EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) {
931
+ return pxor_hvx(a, b);
932
+ }
933
+
934
// Bitwise complement of a packet: applies operator~ to the raw vector returned by Get()
// and wraps the result with Create().
+ template <HVXPacketSize T>
935
+ EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) {
936
+ return HVXPacket<T>::Create(~a.Get());
937
+ }
938
// pnot specializations for Packet32f, Packet16f and Packet8f; each forwards to pnot_hvx.
+ template <>
939
+ EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) {
940
+ return pnot_hvx(a);
941
+ }
942
+ template <>
943
+ EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) {
944
+ return pnot_hvx(a);
945
+ }
946
+ template <>
947
+ EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) {
948
+ return pnot_hvx(a);
949
+ }
950
+
951
// Chooses between `a` and `b` according to `mask`.
// The predicate is built by comparing the mask against an all-zero vector for equality,
// so it marks the positions where the mask is zero. The mux is then called with `b` as
// its first vector operand and `a` as its second.
// NOTE(review): presumably the comparison is per 32-bit word (Vw suffix) and vmux takes
// its first vector operand where the predicate is set, i.e. the result is `b` where the
// mask is zero and `a` elsewhere -- confirm against the HVX reference.
+ template <HVXPacketSize T>
952
+ EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) {
953
+ HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
954
+ return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
955
+ }
956
// pselect specializations for Packet32f, Packet16f and Packet8f; each forwards to pselect_hvx.
+ template <>
957
+ EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
958
+ return pselect_hvx(mask, a, b);
959
+ }
960
+ template <>
961
+ EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
962
+ return pselect_hvx(mask, a, b);
963
+ }
964
+ template <>
965
+ EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
966
+ return pselect_hvx(mask, a, b);
967
+ }
968
+
969
// Reduces a packet to one float using a caller-supplied binary packet operation `op`.
// Starting from a copy of `a`, each iteration combines the running packet with a copy of
// itself passed through Q6_V_vror_VR by i floats' worth of bytes, with i = 1, 2, 4, ...
// while i is below the packet size. The first lane of the final packet is returned.
// NOTE(review): presumably vror is a byte-wise rotate, so after the loop lane 0 holds
// `op` folded over all lanes; this relies on `op` being associative and commutative.
+ template <HVXPacketSize T, typename Op>
970
+ EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) {
971
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
972
+ HVXPacket<T> vredux = a;
973
+ for (int i = 1; i < packet_size; i <<= 1) {
974
+ vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float))));
975
+ }
976
+ return pfirst(vredux);
977
+ }
978
+
979
// predux_max specializations for Packet32f, Packet16f and Packet8f.
// Each runs predux_generic with the matching pmax specialization as the combining operation.
+ template <>
980
+ EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) {
981
+ return predux_generic(a, pmax<Packet32f>);
982
+ }
983
+ template <>
984
+ EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
985
+ return predux_generic(a, pmax<Packet16f>);
986
+ }
987
+ template <>
988
+ EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
989
+ return predux_generic(a, pmax<Packet8f>);
990
+ }
991
+
992
// predux_min specializations for Packet32f, Packet16f and Packet8f.
// Each runs predux_generic with the matching pmin specialization as the combining operation.
+ template <>
993
+ EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) {
994
+ return predux_generic(a, pmin<Packet32f>);
995
+ }
996
+ template <>
997
+ EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
998
+ return predux_generic(a, pmin<Packet16f>);
999
+ }
1000
+ template <>
1001
+ EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
1002
+ return predux_generic(a, pmin<Packet8f>);
1003
+ }
1004
+
1005
// predux_any specializations for Packet32f, Packet16f and Packet8f.
// Each folds the packet with the matching por specialization via predux_generic and
// reports whether the resulting float differs from 0.0f.
// NOTE(review): the test is a floating-point comparison of the OR-ed bit pattern, so a
// result whose only set bit is the sign bit (-0.0f) would compare equal to 0.0f and yield
// false -- confirm that masks passed here never take that form.
+ template <>
1006
+ EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) {
1007
+ return predux_generic(a, por<Packet32f>) != 0.0f;
1008
+ }
1009
+ template <>
1010
+ EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
1011
+ return predux_generic(a, por<Packet16f>) != 0.0f;
1012
+ }
1013
+ template <>
1014
+ EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
1015
+ return predux_generic(a, por<Packet8f>) != 0.0f;
1016
+ }
1017
+
1018
// Lookup table holding the values 0 through 31 as floats, aligned to __HVX_LENGTH__
// bytes. plset_hvx loads it as the per-lane offset vector.
+ static const float index_vsf[32]
1019
+ __attribute__((aligned(__HVX_LENGTH__))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1020
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
1021
+
1022
// Returns the packet formed by adding `a` (broadcast with pset1) to the lane offsets
// loaded from index_vsf, i.e. {a + 0, a + 1, a + 2, ...} for as many lanes as pload reads.
+ template <HVXPacketSize T>
1023
+ EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) {
1024
+ return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
1025
+ }
1026
// plset specializations for Packet32f, Packet16f and Packet8f.
// Each forwards to plset_hvx with the Full, Half or Quarter packet-size tag respectively.
+ template <>
1027
+ EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
1028
+ return plset_hvx<HVXPacketSize::Full>(a);
1029
+ }
1030
+ template <>
1031
+ EIGEN_STRONG_INLINE Packet16f plset(const float& a) {
1032
+ return plset_hvx<HVXPacketSize::Half>(a);
1033
+ }
1034
+ template <>
1035
+ EIGEN_STRONG_INLINE Packet8f plset(const float& a) {
1036
+ return plset_hvx<HVXPacketSize::Quarter>(a);
1037
+ }
1038
+
1039
// Writes the lanes of `from` to memory at a fixed element stride: lane i goes to
// to[i * stride]. The packet is first stored into a local array aligned to
// __HVX_LENGTH__ bytes, and the elements are then copied out one at a time.
+ template <HVXPacketSize T>
1040
+ EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) {
1041
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
1042
+ float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
1043
+ pstore<float>(elements, from);
1044
+ for (Index i = 0; i < packet_size; ++i) {
1045
+ to[i * stride] = elements[i];
1046
+ }
1047
+ }
1048
// pscatter specializations for Packet32f, Packet16f and Packet8f; each forwards to pscatter_hvx.
+ template <>
1049
+ EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) {
1050
+ pscatter_hvx(to, from, stride);
1051
+ }
1052
+ template <>
1053
+ EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
1054
+ pscatter_hvx(to, from, stride);
1055
+ }
1056
+ template <>
1057
+ EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
1058
+ pscatter_hvx(to, from, stride);
1059
+ }
1060
+
1061
// Builds a packet from floats spaced `stride` elements apart: lane i is read from
// from[i * stride]. The values are collected into a local array aligned to
// __HVX_LENGTH__ bytes, which is then loaded as a packet with pload.
+ template <HVXPacketSize T>
1062
+ EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) {
1063
+ const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
1064
+ float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
1065
+ for (Index i = 0; i < packet_size; i++) {
1066
+ elements[i] = from[i * stride];
1067
+ }
1068
+ return pload<HVXPacket<T>>(elements);
1069
+ }
1070
// pgather specializations for Packet32f, Packet16f and Packet8f.
// Each forwards to pgather_hvx with the Full, Half or Quarter packet-size tag respectively.
+ template <>
1071
+ EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) {
1072
+ return pgather_hvx<HVXPacketSize::Full>(from, stride);
1073
+ }
1074
+ template <>
1075
+ EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
1076
+ return pgather_hvx<HVXPacketSize::Half>(from, stride);
1077
+ }
1078
+ template <>
1079
+ EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
1080
+ return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
1081
+ }
1082
+
1083
+ } // end namespace internal
1084
+ } // end namespace Eigen
1085
+
1086
+ #endif // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
1087
+
1088
+ #endif // EIGEN_HVX_PACKET_MATH_H