umappp 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (395) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +25 -0
  3. data/README.md +110 -0
  4. data/ext/umappp/extconf.rb +25 -0
  5. data/ext/umappp/numo.hpp +867 -0
  6. data/ext/umappp/umappp.cpp +225 -0
  7. data/lib/umappp/version.rb +5 -0
  8. data/lib/umappp.rb +41 -0
  9. data/vendor/Eigen/Cholesky +45 -0
  10. data/vendor/Eigen/CholmodSupport +48 -0
  11. data/vendor/Eigen/Core +384 -0
  12. data/vendor/Eigen/Dense +7 -0
  13. data/vendor/Eigen/Eigen +2 -0
  14. data/vendor/Eigen/Eigenvalues +60 -0
  15. data/vendor/Eigen/Geometry +59 -0
  16. data/vendor/Eigen/Householder +29 -0
  17. data/vendor/Eigen/IterativeLinearSolvers +48 -0
  18. data/vendor/Eigen/Jacobi +32 -0
  19. data/vendor/Eigen/KLUSupport +41 -0
  20. data/vendor/Eigen/LU +47 -0
  21. data/vendor/Eigen/MetisSupport +35 -0
  22. data/vendor/Eigen/OrderingMethods +70 -0
  23. data/vendor/Eigen/PaStiXSupport +49 -0
  24. data/vendor/Eigen/PardisoSupport +35 -0
  25. data/vendor/Eigen/QR +50 -0
  26. data/vendor/Eigen/QtAlignedMalloc +39 -0
  27. data/vendor/Eigen/SPQRSupport +34 -0
  28. data/vendor/Eigen/SVD +50 -0
  29. data/vendor/Eigen/Sparse +34 -0
  30. data/vendor/Eigen/SparseCholesky +37 -0
  31. data/vendor/Eigen/SparseCore +69 -0
  32. data/vendor/Eigen/SparseLU +50 -0
  33. data/vendor/Eigen/SparseQR +36 -0
  34. data/vendor/Eigen/StdDeque +27 -0
  35. data/vendor/Eigen/StdList +26 -0
  36. data/vendor/Eigen/StdVector +27 -0
  37. data/vendor/Eigen/SuperLUSupport +64 -0
  38. data/vendor/Eigen/UmfPackSupport +40 -0
  39. data/vendor/Eigen/src/Cholesky/LDLT.h +688 -0
  40. data/vendor/Eigen/src/Cholesky/LLT.h +558 -0
  41. data/vendor/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
  42. data/vendor/Eigen/src/CholmodSupport/CholmodSupport.h +682 -0
  43. data/vendor/Eigen/src/Core/ArithmeticSequence.h +413 -0
  44. data/vendor/Eigen/src/Core/Array.h +417 -0
  45. data/vendor/Eigen/src/Core/ArrayBase.h +226 -0
  46. data/vendor/Eigen/src/Core/ArrayWrapper.h +209 -0
  47. data/vendor/Eigen/src/Core/Assign.h +90 -0
  48. data/vendor/Eigen/src/Core/AssignEvaluator.h +1010 -0
  49. data/vendor/Eigen/src/Core/Assign_MKL.h +178 -0
  50. data/vendor/Eigen/src/Core/BandMatrix.h +353 -0
  51. data/vendor/Eigen/src/Core/Block.h +448 -0
  52. data/vendor/Eigen/src/Core/BooleanRedux.h +162 -0
  53. data/vendor/Eigen/src/Core/CommaInitializer.h +164 -0
  54. data/vendor/Eigen/src/Core/ConditionEstimator.h +175 -0
  55. data/vendor/Eigen/src/Core/CoreEvaluators.h +1741 -0
  56. data/vendor/Eigen/src/Core/CoreIterators.h +132 -0
  57. data/vendor/Eigen/src/Core/CwiseBinaryOp.h +183 -0
  58. data/vendor/Eigen/src/Core/CwiseNullaryOp.h +1001 -0
  59. data/vendor/Eigen/src/Core/CwiseTernaryOp.h +197 -0
  60. data/vendor/Eigen/src/Core/CwiseUnaryOp.h +103 -0
  61. data/vendor/Eigen/src/Core/CwiseUnaryView.h +132 -0
  62. data/vendor/Eigen/src/Core/DenseBase.h +701 -0
  63. data/vendor/Eigen/src/Core/DenseCoeffsBase.h +685 -0
  64. data/vendor/Eigen/src/Core/DenseStorage.h +652 -0
  65. data/vendor/Eigen/src/Core/Diagonal.h +258 -0
  66. data/vendor/Eigen/src/Core/DiagonalMatrix.h +391 -0
  67. data/vendor/Eigen/src/Core/DiagonalProduct.h +28 -0
  68. data/vendor/Eigen/src/Core/Dot.h +318 -0
  69. data/vendor/Eigen/src/Core/EigenBase.h +160 -0
  70. data/vendor/Eigen/src/Core/ForceAlignedAccess.h +150 -0
  71. data/vendor/Eigen/src/Core/Fuzzy.h +155 -0
  72. data/vendor/Eigen/src/Core/GeneralProduct.h +465 -0
  73. data/vendor/Eigen/src/Core/GenericPacketMath.h +1040 -0
  74. data/vendor/Eigen/src/Core/GlobalFunctions.h +194 -0
  75. data/vendor/Eigen/src/Core/IO.h +258 -0
  76. data/vendor/Eigen/src/Core/IndexedView.h +237 -0
  77. data/vendor/Eigen/src/Core/Inverse.h +117 -0
  78. data/vendor/Eigen/src/Core/Map.h +171 -0
  79. data/vendor/Eigen/src/Core/MapBase.h +310 -0
  80. data/vendor/Eigen/src/Core/MathFunctions.h +2057 -0
  81. data/vendor/Eigen/src/Core/MathFunctionsImpl.h +200 -0
  82. data/vendor/Eigen/src/Core/Matrix.h +565 -0
  83. data/vendor/Eigen/src/Core/MatrixBase.h +547 -0
  84. data/vendor/Eigen/src/Core/NestByValue.h +85 -0
  85. data/vendor/Eigen/src/Core/NoAlias.h +109 -0
  86. data/vendor/Eigen/src/Core/NumTraits.h +335 -0
  87. data/vendor/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  88. data/vendor/Eigen/src/Core/PermutationMatrix.h +605 -0
  89. data/vendor/Eigen/src/Core/PlainObjectBase.h +1128 -0
  90. data/vendor/Eigen/src/Core/Product.h +191 -0
  91. data/vendor/Eigen/src/Core/ProductEvaluators.h +1179 -0
  92. data/vendor/Eigen/src/Core/Random.h +218 -0
  93. data/vendor/Eigen/src/Core/Redux.h +515 -0
  94. data/vendor/Eigen/src/Core/Ref.h +381 -0
  95. data/vendor/Eigen/src/Core/Replicate.h +142 -0
  96. data/vendor/Eigen/src/Core/Reshaped.h +454 -0
  97. data/vendor/Eigen/src/Core/ReturnByValue.h +119 -0
  98. data/vendor/Eigen/src/Core/Reverse.h +217 -0
  99. data/vendor/Eigen/src/Core/Select.h +164 -0
  100. data/vendor/Eigen/src/Core/SelfAdjointView.h +365 -0
  101. data/vendor/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
  102. data/vendor/Eigen/src/Core/Solve.h +188 -0
  103. data/vendor/Eigen/src/Core/SolveTriangular.h +235 -0
  104. data/vendor/Eigen/src/Core/SolverBase.h +168 -0
  105. data/vendor/Eigen/src/Core/StableNorm.h +251 -0
  106. data/vendor/Eigen/src/Core/StlIterators.h +463 -0
  107. data/vendor/Eigen/src/Core/Stride.h +116 -0
  108. data/vendor/Eigen/src/Core/Swap.h +68 -0
  109. data/vendor/Eigen/src/Core/Transpose.h +464 -0
  110. data/vendor/Eigen/src/Core/Transpositions.h +386 -0
  111. data/vendor/Eigen/src/Core/TriangularMatrix.h +1001 -0
  112. data/vendor/Eigen/src/Core/VectorBlock.h +96 -0
  113. data/vendor/Eigen/src/Core/VectorwiseOp.h +784 -0
  114. data/vendor/Eigen/src/Core/Visitor.h +381 -0
  115. data/vendor/Eigen/src/Core/arch/AVX/Complex.h +372 -0
  116. data/vendor/Eigen/src/Core/arch/AVX/MathFunctions.h +228 -0
  117. data/vendor/Eigen/src/Core/arch/AVX/PacketMath.h +1574 -0
  118. data/vendor/Eigen/src/Core/arch/AVX/TypeCasting.h +115 -0
  119. data/vendor/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  120. data/vendor/Eigen/src/Core/arch/AVX512/MathFunctions.h +362 -0
  121. data/vendor/Eigen/src/Core/arch/AVX512/PacketMath.h +2303 -0
  122. data/vendor/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  123. data/vendor/Eigen/src/Core/arch/AltiVec/Complex.h +417 -0
  124. data/vendor/Eigen/src/Core/arch/AltiVec/MathFunctions.h +90 -0
  125. data/vendor/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  126. data/vendor/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  127. data/vendor/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  128. data/vendor/Eigen/src/Core/arch/AltiVec/PacketMath.h +2711 -0
  129. data/vendor/Eigen/src/Core/arch/CUDA/Complex.h +258 -0
  130. data/vendor/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  131. data/vendor/Eigen/src/Core/arch/Default/ConjHelper.h +117 -0
  132. data/vendor/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  133. data/vendor/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  134. data/vendor/Eigen/src/Core/arch/Default/Half.h +942 -0
  135. data/vendor/Eigen/src/Core/arch/Default/Settings.h +49 -0
  136. data/vendor/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  137. data/vendor/Eigen/src/Core/arch/GPU/MathFunctions.h +103 -0
  138. data/vendor/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  139. data/vendor/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  140. data/vendor/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  141. data/vendor/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  142. data/vendor/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  143. data/vendor/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  144. data/vendor/Eigen/src/Core/arch/NEON/Complex.h +584 -0
  145. data/vendor/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  146. data/vendor/Eigen/src/Core/arch/NEON/MathFunctions.h +75 -0
  147. data/vendor/Eigen/src/Core/arch/NEON/PacketMath.h +4587 -0
  148. data/vendor/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  149. data/vendor/Eigen/src/Core/arch/SSE/Complex.h +351 -0
  150. data/vendor/Eigen/src/Core/arch/SSE/MathFunctions.h +199 -0
  151. data/vendor/Eigen/src/Core/arch/SSE/PacketMath.h +1505 -0
  152. data/vendor/Eigen/src/Core/arch/SSE/TypeCasting.h +142 -0
  153. data/vendor/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  154. data/vendor/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  155. data/vendor/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  156. data/vendor/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  157. data/vendor/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  158. data/vendor/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  159. data/vendor/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  160. data/vendor/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  161. data/vendor/Eigen/src/Core/arch/ZVector/Complex.h +426 -0
  162. data/vendor/Eigen/src/Core/arch/ZVector/MathFunctions.h +233 -0
  163. data/vendor/Eigen/src/Core/arch/ZVector/PacketMath.h +1060 -0
  164. data/vendor/Eigen/src/Core/functors/AssignmentFunctors.h +177 -0
  165. data/vendor/Eigen/src/Core/functors/BinaryFunctors.h +541 -0
  166. data/vendor/Eigen/src/Core/functors/NullaryFunctors.h +189 -0
  167. data/vendor/Eigen/src/Core/functors/StlFunctors.h +166 -0
  168. data/vendor/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
  169. data/vendor/Eigen/src/Core/functors/UnaryFunctors.h +1131 -0
  170. data/vendor/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2645 -0
  171. data/vendor/Eigen/src/Core/products/GeneralMatrixMatrix.h +517 -0
  172. data/vendor/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +317 -0
  173. data/vendor/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
  174. data/vendor/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +124 -0
  175. data/vendor/Eigen/src/Core/products/GeneralMatrixVector.h +518 -0
  176. data/vendor/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
  177. data/vendor/Eigen/src/Core/products/Parallelizer.h +180 -0
  178. data/vendor/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +544 -0
  179. data/vendor/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +295 -0
  180. data/vendor/Eigen/src/Core/products/SelfadjointMatrixVector.h +262 -0
  181. data/vendor/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
  182. data/vendor/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
  183. data/vendor/Eigen/src/Core/products/SelfadjointRank2Update.h +94 -0
  184. data/vendor/Eigen/src/Core/products/TriangularMatrixMatrix.h +472 -0
  185. data/vendor/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +317 -0
  186. data/vendor/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
  187. data/vendor/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
  188. data/vendor/Eigen/src/Core/products/TriangularSolverMatrix.h +337 -0
  189. data/vendor/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +167 -0
  190. data/vendor/Eigen/src/Core/products/TriangularSolverVector.h +148 -0
  191. data/vendor/Eigen/src/Core/util/BlasUtil.h +583 -0
  192. data/vendor/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  193. data/vendor/Eigen/src/Core/util/Constants.h +563 -0
  194. data/vendor/Eigen/src/Core/util/DisableStupidWarnings.h +106 -0
  195. data/vendor/Eigen/src/Core/util/ForwardDeclarations.h +322 -0
  196. data/vendor/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  197. data/vendor/Eigen/src/Core/util/IntegralConstant.h +272 -0
  198. data/vendor/Eigen/src/Core/util/MKL_support.h +137 -0
  199. data/vendor/Eigen/src/Core/util/Macros.h +1464 -0
  200. data/vendor/Eigen/src/Core/util/Memory.h +1163 -0
  201. data/vendor/Eigen/src/Core/util/Meta.h +812 -0
  202. data/vendor/Eigen/src/Core/util/NonMPL2.h +3 -0
  203. data/vendor/Eigen/src/Core/util/ReenableStupidWarnings.h +31 -0
  204. data/vendor/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  205. data/vendor/Eigen/src/Core/util/StaticAssert.h +221 -0
  206. data/vendor/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  207. data/vendor/Eigen/src/Core/util/XprHelper.h +856 -0
  208. data/vendor/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
  209. data/vendor/Eigen/src/Eigenvalues/ComplexSchur.h +462 -0
  210. data/vendor/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
  211. data/vendor/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
  212. data/vendor/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
  213. data/vendor/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
  214. data/vendor/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
  215. data/vendor/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
  216. data/vendor/Eigen/src/Eigenvalues/RealQZ.h +657 -0
  217. data/vendor/Eigen/src/Eigenvalues/RealSchur.h +558 -0
  218. data/vendor/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
  219. data/vendor/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +904 -0
  220. data/vendor/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
  221. data/vendor/Eigen/src/Eigenvalues/Tridiagonalization.h +561 -0
  222. data/vendor/Eigen/src/Geometry/AlignedBox.h +486 -0
  223. data/vendor/Eigen/src/Geometry/AngleAxis.h +247 -0
  224. data/vendor/Eigen/src/Geometry/EulerAngles.h +114 -0
  225. data/vendor/Eigen/src/Geometry/Homogeneous.h +501 -0
  226. data/vendor/Eigen/src/Geometry/Hyperplane.h +282 -0
  227. data/vendor/Eigen/src/Geometry/OrthoMethods.h +235 -0
  228. data/vendor/Eigen/src/Geometry/ParametrizedLine.h +232 -0
  229. data/vendor/Eigen/src/Geometry/Quaternion.h +870 -0
  230. data/vendor/Eigen/src/Geometry/Rotation2D.h +199 -0
  231. data/vendor/Eigen/src/Geometry/RotationBase.h +206 -0
  232. data/vendor/Eigen/src/Geometry/Scaling.h +188 -0
  233. data/vendor/Eigen/src/Geometry/Transform.h +1563 -0
  234. data/vendor/Eigen/src/Geometry/Translation.h +202 -0
  235. data/vendor/Eigen/src/Geometry/Umeyama.h +166 -0
  236. data/vendor/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  237. data/vendor/Eigen/src/Householder/BlockHouseholder.h +110 -0
  238. data/vendor/Eigen/src/Householder/Householder.h +176 -0
  239. data/vendor/Eigen/src/Householder/HouseholderSequence.h +545 -0
  240. data/vendor/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
  241. data/vendor/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +212 -0
  242. data/vendor/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +229 -0
  243. data/vendor/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +394 -0
  244. data/vendor/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +453 -0
  245. data/vendor/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +444 -0
  246. data/vendor/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +198 -0
  247. data/vendor/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +117 -0
  248. data/vendor/Eigen/src/Jacobi/Jacobi.h +483 -0
  249. data/vendor/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  250. data/vendor/Eigen/src/LU/Determinant.h +117 -0
  251. data/vendor/Eigen/src/LU/FullPivLU.h +877 -0
  252. data/vendor/Eigen/src/LU/InverseImpl.h +432 -0
  253. data/vendor/Eigen/src/LU/PartialPivLU.h +624 -0
  254. data/vendor/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
  255. data/vendor/Eigen/src/LU/arch/InverseSize4.h +351 -0
  256. data/vendor/Eigen/src/MetisSupport/MetisSupport.h +137 -0
  257. data/vendor/Eigen/src/OrderingMethods/Amd.h +435 -0
  258. data/vendor/Eigen/src/OrderingMethods/Eigen_Colamd.h +1863 -0
  259. data/vendor/Eigen/src/OrderingMethods/Ordering.h +153 -0
  260. data/vendor/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
  261. data/vendor/Eigen/src/PardisoSupport/PardisoSupport.h +545 -0
  262. data/vendor/Eigen/src/QR/ColPivHouseholderQR.h +674 -0
  263. data/vendor/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
  264. data/vendor/Eigen/src/QR/CompleteOrthogonalDecomposition.h +635 -0
  265. data/vendor/Eigen/src/QR/FullPivHouseholderQR.h +713 -0
  266. data/vendor/Eigen/src/QR/HouseholderQR.h +434 -0
  267. data/vendor/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
  268. data/vendor/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +335 -0
  269. data/vendor/Eigen/src/SVD/BDCSVD.h +1366 -0
  270. data/vendor/Eigen/src/SVD/JacobiSVD.h +812 -0
  271. data/vendor/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
  272. data/vendor/Eigen/src/SVD/SVDBase.h +376 -0
  273. data/vendor/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
  274. data/vendor/Eigen/src/SparseCholesky/SimplicialCholesky.h +697 -0
  275. data/vendor/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +174 -0
  276. data/vendor/Eigen/src/SparseCore/AmbiVector.h +378 -0
  277. data/vendor/Eigen/src/SparseCore/CompressedStorage.h +274 -0
  278. data/vendor/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
  279. data/vendor/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
  280. data/vendor/Eigen/src/SparseCore/SparseAssign.h +270 -0
  281. data/vendor/Eigen/src/SparseCore/SparseBlock.h +571 -0
  282. data/vendor/Eigen/src/SparseCore/SparseColEtree.h +206 -0
  283. data/vendor/Eigen/src/SparseCore/SparseCompressedBase.h +370 -0
  284. data/vendor/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +722 -0
  285. data/vendor/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +150 -0
  286. data/vendor/Eigen/src/SparseCore/SparseDenseProduct.h +342 -0
  287. data/vendor/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
  288. data/vendor/Eigen/src/SparseCore/SparseDot.h +98 -0
  289. data/vendor/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
  290. data/vendor/Eigen/src/SparseCore/SparseMap.h +305 -0
  291. data/vendor/Eigen/src/SparseCore/SparseMatrix.h +1518 -0
  292. data/vendor/Eigen/src/SparseCore/SparseMatrixBase.h +398 -0
  293. data/vendor/Eigen/src/SparseCore/SparsePermutation.h +178 -0
  294. data/vendor/Eigen/src/SparseCore/SparseProduct.h +181 -0
  295. data/vendor/Eigen/src/SparseCore/SparseRedux.h +49 -0
  296. data/vendor/Eigen/src/SparseCore/SparseRef.h +397 -0
  297. data/vendor/Eigen/src/SparseCore/SparseSelfAdjointView.h +659 -0
  298. data/vendor/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
  299. data/vendor/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
  300. data/vendor/Eigen/src/SparseCore/SparseTranspose.h +92 -0
  301. data/vendor/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
  302. data/vendor/Eigen/src/SparseCore/SparseUtil.h +186 -0
  303. data/vendor/Eigen/src/SparseCore/SparseVector.h +478 -0
  304. data/vendor/Eigen/src/SparseCore/SparseView.h +254 -0
  305. data/vendor/Eigen/src/SparseCore/TriangularSolver.h +315 -0
  306. data/vendor/Eigen/src/SparseLU/SparseLU.h +923 -0
  307. data/vendor/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
  308. data/vendor/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
  309. data/vendor/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
  310. data/vendor/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +375 -0
  311. data/vendor/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
  312. data/vendor/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
  313. data/vendor/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
  314. data/vendor/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
  315. data/vendor/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
  316. data/vendor/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
  317. data/vendor/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
  318. data/vendor/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
  319. data/vendor/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
  320. data/vendor/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
  321. data/vendor/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
  322. data/vendor/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
  323. data/vendor/Eigen/src/SparseQR/SparseQR.h +758 -0
  324. data/vendor/Eigen/src/StlSupport/StdDeque.h +116 -0
  325. data/vendor/Eigen/src/StlSupport/StdList.h +106 -0
  326. data/vendor/Eigen/src/StlSupport/StdVector.h +131 -0
  327. data/vendor/Eigen/src/StlSupport/details.h +84 -0
  328. data/vendor/Eigen/src/SuperLUSupport/SuperLUSupport.h +1025 -0
  329. data/vendor/Eigen/src/UmfPackSupport/UmfPackSupport.h +642 -0
  330. data/vendor/Eigen/src/misc/Image.h +82 -0
  331. data/vendor/Eigen/src/misc/Kernel.h +79 -0
  332. data/vendor/Eigen/src/misc/RealSvd2x2.h +55 -0
  333. data/vendor/Eigen/src/misc/blas.h +440 -0
  334. data/vendor/Eigen/src/misc/lapack.h +152 -0
  335. data/vendor/Eigen/src/misc/lapacke.h +16292 -0
  336. data/vendor/Eigen/src/misc/lapacke_mangling.h +17 -0
  337. data/vendor/Eigen/src/plugins/ArrayCwiseBinaryOps.h +358 -0
  338. data/vendor/Eigen/src/plugins/ArrayCwiseUnaryOps.h +696 -0
  339. data/vendor/Eigen/src/plugins/BlockMethods.h +1442 -0
  340. data/vendor/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
  341. data/vendor/Eigen/src/plugins/CommonCwiseUnaryOps.h +177 -0
  342. data/vendor/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  343. data/vendor/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
  344. data/vendor/Eigen/src/plugins/MatrixCwiseUnaryOps.h +95 -0
  345. data/vendor/Eigen/src/plugins/ReshapedMethods.h +149 -0
  346. data/vendor/aarand/aarand.hpp +114 -0
  347. data/vendor/annoy/annoylib.h +1495 -0
  348. data/vendor/annoy/kissrandom.h +120 -0
  349. data/vendor/annoy/mman.h +242 -0
  350. data/vendor/hnswlib/bruteforce.h +152 -0
  351. data/vendor/hnswlib/hnswalg.h +1192 -0
  352. data/vendor/hnswlib/hnswlib.h +108 -0
  353. data/vendor/hnswlib/space_ip.h +282 -0
  354. data/vendor/hnswlib/space_l2.h +281 -0
  355. data/vendor/hnswlib/visited_list_pool.h +79 -0
  356. data/vendor/irlba/irlba.hpp +575 -0
  357. data/vendor/irlba/lanczos.hpp +212 -0
  358. data/vendor/irlba/parallel.hpp +474 -0
  359. data/vendor/irlba/utils.hpp +224 -0
  360. data/vendor/irlba/wrappers.hpp +228 -0
  361. data/vendor/kmeans/Base.hpp +75 -0
  362. data/vendor/kmeans/Details.hpp +79 -0
  363. data/vendor/kmeans/HartiganWong.hpp +492 -0
  364. data/vendor/kmeans/InitializeKmeansPP.hpp +144 -0
  365. data/vendor/kmeans/InitializeNone.hpp +44 -0
  366. data/vendor/kmeans/InitializePCAPartition.hpp +309 -0
  367. data/vendor/kmeans/InitializeRandom.hpp +91 -0
  368. data/vendor/kmeans/Kmeans.hpp +161 -0
  369. data/vendor/kmeans/Lloyd.hpp +134 -0
  370. data/vendor/kmeans/MiniBatch.hpp +269 -0
  371. data/vendor/kmeans/QuickSearch.hpp +179 -0
  372. data/vendor/kmeans/compute_centroids.hpp +32 -0
  373. data/vendor/kmeans/compute_wcss.hpp +27 -0
  374. data/vendor/kmeans/is_edge_case.hpp +42 -0
  375. data/vendor/kmeans/random.hpp +55 -0
  376. data/vendor/knncolle/Annoy/Annoy.hpp +193 -0
  377. data/vendor/knncolle/BruteForce/BruteForce.hpp +120 -0
  378. data/vendor/knncolle/Hnsw/Hnsw.hpp +225 -0
  379. data/vendor/knncolle/Kmknn/Kmknn.hpp +286 -0
  380. data/vendor/knncolle/VpTree/VpTree.hpp +256 -0
  381. data/vendor/knncolle/knncolle.hpp +34 -0
  382. data/vendor/knncolle/utils/Base.hpp +100 -0
  383. data/vendor/knncolle/utils/NeighborQueue.hpp +94 -0
  384. data/vendor/knncolle/utils/distances.hpp +98 -0
  385. data/vendor/knncolle/utils/find_nearest_neighbors.hpp +112 -0
  386. data/vendor/powerit/PowerIterations.hpp +157 -0
  387. data/vendor/umappp/NeighborList.hpp +37 -0
  388. data/vendor/umappp/Umap.hpp +662 -0
  389. data/vendor/umappp/combine_neighbor_sets.hpp +95 -0
  390. data/vendor/umappp/find_ab.hpp +157 -0
  391. data/vendor/umappp/neighbor_similarities.hpp +136 -0
  392. data/vendor/umappp/optimize_layout.hpp +285 -0
  393. data/vendor/umappp/spectral_init.hpp +181 -0
  394. data/vendor/umappp/umappp.hpp +13 -0
  395. metadata +465 -0
@@ -0,0 +1,4587 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5
+ // Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
6
+ // Heavily based on Gael's SSE version.
7
+ //
8
+ // This Source Code Form is subject to the terms of the Mozilla
9
+ // Public License v. 2.0. If a copy of the MPL was not distributed
10
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11
+
12
+ #ifndef EIGEN_PACKET_MATH_NEON_H
13
+ #define EIGEN_PACKET_MATH_NEON_H
14
+
15
+ namespace Eigen {
16
+
17
+ namespace internal {
18
+
19
// Tuning knobs for the NEON backend; each may be pre-set by the build, so
// only define a default when the user has not provided one.
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

// NEON provides fused multiply-add as a single instruction.
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

// AArch64 exposes 32 SIMD registers; 32-bit ARM only 16.
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#if EIGEN_ARCH_ARM64
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#else
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
#endif
34
+
35
+ #if EIGEN_COMP_MSVC_STRICT
36
+
37
+ // In MSVC's arm_neon.h header file, all NEON vector types
38
+ // are aliases to the same underlying type __n128.
39
+ // We thus have to wrap them to make them different C++ types.
40
+ // (See also bug 1428)
41
+ typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
42
+ typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
43
+ typedef eigen_packet_wrapper<int32_t ,2> Packet4c;
44
+ typedef eigen_packet_wrapper<int8x8_t ,3> Packet8c;
45
+ typedef eigen_packet_wrapper<int8x16_t ,4> Packet16c;
46
+ typedef eigen_packet_wrapper<uint32_t ,5> Packet4uc;
47
+ typedef eigen_packet_wrapper<uint8x8_t ,6> Packet8uc;
48
+ typedef eigen_packet_wrapper<uint8x16_t ,7> Packet16uc;
49
+ typedef eigen_packet_wrapper<int16x4_t ,8> Packet4s;
50
+ typedef eigen_packet_wrapper<int16x8_t ,9> Packet8s;
51
+ typedef eigen_packet_wrapper<uint16x4_t ,10> Packet4us;
52
+ typedef eigen_packet_wrapper<uint16x8_t ,11> Packet8us;
53
+ typedef eigen_packet_wrapper<int32x2_t ,12> Packet2i;
54
+ typedef eigen_packet_wrapper<int32x4_t ,13> Packet4i;
55
+ typedef eigen_packet_wrapper<uint32x2_t ,14> Packet2ui;
56
+ typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
57
+ typedef eigen_packet_wrapper<int64x2_t ,16> Packet2l;
58
+ typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
59
+
60
+ #else
61
+
62
+ typedef float32x2_t Packet2f;
63
+ typedef float32x4_t Packet4f;
64
+ typedef eigen_packet_wrapper<int32_t ,2> Packet4c;
65
+ typedef int8x8_t Packet8c;
66
+ typedef int8x16_t Packet16c;
67
+ typedef eigen_packet_wrapper<uint32_t ,5> Packet4uc;
68
+ typedef uint8x8_t Packet8uc;
69
+ typedef uint8x16_t Packet16uc;
70
+ typedef int16x4_t Packet4s;
71
+ typedef int16x8_t Packet8s;
72
+ typedef uint16x4_t Packet4us;
73
+ typedef uint16x8_t Packet8us;
74
+ typedef int32x2_t Packet2i;
75
+ typedef int32x4_t Packet4i;
76
+ typedef uint32x2_t Packet2ui;
77
+ typedef uint32x4_t Packet4ui;
78
+ typedef int64x2_t Packet2l;
79
+ typedef uint64x2_t Packet2ul;
80
+
81
+ #endif // EIGEN_COMP_MSVC_STRICT
82
+
83
+ EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
84
+ const float* a = reinterpret_cast<const float*>(&m);
85
+ Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))};
86
+ return res;
87
+ }
88
+
89
+ // fuctionally equivalent to _mm_shuffle_ps in SSE when interleave
90
+ // == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
91
+ // interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
92
+ // to enable a shared implementation for fast inversion of matrices of size 4.
93
+ template<bool interleave>
94
+ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask)
95
+ {
96
+ const float* a = reinterpret_cast<const float*>(&m);
97
+ const float* b = reinterpret_cast<const float*>(&n);
98
+ Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
99
+ return res;
100
+ }
101
+
102
+ template<>
103
+ EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n, int mask)
104
+ {
105
+ const float* a = reinterpret_cast<const float*>(&m);
106
+ const float* b = reinterpret_cast<const float*>(&n);
107
+ Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
108
+ return res;
109
+ }
110
+
111
+ EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
112
+
113
+ EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s)
114
+ {
115
+ return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
116
+ }
117
+ EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
118
+ {
119
+ return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
120
+ }
121
+ EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
122
+ {
123
+ return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
124
+ }
125
+ EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
126
+ {
127
+ return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
128
+ }
129
+ EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
130
+ {
131
+ return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
132
+ }
133
+ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
134
+ {
135
+ return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
136
+ }
137
// Broadcast lane `p` of the low half of `a` across all four lanes.
#define vec4f_duplane(a, p) \
  vdupq_lane_f32(vget_low_f32(a), p)

// Declare a named float packet constant with all lanes set to X.
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

// Same, but X is an integer bit pattern reinterpreted as float lanes
// (used for sign masks and similar bit-level constants).
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))

// Declare a named int32 packet constant with all lanes set to X.
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
148
+
149
+ #if EIGEN_ARCH_ARM64
150
+ // __builtin_prefetch tends to do nothing on ARM64 compilers because the
151
+ // prefetch instructions there are too detailed for __builtin_prefetch to map
152
+ // meaningfully to them.
153
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
154
+ #elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
155
+ #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
156
+ #elif defined __pld
157
+ #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
158
+ #elif EIGEN_ARCH_ARM32
159
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
160
+ #else
161
+ // by default no explicit prefetching
162
+ #define EIGEN_ARM_PREFETCH(ADDR)
163
+ #endif
164
+
165
+ template <>
166
+ struct packet_traits<float> : default_packet_traits
167
+ {
168
+ typedef Packet4f type;
169
+ typedef Packet2f half;
170
+ enum
171
+ {
172
+ Vectorizable = 1,
173
+ AlignedOnScalar = 1,
174
+ size = 4,
175
+ HasHalfPacket = 1,
176
+
177
+ HasAdd = 1,
178
+ HasSub = 1,
179
+ HasShift = 1,
180
+ HasMul = 1,
181
+ HasNegate = 1,
182
+ HasAbs = 1,
183
+ HasArg = 0,
184
+ HasAbs2 = 1,
185
+ HasAbsDiff = 1,
186
+ HasMin = 1,
187
+ HasMax = 1,
188
+ HasConj = 1,
189
+ HasSetLinear = 0,
190
+ HasBlend = 0,
191
+
192
+ HasDiv = 1,
193
+ HasFloor = 1,
194
+ HasCeil = 1,
195
+ HasRint = 1,
196
+
197
+ HasSin = EIGEN_FAST_MATH,
198
+ HasCos = EIGEN_FAST_MATH,
199
+ HasLog = 1,
200
+ HasExp = 1,
201
+ HasSqrt = 1,
202
+ HasRsqrt = 1,
203
+ HasTanh = EIGEN_FAST_MATH,
204
+ HasErf = EIGEN_FAST_MATH,
205
+ HasBessel = 0, // Issues with accuracy.
206
+ HasNdtri = 0
207
+ };
208
+ };
209
+
210
+ template <>
211
+ struct packet_traits<int8_t> : default_packet_traits
212
+ {
213
+ typedef Packet16c type;
214
+ typedef Packet8c half;
215
+ enum
216
+ {
217
+ Vectorizable = 1,
218
+ AlignedOnScalar = 1,
219
+ size = 16,
220
+ HasHalfPacket = 1,
221
+
222
+ HasAdd = 1,
223
+ HasSub = 1,
224
+ HasShift = 1,
225
+ HasMul = 1,
226
+ HasNegate = 1,
227
+ HasAbs = 1,
228
+ HasAbsDiff = 1,
229
+ HasArg = 0,
230
+ HasAbs2 = 1,
231
+ HasMin = 1,
232
+ HasMax = 1,
233
+ HasConj = 1,
234
+ HasSetLinear = 0,
235
+ HasBlend = 0
236
+ };
237
+ };
238
+
239
+ template <>
240
+ struct packet_traits<uint8_t> : default_packet_traits
241
+ {
242
+ typedef Packet16uc type;
243
+ typedef Packet8uc half;
244
+ enum
245
+ {
246
+ Vectorizable = 1,
247
+ AlignedOnScalar = 1,
248
+ size = 16,
249
+ HasHalfPacket = 1,
250
+
251
+ HasAdd = 1,
252
+ HasSub = 1,
253
+ HasShift = 1,
254
+ HasMul = 1,
255
+ HasNegate = 0,
256
+ HasAbs = 1,
257
+ HasAbsDiff = 1,
258
+ HasArg = 0,
259
+ HasAbs2 = 1,
260
+ HasMin = 1,
261
+ HasMax = 1,
262
+ HasConj = 1,
263
+ HasSetLinear = 0,
264
+ HasBlend = 0,
265
+
266
+ HasSqrt = 1
267
+ };
268
+ };
269
+
270
+ template <>
271
+ struct packet_traits<int16_t> : default_packet_traits
272
+ {
273
+ typedef Packet8s type;
274
+ typedef Packet4s half;
275
+ enum
276
+ {
277
+ Vectorizable = 1,
278
+ AlignedOnScalar = 1,
279
+ size = 8,
280
+ HasHalfPacket = 1,
281
+
282
+ HasAdd = 1,
283
+ HasSub = 1,
284
+ HasShift = 1,
285
+ HasMul = 1,
286
+ HasNegate = 1,
287
+ HasAbs = 1,
288
+ HasAbsDiff = 1,
289
+ HasArg = 0,
290
+ HasAbs2 = 1,
291
+ HasMin = 1,
292
+ HasMax = 1,
293
+ HasConj = 1,
294
+ HasSetLinear = 0,
295
+ HasBlend = 0
296
+ };
297
+ };
298
+
299
+ template <>
300
+ struct packet_traits<uint16_t> : default_packet_traits
301
+ {
302
+ typedef Packet8us type;
303
+ typedef Packet4us half;
304
+ enum
305
+ {
306
+ Vectorizable = 1,
307
+ AlignedOnScalar = 1,
308
+ size = 8,
309
+ HasHalfPacket = 1,
310
+
311
+ HasAdd = 1,
312
+ HasSub = 1,
313
+ HasShift = 1,
314
+ HasMul = 1,
315
+ HasNegate = 0,
316
+ HasAbs = 0,
317
+ HasAbsDiff = 1,
318
+ HasArg = 0,
319
+ HasAbs2 = 1,
320
+ HasMin = 1,
321
+ HasMax = 1,
322
+ HasConj = 1,
323
+ HasSetLinear = 0,
324
+ HasBlend = 0,
325
+ HasSqrt = 1
326
+ };
327
+ };
328
+
329
+ template <>
330
+ struct packet_traits<int32_t> : default_packet_traits
331
+ {
332
+ typedef Packet4i type;
333
+ typedef Packet2i half;
334
+ enum
335
+ {
336
+ Vectorizable = 1,
337
+ AlignedOnScalar = 1,
338
+ size = 4,
339
+ HasHalfPacket = 1,
340
+
341
+ HasAdd = 1,
342
+ HasSub = 1,
343
+ HasShift = 1,
344
+ HasMul = 1,
345
+ HasNegate = 1,
346
+ HasAbs = 1,
347
+ HasArg = 0,
348
+ HasAbs2 = 1,
349
+ HasAbsDiff = 1,
350
+ HasMin = 1,
351
+ HasMax = 1,
352
+ HasConj = 1,
353
+ HasSetLinear = 0,
354
+ HasBlend = 0
355
+ };
356
+ };
357
+
358
+ template <>
359
+ struct packet_traits<uint32_t> : default_packet_traits
360
+ {
361
+ typedef Packet4ui type;
362
+ typedef Packet2ui half;
363
+ enum
364
+ {
365
+ Vectorizable = 1,
366
+ AlignedOnScalar = 1,
367
+ size = 4,
368
+ HasHalfPacket = 1,
369
+
370
+ HasAdd = 1,
371
+ HasSub = 1,
372
+ HasShift = 1,
373
+ HasMul = 1,
374
+ HasNegate = 0,
375
+ HasAbs = 0,
376
+ HasArg = 0,
377
+ HasAbs2 = 1,
378
+ HasAbsDiff = 1,
379
+ HasMin = 1,
380
+ HasMax = 1,
381
+ HasConj = 1,
382
+ HasSetLinear = 0,
383
+ HasBlend = 0,
384
+
385
+ HasSqrt = 1
386
+ };
387
+ };
388
+
389
+ template <>
390
+ struct packet_traits<int64_t> : default_packet_traits
391
+ {
392
+ typedef Packet2l type;
393
+ typedef Packet2l half;
394
+ enum
395
+ {
396
+ Vectorizable = 1,
397
+ AlignedOnScalar = 1,
398
+ size = 2,
399
+ HasHalfPacket = 0,
400
+
401
+ HasCmp = 1,
402
+ HasAdd = 1,
403
+ HasSub = 1,
404
+ HasShift = 1,
405
+ HasMul = 1,
406
+ HasNegate = 1,
407
+ HasAbs = 1,
408
+ HasArg = 0,
409
+ HasAbs2 = 1,
410
+ HasAbsDiff = 1,
411
+ HasMin = 1,
412
+ HasMax = 1,
413
+ HasConj = 1,
414
+ HasSetLinear = 0,
415
+ HasBlend = 0
416
+ };
417
+ };
418
+
419
+ template <>
420
+ struct packet_traits<uint64_t> : default_packet_traits
421
+ {
422
+ typedef Packet2ul type;
423
+ typedef Packet2ul half;
424
+ enum
425
+ {
426
+ Vectorizable = 1,
427
+ AlignedOnScalar = 1,
428
+ size = 2,
429
+ HasHalfPacket = 0,
430
+
431
+ HasCmp = 1,
432
+ HasAdd = 1,
433
+ HasSub = 1,
434
+ HasShift = 1,
435
+ HasMul = 1,
436
+ HasNegate = 0,
437
+ HasAbs = 0,
438
+ HasArg = 0,
439
+ HasAbs2 = 1,
440
+ HasAbsDiff = 1,
441
+ HasMin = 1,
442
+ HasMax = 1,
443
+ HasConj = 1,
444
+ HasSetLinear = 0,
445
+ HasBlend = 0
446
+ };
447
+ };
448
+
449
+ #if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM
450
+ // workaround gcc 4.2, 4.3 and 4.4 compilation issue
451
+ EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
452
+ EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); }
453
+ EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
454
+ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
455
+ EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
456
+ #endif
457
+
458
+ template<> struct unpacket_traits<Packet2f>
459
+ {
460
+ typedef float type;
461
+ typedef Packet2f half;
462
+ typedef Packet2i integer_packet;
463
+ enum
464
+ {
465
+ size = 2,
466
+ alignment = Aligned16,
467
+ vectorizable = true,
468
+ masked_load_available = false,
469
+ masked_store_available = false
470
+ };
471
+ };
472
+ template<> struct unpacket_traits<Packet4f>
473
+ {
474
+ typedef float type;
475
+ typedef Packet2f half;
476
+ typedef Packet4i integer_packet;
477
+ enum
478
+ {
479
+ size = 4,
480
+ alignment = Aligned16,
481
+ vectorizable = true,
482
+ masked_load_available = false,
483
+ masked_store_available = false
484
+ };
485
+ };
486
+ template<> struct unpacket_traits<Packet4c>
487
+ {
488
+ typedef int8_t type;
489
+ typedef Packet4c half;
490
+ enum
491
+ {
492
+ size = 4,
493
+ alignment = Unaligned,
494
+ vectorizable = true,
495
+ masked_load_available = false,
496
+ masked_store_available = false
497
+ };
498
+ };
499
+ template<> struct unpacket_traits<Packet8c>
500
+ {
501
+ typedef int8_t type;
502
+ typedef Packet4c half;
503
+ enum
504
+ {
505
+ size = 8,
506
+ alignment = Aligned16,
507
+ vectorizable = true,
508
+ masked_load_available = false,
509
+ masked_store_available = false
510
+ };
511
+ };
512
+ template<> struct unpacket_traits<Packet16c>
513
+ {
514
+ typedef int8_t type;
515
+ typedef Packet8c half;
516
+ enum
517
+ {
518
+ size = 16,
519
+ alignment = Aligned16,
520
+ vectorizable = true,
521
+ masked_load_available = false,
522
+ masked_store_available = false
523
+ };
524
+ };
525
+ template<> struct unpacket_traits<Packet4uc>
526
+ {
527
+ typedef uint8_t type;
528
+ typedef Packet4uc half;
529
+ enum
530
+ {
531
+ size = 4,
532
+ alignment = Unaligned,
533
+ vectorizable = true,
534
+ masked_load_available = false,
535
+ masked_store_available = false
536
+ };
537
+ };
538
+ template<> struct unpacket_traits<Packet8uc>
539
+ {
540
+ typedef uint8_t type;
541
+ typedef Packet4uc half;
542
+ enum
543
+ {
544
+ size = 8,
545
+ alignment = Aligned16,
546
+ vectorizable = true,
547
+ masked_load_available = false,
548
+ masked_store_available = false
549
+ };
550
+ };
551
+ template<> struct unpacket_traits<Packet16uc>
552
+ {
553
+ typedef uint8_t type;
554
+ typedef Packet8uc half;
555
+ enum
556
+ {
557
+ size = 16,
558
+ alignment = Aligned16,
559
+ vectorizable = true,
560
+ masked_load_available = false,
561
+ masked_store_available = false};
562
+ };
563
+ template<> struct unpacket_traits<Packet4s>
564
+ {
565
+ typedef int16_t type;
566
+ typedef Packet4s half;
567
+ enum
568
+ {
569
+ size = 4,
570
+ alignment = Aligned16,
571
+ vectorizable = true,
572
+ masked_load_available = false,
573
+ masked_store_available = false
574
+ };
575
+ };
576
+ template<> struct unpacket_traits<Packet8s>
577
+ {
578
+ typedef int16_t type;
579
+ typedef Packet4s half;
580
+ enum
581
+ {
582
+ size = 8,
583
+ alignment = Aligned16,
584
+ vectorizable = true,
585
+ masked_load_available = false,
586
+ masked_store_available = false
587
+ };
588
+ };
589
+ template<> struct unpacket_traits<Packet4us>
590
+ {
591
+ typedef uint16_t type;
592
+ typedef Packet4us half;
593
+ enum
594
+ {
595
+ size = 4,
596
+ alignment = Aligned16,
597
+ vectorizable = true,
598
+ masked_load_available = false,
599
+ masked_store_available = false
600
+ };
601
+ };
602
+ template<> struct unpacket_traits<Packet8us>
603
+ {
604
+ typedef uint16_t type;
605
+ typedef Packet4us half;
606
+ enum
607
+ {
608
+ size = 8,
609
+ alignment = Aligned16,
610
+ vectorizable = true,
611
+ masked_load_available = false,
612
+ masked_store_available = false
613
+ };
614
+ };
615
+ template<> struct unpacket_traits<Packet2i>
616
+ {
617
+ typedef int32_t type;
618
+ typedef Packet2i half;
619
+ enum
620
+ {
621
+ size = 2,
622
+ alignment = Aligned16,
623
+ vectorizable = true,
624
+ masked_load_available = false,
625
+ masked_store_available = false
626
+ };
627
+ };
628
+ template<> struct unpacket_traits<Packet4i>
629
+ {
630
+ typedef int32_t type;
631
+ typedef Packet2i half;
632
+ enum
633
+ {
634
+ size = 4,
635
+ alignment = Aligned16,
636
+ vectorizable = true,
637
+ masked_load_available = false,
638
+ masked_store_available = false
639
+ };
640
+ };
641
+ template<> struct unpacket_traits<Packet2ui>
642
+ {
643
+ typedef uint32_t type;
644
+ typedef Packet2ui half;
645
+ enum
646
+ {
647
+ size = 2,
648
+ alignment = Aligned16,
649
+ vectorizable = true,
650
+ masked_load_available = false,
651
+ masked_store_available = false
652
+ };
653
+ };
654
+ template<> struct unpacket_traits<Packet4ui>
655
+ {
656
+ typedef uint32_t type;
657
+ typedef Packet2ui half;
658
+ enum
659
+ {
660
+ size = 4,
661
+ alignment = Aligned16,
662
+ vectorizable = true,
663
+ masked_load_available = false,
664
+ masked_store_available = false
665
+ };
666
+ };
667
+ template<> struct unpacket_traits<Packet2l>
668
+ {
669
+ typedef int64_t type;
670
+ typedef Packet2l half;
671
+ enum
672
+ {
673
+ size = 2,
674
+ alignment = Aligned16,
675
+ vectorizable = true,
676
+ masked_load_available = false,
677
+ masked_store_available = false
678
+ };
679
+ };
680
+ template<> struct unpacket_traits<Packet2ul>
681
+ {
682
+ typedef uint64_t type;
683
+ typedef Packet2ul half;
684
+ enum
685
+ {
686
+ size = 2,
687
+ alignment = Aligned16,
688
+ vectorizable = true,
689
+ masked_load_available = false,
690
+ masked_store_available = false
691
+ };
692
+ };
693
+
694
+ template<> EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) { return vdup_n_f32(from); }
695
+ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
696
+ template<> EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from)
697
+ { return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); }
698
+ template<> EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) { return vdup_n_s8(from); }
699
+ template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) { return vdupq_n_s8(from); }
700
+ template<> EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from)
701
+ { return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); }
702
+ template<> EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) { return vdup_n_u8(from); }
703
+ template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) { return vdupq_n_u8(from); }
704
+ template<> EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) { return vdup_n_s16(from); }
705
+ template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) { return vdupq_n_s16(from); }
706
+ template<> EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) { return vdup_n_u16(from); }
707
+ template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) { return vdupq_n_u16(from); }
708
+ template<> EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) { return vdup_n_s32(from); }
709
+ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
710
+ template<> EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) { return vdup_n_u32(from); }
711
+ template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return vdupq_n_u32(from); }
712
+ template<> EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { return vdupq_n_s64(from); }
713
+ template<> EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) { return vdupq_n_u64(from); }
714
+
715
+ template<> EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(unsigned int from)
716
+ { return vreinterpret_f32_u32(vdup_n_u32(from)); }
717
+ template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from)
718
+ { return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
719
+
720
+ template<> EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a)
721
+ {
722
+ const float c[] = {0.0f,1.0f};
723
+ return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
724
+ }
725
+ template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
726
+ {
727
+ const float c[] = {0.0f,1.0f,2.0f,3.0f};
728
+ return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
729
+ }
730
+ template<> EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a)
731
+ { return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); }
732
+ template<> EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a)
733
+ {
734
+ const int8_t c[] = {0,1,2,3,4,5,6,7};
735
+ return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
736
+ }
737
+ template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a)
738
+ {
739
+ const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
740
+ return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
741
+ }
742
+ template<> EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a)
743
+ { return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); }
744
+ template<> EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a)
745
+ {
746
+ const uint8_t c[] = {0,1,2,3,4,5,6,7};
747
+ return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
748
+ }
749
+ template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a)
750
+ {
751
+ const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
752
+ return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
753
+ }
754
+ template<> EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a)
755
+ {
756
+ const int16_t c[] = {0,1,2,3};
757
+ return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
758
+ }
759
+ template<> EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a)
760
+ {
761
+ const uint16_t c[] = {0,1,2,3};
762
+ return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
763
+ }
764
+ template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a)
765
+ {
766
+ const int16_t c[] = {0,1,2,3,4,5,6,7};
767
+ return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
768
+ }
769
+ template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a)
770
+ {
771
+ const uint16_t c[] = {0,1,2,3,4,5,6,7};
772
+ return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
773
+ }
774
+ template<> EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a)
775
+ {
776
+ const int32_t c[] = {0,1};
777
+ return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
778
+ }
779
+ template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
780
+ {
781
+ const int32_t c[] = {0,1,2,3};
782
+ return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
783
+ }
784
+ template<> EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a)
785
+ {
786
+ const uint32_t c[] = {0,1};
787
+ return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
788
+ }
789
+ template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a)
790
+ {
791
+ const uint32_t c[] = {0,1,2,3};
792
+ return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
793
+ }
794
+ template<> EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a)
795
+ {
796
+ const int64_t c[] = {0,1};
797
+ return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
798
+ }
799
+ template<> EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a)
800
+ {
801
+ const uint64_t c[] = {0,1};
802
+ return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
803
+ }
804
+
805
+ template<> EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); }
806
+ template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
807
+ template<> EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b)
808
+ {
809
+ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(
810
+ vreinterpret_s8_s32(vdup_n_s32(a)),
811
+ vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
812
+ }
813
+ template<> EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); }
814
+ template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); }
815
+ template<> EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
816
+ {
817
+ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(
818
+ vreinterpret_u8_u32(vdup_n_u32(a)),
819
+ vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
820
+ }
821
+ template<> EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); }
822
+ template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); }
823
+ template<> EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); }
824
+ template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); }
825
+ template<> EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); }
826
+ template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); }
827
+ template<> EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); }
828
+ template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
829
+ template<> EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); }
830
+ template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); }
831
+ template<> EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); }
832
+ template<> EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); }
833
+
834
+ template<> EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); }
835
+ template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
836
+ template<> EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b)
837
+ {
838
+ return vget_lane_s32(vreinterpret_s32_s8(vsub_s8(
839
+ vreinterpret_s8_s32(vdup_n_s32(a)),
840
+ vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
841
+ }
842
+ template<> EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); }
843
+ template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); }
844
+ template<> EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
845
+ {
846
+ return vget_lane_u32(vreinterpret_u32_u8(vsub_u8(
847
+ vreinterpret_u8_u32(vdup_n_u32(a)),
848
+ vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
849
+ }
850
+ template<> EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); }
851
+ template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); }
852
+ template<> EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); }
853
+ template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); }
854
+ template<> EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); }
855
+ template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); }
856
+ template<> EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); }
857
+ template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
858
+ template<> EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); }
859
+ template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); }
860
+ template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); }
861
+ template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); }
862
+
863
+ template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
864
+ template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
865
+ Packet2f mask = {numext::bit_cast<float>(0x80000000u), 0.0f};
866
+ return padd(a, pxor(mask, b));
867
+ }
868
+ template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
869
+ template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
870
+ Packet4f mask = {numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f};
871
+ return padd(a, pxor(mask, b));
872
+ }
873
+
874
+ template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); }
875
+ template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
876
+ template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a)
877
+ { return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
878
+ template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); }
879
+ template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); }
880
+ template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); }
881
+ template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); }
882
+ template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); }
883
+ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
884
+ template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
885
+ #if EIGEN_ARCH_ARM64
886
+ return vnegq_s64(a);
887
+ #else
888
+ return vcombine_s64(
889
+ vdup_n_s64(-vgetq_lane_s64(a, 0)),
890
+ vdup_n_s64(-vgetq_lane_s64(a, 1)));
891
+ #endif
892
+ }
893
+
894
+ template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; }
895
+ template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
896
+ template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; }
897
+ template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; }
898
+ template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; }
899
+ template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; }
900
+ template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; }
901
+ template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; }
902
+ template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; }
903
+ template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; }
904
+ template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; }
905
+ template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; }
906
+ template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; }
907
+ template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
908
+ template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; }
909
+ template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; }
910
+ template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; }
911
+ template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { return a; }
912
+
913
+ template<> EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); }
914
+ template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
915
+ template<> EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b)
916
+ {
917
+ return vget_lane_s32(vreinterpret_s32_s8(vmul_s8(
918
+ vreinterpret_s8_s32(vdup_n_s32(a)),
919
+ vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
920
+ }
921
+ template<> EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); }
922
+ template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); }
923
+ template<> EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
924
+ {
925
+ return vget_lane_u32(vreinterpret_u32_u8(vmul_u8(
926
+ vreinterpret_u8_u32(vdup_n_u32(a)),
927
+ vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
928
+ }
929
+ template<> EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); }
930
+ template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); }
931
+ template<> EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); }
932
+ template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); }
933
+ template<> EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); }
934
+ template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); }
935
+ template<> EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); }
936
+ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
937
+ template<> EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); }
938
+ template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); }
939
+ template<> EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
940
+ return vcombine_s64(
941
+ vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)),
942
+ vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1)));
943
+ }
944
+ template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
945
+ return vcombine_u64(
946
+ vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)),
947
+ vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
948
+ }
949
+
950
+ template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b)
951
+ {
952
+ #if EIGEN_ARCH_ARM64
953
+ return vdiv_f32(a,b);
954
+ #else
955
+ Packet2f inv, restep, div;
956
+
957
+ // NEON does not offer a divide instruction, we have to do a reciprocal approximation
958
+ // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
959
+ // a reciprocal estimate AND a reciprocal step -which saves a few instructions
960
+ // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
961
+ // Newton-Raphson and vrecpsq_f32()
962
+ inv = vrecpe_f32(b);
963
+
964
+ // This returns a differential, by which we will have to multiply inv to get a better
965
+ // approximation of 1/b.
966
+ restep = vrecps_f32(b, inv);
967
+ inv = vmul_f32(restep, inv);
968
+
969
+ // Finally, multiply a by 1/b and get the wanted result of the division.
970
+ div = vmul_f32(a, inv);
971
+
972
+ return div;
973
+ #endif
974
+ }
975
+ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
976
+ {
977
+ #if EIGEN_ARCH_ARM64
978
+ return vdivq_f32(a,b);
979
+ #else
980
+ Packet4f inv, restep, div;
981
+
982
+ // NEON does not offer a divide instruction, we have to do a reciprocal approximation
983
+ // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
984
+ // a reciprocal estimate AND a reciprocal step -which saves a few instructions
985
+ // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
986
+ // Newton-Raphson and vrecpsq_f32()
987
+ inv = vrecpeq_f32(b);
988
+
989
+ // This returns a differential, by which we will have to multiply inv to get a better
990
+ // approximation of 1/b.
991
+ restep = vrecpsq_f32(b, inv);
992
+ inv = vmulq_f32(restep, inv);
993
+
994
+ // Finally, multiply a by 1/b and get the wanted result of the division.
995
+ div = vmulq_f32(a, inv);
996
+
997
+ return div;
998
+ #endif
999
+ }
1000
+
1001
+ template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
1002
+ {
1003
+ eigen_assert(false && "packet integer division are not supported by NEON");
1004
+ return pset1<Packet4c>(0);
1005
+ }
1006
+ template<> EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/)
1007
+ {
1008
+ eigen_assert(false && "packet integer division are not supported by NEON");
1009
+ return pset1<Packet8c>(0);
1010
+ }
1011
+ template<> EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/)
1012
+ {
1013
+ eigen_assert(false && "packet integer division are not supported by NEON");
1014
+ return pset1<Packet16c>(0);
1015
+ }
1016
+ template<> EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/)
1017
+ {
1018
+ eigen_assert(false && "packet integer division are not supported by NEON");
1019
+ return pset1<Packet4uc>(0);
1020
+ }
1021
+ template<> EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/)
1022
+ {
1023
+ eigen_assert(false && "packet integer division are not supported by NEON");
1024
+ return pset1<Packet8uc>(0);
1025
+ }
1026
+ template<> EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/)
1027
+ {
1028
+ eigen_assert(false && "packet integer division are not supported by NEON");
1029
+ return pset1<Packet16uc>(0);
1030
+ }
1031
+ template<> EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/)
1032
+ {
1033
+ eigen_assert(false && "packet integer division are not supported by NEON");
1034
+ return pset1<Packet4s>(0);
1035
+ }
1036
+ template<> EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/)
1037
+ {
1038
+ eigen_assert(false && "packet integer division are not supported by NEON");
1039
+ return pset1<Packet8s>(0);
1040
+ }
1041
+ template<> EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/)
1042
+ {
1043
+ eigen_assert(false && "packet integer division are not supported by NEON");
1044
+ return pset1<Packet4us>(0);
1045
+ }
1046
+ template<> EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/)
1047
+ {
1048
+ eigen_assert(false && "packet integer division are not supported by NEON");
1049
+ return pset1<Packet8us>(0);
1050
+ }
1051
+ template<> EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/)
1052
+ {
1053
+ eigen_assert(false && "packet integer division are not supported by NEON");
1054
+ return pset1<Packet2i>(0);
1055
+ }
1056
+ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
1057
+ {
1058
+ eigen_assert(false && "packet integer division are not supported by NEON");
1059
+ return pset1<Packet4i>(0);
1060
+ }
1061
+ template<> EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/)
1062
+ {
1063
+ eigen_assert(false && "packet integer division are not supported by NEON");
1064
+ return pset1<Packet2ui>(0);
1065
+ }
1066
+ template<> EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/)
1067
+ {
1068
+ eigen_assert(false && "packet integer division are not supported by NEON");
1069
+ return pset1<Packet4ui>(0);
1070
+ }
1071
+ template<> EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/)
1072
+ {
1073
+ eigen_assert(false && "packet integer division are not supported by NEON");
1074
+ return pset1<Packet2l>(0LL);
1075
+ }
1076
+ template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/)
1077
+ {
1078
+ eigen_assert(false && "packet integer division are not supported by NEON");
1079
+ return pset1<Packet2ul>(0ULL);
1080
+ }
1081
+
1082
+
1083
+ #ifdef __ARM_FEATURE_FMA
1084
+ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
1085
+ { return vfmaq_f32(c,a,b); }
1086
+ template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
1087
+ { return vfma_f32(c,a,b); }
1088
+ #else
1089
+ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
1090
+ {
1091
+ return vmlaq_f32(c,a,b);
1092
+ }
1093
+ template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
1094
+ {
1095
+ return vmla_f32(c,a,b);
1096
+ }
1097
+ #endif
1098
+
1099
+ // No FMA instruction for int, so use MLA unconditionally.
1100
+ template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c)
1101
+ {
1102
+ return vget_lane_s32(vreinterpret_s32_s8(vmla_s8(
1103
+ vreinterpret_s8_s32(vdup_n_s32(c)),
1104
+ vreinterpret_s8_s32(vdup_n_s32(a)),
1105
+ vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1106
+ }
1107
+ template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c)
1108
+ { return vmla_s8(c,a,b); }
1109
+ template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c)
1110
+ { return vmlaq_s8(c,a,b); }
1111
+ template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c)
1112
+ {
1113
+ return vget_lane_u32(vreinterpret_u32_u8(vmla_u8(
1114
+ vreinterpret_u8_u32(vdup_n_u32(c)),
1115
+ vreinterpret_u8_u32(vdup_n_u32(a)),
1116
+ vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1117
+ }
1118
+ template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c)
1119
+ { return vmla_u8(c,a,b); }
1120
+ template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c)
1121
+ { return vmlaq_u8(c,a,b); }
1122
+ template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c)
1123
+ { return vmla_s16(c,a,b); }
1124
+ template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c)
1125
+ { return vmlaq_s16(c,a,b); }
1126
+ template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c)
1127
+ { return vmla_u16(c,a,b); }
1128
+ template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c)
1129
+ { return vmlaq_u16(c,a,b); }
1130
+ template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c)
1131
+ { return vmla_s32(c,a,b); }
1132
+ template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c)
1133
+ { return vmlaq_s32(c,a,b); }
1134
+ template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c)
1135
+ { return vmla_u32(c,a,b); }
1136
+ template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c)
1137
+ { return vmlaq_u32(c,a,b); }
1138
+
1139
+ template<> EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b)
1140
+ { return vabd_f32(a,b); }
1141
+ template<> EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b)
1142
+ { return vabdq_f32(a,b); }
1143
+ template<> EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b)
1144
+ {
1145
+ return vget_lane_s32(vreinterpret_s32_s8(vabd_s8(
1146
+ vreinterpret_s8_s32(vdup_n_s32(a)),
1147
+ vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1148
+ }
1149
+ template<> EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b)
1150
+ { return vabd_s8(a,b); }
1151
+ template<> EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b)
1152
+ { return vabdq_s8(a,b); }
1153
+ template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1154
+ {
1155
+ return vget_lane_u32(vreinterpret_u32_u8(vabd_u8(
1156
+ vreinterpret_u8_u32(vdup_n_u32(a)),
1157
+ vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1158
+ }
1159
+ template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1160
+ { return vabd_u8(a,b); }
1161
+ template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1162
+ { return vabdq_u8(a,b); }
1163
+ template<> EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b)
1164
+ { return vabd_s16(a,b); }
1165
+ template<> EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b)
1166
+ { return vabdq_s16(a,b); }
1167
+ template<> EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b)
1168
+ { return vabd_u16(a,b); }
1169
+ template<> EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b)
1170
+ { return vabdq_u16(a,b); }
1171
+ template<> EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b)
1172
+ { return vabd_s32(a,b); }
1173
+ template<> EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b)
1174
+ { return vabdq_s32(a,b); }
1175
+ template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1176
+ { return vabd_u32(a,b); }
1177
+ template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1178
+ { return vabdq_u32(a,b); }
1179
+
1180
+ template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); }
1181
+ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
1182
+
1183
+ #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1184
+ // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
1185
+ template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); }
1186
+ template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); }
1187
+ #endif
1188
+
1189
+ template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmin<Packet4f>(a, b); }
1190
+
1191
+ template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmin<Packet2f>(a, b); }
1192
+
1193
+ template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b)
1194
+ {
1195
+ return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(
1196
+ vreinterpret_s8_s32(vdup_n_s32(a)),
1197
+ vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1198
+ }
1199
+ template<> EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); }
1200
+ template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); }
1201
+ template<> EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1202
+ {
1203
+ return vget_lane_u32(vreinterpret_u32_u8(vmin_u8(
1204
+ vreinterpret_u8_u32(vdup_n_u32(a)),
1205
+ vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1206
+ }
1207
+ template<> EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); }
1208
+ template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); }
1209
+ template<> EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); }
1210
+ template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); }
1211
+ template<> EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); }
1212
+ template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); }
1213
+ template<> EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); }
1214
+ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
1215
+ template<> EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); }
1216
+ template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); }
1217
+ template<> EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
1218
+ return vcombine_s64(
1219
+ vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1220
+ vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1221
+ }
1222
+ template<> EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1223
+ return vcombine_u64(
1224
+ vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1225
+ vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1226
+ }
1227
+
1228
+ template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); }
1229
+ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
1230
+
1231
+ #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1232
+ // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
1233
+ template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); }
1234
+ template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); }
1235
+ #endif
1236
+
1237
+ template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmax<Packet4f>(a, b); }
1238
+
1239
+ template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmax<Packet2f>(a, b); }
1240
+
1241
+ template<> EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b)
1242
+ {
1243
+ return vget_lane_s32(vreinterpret_s32_s8(vmax_s8(
1244
+ vreinterpret_s8_s32(vdup_n_s32(a)),
1245
+ vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1246
+ }
1247
+ template<> EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); }
1248
+ template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); }
1249
+ template<> EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1250
+ {
1251
+ return vget_lane_u32(vreinterpret_u32_u8(vmax_u8(
1252
+ vreinterpret_u8_u32(vdup_n_u32(a)),
1253
+ vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1254
+ }
1255
+ template<> EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); }
1256
+ template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); }
1257
+ template<> EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); }
1258
+ template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); }
1259
+ template<> EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); }
1260
+ template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); }
1261
+ template<> EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); }
1262
+ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
1263
+ template<> EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); }
1264
+ template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); }
1265
+ template<> EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
1266
+ return vcombine_s64(
1267
+ vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1268
+ vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1269
+ }
1270
+ template<> EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1271
+ return vcombine_u64(
1272
+ vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1273
+ vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1274
+ }
1275
+
1276
+ template<> EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b)
1277
+ { return vreinterpret_f32_u32(vcle_f32(a,b)); }
1278
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
1279
+ { return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
1280
+ template<> EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b)
1281
+ {
1282
+ return vget_lane_s32(vreinterpret_s32_u8(vcle_s8(
1283
+ vreinterpret_s8_s32(vdup_n_s32(a)),
1284
+ vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1285
+ }
1286
+ template<> EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b)
1287
+ { return vreinterpret_s8_u8(vcle_s8(a,b)); }
1288
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b)
1289
+ { return vreinterpretq_s8_u8(vcleq_s8(a,b)); }
1290
+ template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1291
+ {
1292
+ return vget_lane_u32(vreinterpret_u32_u8(vcle_u8(
1293
+ vreinterpret_u8_u32(vdup_n_u32(a)),
1294
+ vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1295
+ }
1296
+ template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1297
+ { return vcle_u8(a,b); }
1298
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1299
+ { return vcleq_u8(a,b); }
1300
+ template<> EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b)
1301
+ { return vreinterpret_s16_u16(vcle_s16(a,b)); }
1302
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b)
1303
+ { return vreinterpretq_s16_u16(vcleq_s16(a,b)); }
1304
+ template<> EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b)
1305
+ { return vcle_u16(a,b); }
1306
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b)
1307
+ { return vcleq_u16(a,b); }
1308
+ template<> EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b)
1309
+ { return vreinterpret_s32_u32(vcle_s32(a,b)); }
1310
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b)
1311
+ { return vreinterpretq_s32_u32(vcleq_s32(a,b)); }
1312
+ template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1313
+ { return vcle_u32(a,b); }
1314
+ template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1315
+ { return vcleq_u32(a,b); }
1316
+ template<> EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b)
1317
+ {
1318
+ #if EIGEN_ARCH_ARM64
1319
+ return vreinterpretq_s64_u64(vcleq_s64(a,b));
1320
+ #else
1321
+ return vcombine_s64(
1322
+ vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1323
+ vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1324
+ #endif
1325
+ }
1326
+ template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1327
+ {
1328
+ #if EIGEN_ARCH_ARM64
1329
+ return vcleq_u64(a,b);
1330
+ #else
1331
+ return vcombine_u64(
1332
+ vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1333
+ vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1334
+ #endif
1335
+ }
1336
+
1337
+ template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b)
1338
+ { return vreinterpret_f32_u32(vclt_f32(a,b)); }
1339
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
1340
+ { return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
1341
+ template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b)
1342
+ {
1343
+ return vget_lane_s32(vreinterpret_s32_u8(vclt_s8(
1344
+ vreinterpret_s8_s32(vdup_n_s32(a)),
1345
+ vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1346
+ }
1347
+ template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b)
1348
+ { return vreinterpret_s8_u8(vclt_s8(a,b)); }
1349
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b)
1350
+ { return vreinterpretq_s8_u8(vcltq_s8(a,b)); }
1351
+ template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1352
+ {
1353
+ return vget_lane_u32(vreinterpret_u32_u8(vclt_u8(
1354
+ vreinterpret_u8_u32(vdup_n_u32(a)),
1355
+ vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1356
+ }
1357
+ template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1358
+ { return vclt_u8(a,b); }
1359
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1360
+ { return vcltq_u8(a,b); }
1361
+ template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b)
1362
+ { return vreinterpret_s16_u16(vclt_s16(a,b)); }
1363
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b)
1364
+ { return vreinterpretq_s16_u16(vcltq_s16(a,b)); }
1365
+ template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b)
1366
+ { return vclt_u16(a,b); }
1367
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b)
1368
+ { return vcltq_u16(a,b); }
1369
+ template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b)
1370
+ { return vreinterpret_s32_u32(vclt_s32(a,b)); }
1371
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b)
1372
+ { return vreinterpretq_s32_u32(vcltq_s32(a,b)); }
1373
+ template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1374
+ { return vclt_u32(a,b); }
1375
+ template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1376
+ { return vcltq_u32(a,b); }
1377
+ template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b)
1378
+ {
1379
+ #if EIGEN_ARCH_ARM64
1380
+ return vreinterpretq_s64_u64(vcltq_s64(a,b));
1381
+ #else
1382
+ return vcombine_s64(
1383
+ vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1384
+ vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1385
+ #endif
1386
+ }
1387
+ template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1388
+ {
1389
+ #if EIGEN_ARCH_ARM64
1390
+ return vcltq_u64(a,b);
1391
+ #else
1392
+ return vcombine_u64(
1393
+ vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1394
+ vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1395
+ #endif
1396
+ }
1397
+
1398
+ template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b)
1399
+ { return vreinterpret_f32_u32(vceq_f32(a,b)); }
1400
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
1401
+ { return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
1402
+ template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b)
1403
+ {
1404
+ return vget_lane_s32(vreinterpret_s32_u8(vceq_s8(
1405
+ vreinterpret_s8_s32(vdup_n_s32(a)),
1406
+ vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1407
+ }
1408
+ template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b)
1409
+ { return vreinterpret_s8_u8(vceq_s8(a,b)); }
1410
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b)
1411
+ { return vreinterpretq_s8_u8(vceqq_s8(a,b)); }
1412
+ template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1413
+ {
1414
+ return vget_lane_u32(vreinterpret_u32_u8(vceq_u8(
1415
+ vreinterpret_u8_u32(vdup_n_u32(a)),
1416
+ vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1417
+ }
1418
+ template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1419
+ { return vceq_u8(a,b); }
1420
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1421
+ { return vceqq_u8(a,b); }
1422
+ template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b)
1423
+ { return vreinterpret_s16_u16(vceq_s16(a,b)); }
1424
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b)
1425
+ { return vreinterpretq_s16_u16(vceqq_s16(a,b)); }
1426
+ template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b)
1427
+ { return vceq_u16(a,b); }
1428
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b)
1429
+ { return vceqq_u16(a,b); }
1430
+ template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b)
1431
+ { return vreinterpret_s32_u32(vceq_s32(a,b)); }
1432
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b)
1433
+ { return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
1434
+ template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1435
+ { return vceq_u32(a,b); }
1436
+ template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1437
+ { return vceqq_u32(a,b); }
1438
+ template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b)
1439
+ {
1440
+ #if EIGEN_ARCH_ARM64
1441
+ return vreinterpretq_s64_u64(vceqq_s64(a,b));
1442
+ #else
1443
+ return vcombine_s64(
1444
+ vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1445
+ vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1446
+ #endif
1447
+ }
1448
+ template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1449
+ {
1450
+ #if EIGEN_ARCH_ARM64
1451
+ return vceqq_u64(a,b);
1452
+ #else
1453
+ return vcombine_u64(
1454
+ vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1455
+ vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1456
+ #endif
1457
+ }
1458
+
1459
+ template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b)
1460
+ { return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); }
1461
+ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b)
1462
+ { return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
1463
+
1464
+ // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
1465
+ template<> EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b)
1466
+ { return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
1467
+ template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
1468
+ { return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
1469
+ template<> EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b)
1470
+ { return a & b; }
1471
+ template<> EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b)
1472
+ { return vand_s8(a,b); }
1473
+ template<> EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b)
1474
+ { return vandq_s8(a,b); }
1475
+ template<> EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1476
+ { return a & b; }
1477
+ template<> EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1478
+ { return vand_u8(a,b); }
1479
+ template<> EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1480
+ { return vandq_u8(a,b); }
1481
+ template<> EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); }
1482
+ template<> EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); }
1483
+ template<> EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b)
1484
+ { return vand_u16(a,b); }
1485
+ template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b)
1486
+ { return vandq_u16(a,b); }
1487
+ template<> EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) { return vand_s32(a,b); }
1488
+ template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
1489
+ template<> EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1490
+ { return vand_u32(a,b); }
1491
+ template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1492
+ { return vandq_u32(a,b); }
1493
+ template<> EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); }
1494
+ template<> EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1495
+ { return vandq_u64(a,b); }
1496
+
1497
+ template<> EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b)
1498
+ { return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
1499
+ template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
1500
+ { return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
1501
+ template<> EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b)
1502
+ { return a | b; }
1503
+ template<> EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); }
1504
+ template<> EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b)
1505
+ { return vorrq_s8(a,b); }
1506
+ template<> EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1507
+ { return a | b; }
1508
+ template<> EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1509
+ { return vorr_u8(a,b); }
1510
+ template<> EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1511
+ { return vorrq_u8(a,b); }
1512
+ template<> EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b)
1513
+ { return vorr_s16(a,b); }
1514
+ template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b)
1515
+ { return vorrq_s16(a,b); }
1516
+ template<> EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b)
1517
+ { return vorr_u16(a,b); }
1518
+ template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b)
1519
+ { return vorrq_u16(a,b); }
1520
+ template<> EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); }
1521
+ template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
1522
+ template<> EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1523
+ { return vorr_u32(a,b); }
1524
+ template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1525
+ { return vorrq_u32(a,b); }
1526
+ template<> EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b)
1527
+ { return vorrq_s64(a,b); }
1528
+ template<> EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1529
+ { return vorrq_u64(a,b); }
1530
+
1531
// Bitwise XOR. Floats have no integer EOR form, so they round-trip through a
// u32 reinterpretation (bit pattern preserved). Packet4c/Packet4uc pack four
// bytes into a plain 32-bit scalar, so the scalar ^ operator already XORs all
// lanes at once.
template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b) { return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
template<> EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b) { return a ^ b; }
template<> EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b) { return veor_s8(a,b); }
template<> EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) { return veorq_s8(a,b); }
template<> EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b) { return a ^ b; }
template<> EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return veor_u8(a,b); }
template<> EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return veorq_u8(a,b); }
template<> EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); }
template<> EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); }
template<> EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b) { return veor_u16(a,b); }
template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) { return veorq_u16(a,b); }
template<> EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return veor_u32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return veorq_u32(a,b); }
template<> EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) { return veorq_s64(a,b); }
template<> EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return veorq_u64(a,b); }
1563
+
1564
// pandnot(a,b) computes a & ~b. NEON provides this as a single VBIC
// ("bit clear") instruction; floats are bit-cast through u32 first, and the
// scalar-packed Packet4c/Packet4uc types use plain integer arithmetic.
template<> EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b) { return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
template<> EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b) { return a & ~b; }
template<> EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); }
template<> EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); }
template<> EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b) { return a & ~b; }
template<> EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vbic_u8(a,b); }
template<> EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vbicq_u8(a,b); }
template<> EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b) { return vbic_s16(a,b); }
template<> EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) { return vbicq_s16(a,b); }
template<> EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b) { return vbic_u16(a,b); }
template<> EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) { return vbicq_u16(a,b); }
template<> EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b) { return vbic_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vbic_u32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vbicq_u32(a,b); }
template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) { return vbicq_s64(a,b); }
template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vbicq_u64(a,b); }
1598
+
1599
+
1600
// Arithmetic (sign-propagating) right shift by the compile-time amount N.
// For unsigned lane types an arithmetic and a logical shift are identical,
// so the unsigned variants use the plain vshr_n_uXX forms.
// Packet4c/Packet4uc store four bytes packed in a 32-bit scalar: the value is
// broadcast into a D register, shifted lane-wise, and lane 0 extracted back.
template<int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
template<int N> EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a)
{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
template<int N> EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
1618
+
1619
// Logical (zero-filling) right shift by the compile-time amount N.
// NEON's vshr on a *signed* type is an arithmetic shift, so every signed
// packet must be reinterpreted as its unsigned counterpart before shifting
// to guarantee zeros are shifted in; unsigned packets can shift directly.
// Packet4c/Packet4uc store four bytes packed in a 32-bit scalar: the value is
// broadcast into a D register, shifted per byte lane, and lane 0 extracted.
template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a)
{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); }
template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a)
{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); }
template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a)
{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); }
// Bug fix: this specialization previously routed through vreinterpret_s8_u32 +
// vshr_n_s8, i.e. an *arithmetic* shift, which sign-extends any byte lane with
// the high bit set (>= 0x80). A logical shift of unsigned bytes must insert
// zeros, exactly like the Packet8uc/Packet16uc variants below.
template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a)
{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a)
{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); }
template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a)
{ return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); }
template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a)
{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); }
template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); }
template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a)
{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); }
template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
1644
+
1645
// Left shift by the compile-time amount N. A left shift is the same bit
// operation for signed and unsigned lanes, so each packet uses its native
// vshl_n form directly. The scalar-packed 4-byte packets broadcast into a
// D register, shift lane-wise, and extract lane 0 again.
template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a)
{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a)
{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); }
1663
+
1664
// Aligned loads. Each specialization wraps the matching vld1 intrinsic; the
// 4-byte packed packets (Packet4c/Packet4uc) are plain scalars, so a memcpy
// of their 4 bytes is both alignment-safe and strict-aliasing-safe.
template<> EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from)
{
  Packet4c res;
  memcpy(&res, from, sizeof(Packet4c));
  return res;
}
template<> EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); }
template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); }
template<> EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from)
{
  Packet4uc res;
  memcpy(&res, from, sizeof(Packet4uc));
  return res;
}
template<> EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); }
template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); }
template<> EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); }
template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); }
template<> EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); }
template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); }
template<> EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); }
template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); }
template<> EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); }
template<> EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); }
1708
+
1709
// Unaligned loads. NEON's vld1 has no alignment requirement, so these mirror
// the aligned variants and differ only in the debug-tracing macro; the packed
// 4-byte packets again go through memcpy.
template<> EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from)
{
  Packet4c res;
  memcpy(&res, from, sizeof(Packet4c));
  return res;
}
template<> EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); }
template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); }
template<> EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from)
{
  Packet4uc res;
  memcpy(&res, from, sizeof(Packet4uc));
  return res;
}
template<> EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); }
template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); }
template<> EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); }
template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); }
template<> EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); }
template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); }
template<> EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); }
template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); }
template<> EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); }
template<> EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); }
1753
+
1754
// ploaddup loads PacketSize/2 consecutive elements and duplicates each one,
// producing {from[0], from[0], from[1], from[1], ...}. Half-width packets use
// vld1_dup / vcombine of two dup-loads; byte/short packets interleave a value
// with itself via vzip so each element appears twice in order.
template<> EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from)
{ return vld1_dup_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); }
template<> EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from)
{
  // Broadcast the packed 4-byte scalar into a D register, zip it with itself
  // (duplicating each byte), and pull the low 32 bits back out.
  const int8x8_t v = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
  return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(v,v).val[0]), 0);
}
template<> EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from)
{
  const int8x8_t v = vld1_s8(from);
  return vzip_s8(v,v).val[0];
}
template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from)
{
  const int8x8_t v = vld1_s8(from);
  const int8x8x2_t zipped = vzip_s8(v,v);
  return vcombine_s8(zipped.val[0], zipped.val[1]);
}
template<> EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from)
{
  const uint8x8_t v = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
  return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(v,v).val[0]), 0);
}
template<> EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from)
{
  const uint8x8_t v = vld1_u8(from);
  return vzip_u8(v,v).val[0];
}
template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from)
{
  const uint8x8_t v = vld1_u8(from);
  const uint8x8x2_t zipped = vzip_u8(v,v);
  return vcombine_u8(zipped.val[0], zipped.val[1]);
}
template<> EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from)
{
  // Dup-load each 16-bit value, then interleave the two 32-bit pairs so the
  // result is {from[0], from[0], from[1], from[1]}.
  return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)),
                                       vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]);
}
template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from)
{
  const int16x4_t v = vld1_s16(from);
  const int16x4x2_t zipped = vzip_s16(v,v);
  return vcombine_s16(zipped.val[0], zipped.val[1]);
}
template<> EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from)
{
  return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)),
                                       vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]);
}
template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from)
{
  const uint16x4_t v = vld1_u16(from);
  const uint16x4x2_t zipped = vzip_u16(v,v);
  return vcombine_u16(zipped.val[0], zipped.val[1]);
}
template<> EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from)
{ return vld1_dup_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); }
template<> EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from)
{ return vld1_dup_u32(from); }
template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); }
template<> EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from)
{ return vld1q_dup_s64(from); }
template<> EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from)
{ return vld1q_dup_u64(from); }
1824
+
1825
// ploadquad loads PacketSize/4 consecutive elements and repeats each of them
// four times: {from[0] x4, from[1] x4, ...}. Byte packets build each run of
// four by dup-loading one byte and zipping 32-bit groups together.
template<> EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) { return vld1q_dup_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from)
{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); }
template<> EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from)
{
  // Two dup-loaded bytes, each filling one 32-bit group of the result.
  return vreinterpret_s8_u32(vzip_u32(
      vreinterpret_u32_s8(vld1_dup_s8(from)),
      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
}
template<> EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from)
{
  const int8x8_t lo = vreinterpret_s8_u32(vzip_u32(
      vreinterpret_u32_s8(vld1_dup_s8(from)),
      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
  const int8x8_t hi = vreinterpret_s8_u32(vzip_u32(
      vreinterpret_u32_s8(vld1_dup_s8(from+2)),
      vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]);
  return vcombine_s8(lo,hi);
}
template<> EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from)
{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); }
template<> EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from)
{
  return vreinterpret_u8_u32(vzip_u32(
      vreinterpret_u32_u8(vld1_dup_u8(from)),
      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
}
template<> EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from)
{
  const uint8x8_t lo = vreinterpret_u8_u32(vzip_u32(
      vreinterpret_u32_u8(vld1_dup_u8(from)),
      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
  const uint8x8_t hi = vreinterpret_u8_u32(vzip_u32(
      vreinterpret_u32_u8(vld1_dup_u8(from+2)),
      vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]);
  return vcombine_u8(lo,hi);
}
template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from)
{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); }
template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from)
{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); }
template<> EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) { return vld1q_dup_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) { return vld1q_dup_u32(from); }
1868
+
1869
// Aligned stores: each is a direct vst1 of the packet to memory. The packed
// 4-byte scalar packets are written with memcpy instead of an intrinsic.
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from) { memcpy(to, &from, sizeof(from)); }
template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from) { EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from) { memcpy(to, &from, sizeof(from)); }
template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from) { EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from) { EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from) { EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from) { EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); }
template<> EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); }
1905
+
1906
// Unaligned stores: identical instruction selection to the aligned variants
// (vst1 tolerates any alignment); only the debug-tracing macro differs.
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from) { memcpy(to, &from, sizeof(from)); }
template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from) { memcpy(to, &from, sizeof(from)); }
template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); }
template<> EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); }
1942
+
1943
// Strided gathers: read PacketSize elements spaced 'stride' apart and pack
// them into one register. NEON has no gather instruction, so lane 0 is
// dup-loaded and the remaining lanes are filled one vld1_lane at a time.
// The packed 4-byte packets fill their bytes through a scalar loop instead.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride)
{
  Packet2f r = vld1_dup_f32(from);
  r = vld1_lane_f32(from + 1*stride, r, 1);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
  Packet4f r = vld1q_dup_f32(from);
  r = vld1q_lane_f32(from + 1*stride, r, 1);
  r = vld1q_lane_f32(from + 2*stride, r, 2);
  r = vld1q_lane_f32(from + 3*stride, r, 3);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride)
{
  Packet4c r;
  // Write each gathered byte directly into the packed 32-bit scalar.
  for (int i = 0; i < 4; ++i)
    reinterpret_cast<int8_t*>(&r)[i] = *(from + i * stride);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride)
{
  Packet8c r = vld1_dup_s8(from);
  r = vld1_lane_s8(from + 1*stride, r, 1);
  r = vld1_lane_s8(from + 2*stride, r, 2);
  r = vld1_lane_s8(from + 3*stride, r, 3);
  r = vld1_lane_s8(from + 4*stride, r, 4);
  r = vld1_lane_s8(from + 5*stride, r, 5);
  r = vld1_lane_s8(from + 6*stride, r, 6);
  r = vld1_lane_s8(from + 7*stride, r, 7);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
{
  Packet16c r = vld1q_dup_s8(from);
  r = vld1q_lane_s8(from + 1*stride, r, 1);
  r = vld1q_lane_s8(from + 2*stride, r, 2);
  r = vld1q_lane_s8(from + 3*stride, r, 3);
  r = vld1q_lane_s8(from + 4*stride, r, 4);
  r = vld1q_lane_s8(from + 5*stride, r, 5);
  r = vld1q_lane_s8(from + 6*stride, r, 6);
  r = vld1q_lane_s8(from + 7*stride, r, 7);
  r = vld1q_lane_s8(from + 8*stride, r, 8);
  r = vld1q_lane_s8(from + 9*stride, r, 9);
  r = vld1q_lane_s8(from + 10*stride, r, 10);
  r = vld1q_lane_s8(from + 11*stride, r, 11);
  r = vld1q_lane_s8(from + 12*stride, r, 12);
  r = vld1q_lane_s8(from + 13*stride, r, 13);
  r = vld1q_lane_s8(from + 14*stride, r, 14);
  r = vld1q_lane_s8(from + 15*stride, r, 15);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride)
{
  Packet4uc r;
  for (int i = 0; i < 4; ++i)
    reinterpret_cast<uint8_t*>(&r)[i] = *(from + i * stride);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride)
{
  Packet8uc r = vld1_dup_u8(from);
  r = vld1_lane_u8(from + 1*stride, r, 1);
  r = vld1_lane_u8(from + 2*stride, r, 2);
  r = vld1_lane_u8(from + 3*stride, r, 3);
  r = vld1_lane_u8(from + 4*stride, r, 4);
  r = vld1_lane_u8(from + 5*stride, r, 5);
  r = vld1_lane_u8(from + 6*stride, r, 6);
  r = vld1_lane_u8(from + 7*stride, r, 7);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
{
  Packet16uc r = vld1q_dup_u8(from);
  r = vld1q_lane_u8(from + 1*stride, r, 1);
  r = vld1q_lane_u8(from + 2*stride, r, 2);
  r = vld1q_lane_u8(from + 3*stride, r, 3);
  r = vld1q_lane_u8(from + 4*stride, r, 4);
  r = vld1q_lane_u8(from + 5*stride, r, 5);
  r = vld1q_lane_u8(from + 6*stride, r, 6);
  r = vld1q_lane_u8(from + 7*stride, r, 7);
  r = vld1q_lane_u8(from + 8*stride, r, 8);
  r = vld1q_lane_u8(from + 9*stride, r, 9);
  r = vld1q_lane_u8(from + 10*stride, r, 10);
  r = vld1q_lane_u8(from + 11*stride, r, 11);
  r = vld1q_lane_u8(from + 12*stride, r, 12);
  r = vld1q_lane_u8(from + 13*stride, r, 13);
  r = vld1q_lane_u8(from + 14*stride, r, 14);
  r = vld1q_lane_u8(from + 15*stride, r, 15);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride)
{
  Packet4s r = vld1_dup_s16(from);
  r = vld1_lane_s16(from + 1*stride, r, 1);
  r = vld1_lane_s16(from + 2*stride, r, 2);
  r = vld1_lane_s16(from + 3*stride, r, 3);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride)
{
  Packet8s r = vld1q_dup_s16(from);
  r = vld1q_lane_s16(from + 1*stride, r, 1);
  r = vld1q_lane_s16(from + 2*stride, r, 2);
  r = vld1q_lane_s16(from + 3*stride, r, 3);
  r = vld1q_lane_s16(from + 4*stride, r, 4);
  r = vld1q_lane_s16(from + 5*stride, r, 5);
  r = vld1q_lane_s16(from + 6*stride, r, 6);
  r = vld1q_lane_s16(from + 7*stride, r, 7);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride)
{
  Packet4us r = vld1_dup_u16(from);
  r = vld1_lane_u16(from + 1*stride, r, 1);
  r = vld1_lane_u16(from + 2*stride, r, 2);
  r = vld1_lane_u16(from + 3*stride, r, 3);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride)
{
  Packet8us r = vld1q_dup_u16(from);
  r = vld1q_lane_u16(from + 1*stride, r, 1);
  r = vld1q_lane_u16(from + 2*stride, r, 2);
  r = vld1q_lane_u16(from + 3*stride, r, 3);
  r = vld1q_lane_u16(from + 4*stride, r, 4);
  r = vld1q_lane_u16(from + 5*stride, r, 5);
  r = vld1q_lane_u16(from + 6*stride, r, 6);
  r = vld1q_lane_u16(from + 7*stride, r, 7);
  return r;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride)
{
  Packet2i r = vld1_dup_s32(from);
  r = vld1_lane_s32(from + 1*stride, r, 1);
  return r;
}
2081
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
2082
+ {
2083
+ Packet4i res = vld1q_dup_s32(from);
2084
+ res = vld1q_lane_s32(from + 1*stride, res, 1);
2085
+ res = vld1q_lane_s32(from + 2*stride, res, 2);
2086
+ res = vld1q_lane_s32(from + 3*stride, res, 3);
2087
+ return res;
2088
+ }
2089
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride)
2090
+ {
2091
+ Packet2ui res = vld1_dup_u32(from);
2092
+ res = vld1_lane_u32(from + 1*stride, res, 1);
2093
+ return res;
2094
+ }
2095
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
2096
+ {
2097
+ Packet4ui res = vld1q_dup_u32(from);
2098
+ res = vld1q_lane_u32(from + 1*stride, res, 1);
2099
+ res = vld1q_lane_u32(from + 2*stride, res, 2);
2100
+ res = vld1q_lane_u32(from + 3*stride, res, 3);
2101
+ return res;
2102
+ }
2103
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride)
2104
+ {
2105
+ Packet2l res = vld1q_dup_s64(from);
2106
+ res = vld1q_lane_s64(from + 1*stride, res, 1);
2107
+ return res;
2108
+ }
2109
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride)
2110
+ {
2111
+ Packet2ul res = vld1q_dup_u64(from);
2112
+ res = vld1q_lane_u64(from + 1*stride, res, 1);
2113
+ return res;
2114
+ }
2115
+
2116
// Strided scatters: store each packet lane to `to + lane*stride` with a
// lane-wise store (vst1*_lane).
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride)
{
  vst1_lane_f32(to + stride*0, from, 0);
  vst1_lane_f32(to + stride*1, from, 1);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
  vst1q_lane_f32(to + stride*0, from, 0);
  vst1q_lane_f32(to + stride*1, from, 1);
  vst1q_lane_f32(to + stride*2, from, 2);
  vst1q_lane_f32(to + stride*3, from, 3);
}
2128
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride)
2129
+ {
2130
+ for (int i = 0; i != 4; i++)
2131
+ *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
2132
+ }
2133
// Strided scatters for the 8- and 16-lane signed-byte packets: one lane-wise
// store (vst1_lane_s8 / vst1q_lane_s8) per element, `stride` bytes apart.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride)
{
  vst1_lane_s8(to + stride*0, from, 0);
  vst1_lane_s8(to + stride*1, from, 1);
  vst1_lane_s8(to + stride*2, from, 2);
  vst1_lane_s8(to + stride*3, from, 3);
  vst1_lane_s8(to + stride*4, from, 4);
  vst1_lane_s8(to + stride*5, from, 5);
  vst1_lane_s8(to + stride*6, from, 6);
  vst1_lane_s8(to + stride*7, from, 7);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, Index stride)
{
  vst1q_lane_s8(to + stride*0, from, 0);
  vst1q_lane_s8(to + stride*1, from, 1);
  vst1q_lane_s8(to + stride*2, from, 2);
  vst1q_lane_s8(to + stride*3, from, 3);
  vst1q_lane_s8(to + stride*4, from, 4);
  vst1q_lane_s8(to + stride*5, from, 5);
  vst1q_lane_s8(to + stride*6, from, 6);
  vst1q_lane_s8(to + stride*7, from, 7);
  vst1q_lane_s8(to + stride*8, from, 8);
  vst1q_lane_s8(to + stride*9, from, 9);
  vst1q_lane_s8(to + stride*10, from, 10);
  vst1q_lane_s8(to + stride*11, from, 11);
  vst1q_lane_s8(to + stride*12, from, 12);
  vst1q_lane_s8(to + stride*13, from, 13);
  vst1q_lane_s8(to + stride*14, from, 14);
  vst1q_lane_s8(to + stride*15, from, 15);
}
2163
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from, Index stride)
2164
+ {
2165
+ for (int i = 0; i != 4; i++)
2166
+ *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
2167
+ }
2168
// Strided scatters for the remaining integer packet types: one lane-wise
// store per element at `to + stride*lane`.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from, Index stride)
{
  vst1_lane_u8(to + stride*0, from, 0);
  vst1_lane_u8(to + stride*1, from, 1);
  vst1_lane_u8(to + stride*2, from, 2);
  vst1_lane_u8(to + stride*3, from, 3);
  vst1_lane_u8(to + stride*4, from, 4);
  vst1_lane_u8(to + stride*5, from, 5);
  vst1_lane_u8(to + stride*6, from, 6);
  vst1_lane_u8(to + stride*7, from, 7);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, Index stride)
{
  vst1q_lane_u8(to + stride*0, from, 0);
  vst1q_lane_u8(to + stride*1, from, 1);
  vst1q_lane_u8(to + stride*2, from, 2);
  vst1q_lane_u8(to + stride*3, from, 3);
  vst1q_lane_u8(to + stride*4, from, 4);
  vst1q_lane_u8(to + stride*5, from, 5);
  vst1q_lane_u8(to + stride*6, from, 6);
  vst1q_lane_u8(to + stride*7, from, 7);
  vst1q_lane_u8(to + stride*8, from, 8);
  vst1q_lane_u8(to + stride*9, from, 9);
  vst1q_lane_u8(to + stride*10, from, 10);
  vst1q_lane_u8(to + stride*11, from, 11);
  vst1q_lane_u8(to + stride*12, from, 12);
  vst1q_lane_u8(to + stride*13, from, 13);
  vst1q_lane_u8(to + stride*14, from, 14);
  vst1q_lane_u8(to + stride*15, from, 15);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from, Index stride)
{
  vst1_lane_s16(to + stride*0, from, 0);
  vst1_lane_s16(to + stride*1, from, 1);
  vst1_lane_s16(to + stride*2, from, 2);
  vst1_lane_s16(to + stride*3, from, 3);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, Index stride)
{
  vst1q_lane_s16(to + stride*0, from, 0);
  vst1q_lane_s16(to + stride*1, from, 1);
  vst1q_lane_s16(to + stride*2, from, 2);
  vst1q_lane_s16(to + stride*3, from, 3);
  vst1q_lane_s16(to + stride*4, from, 4);
  vst1q_lane_s16(to + stride*5, from, 5);
  vst1q_lane_s16(to + stride*6, from, 6);
  vst1q_lane_s16(to + stride*7, from, 7);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from, Index stride)
{
  vst1_lane_u16(to + stride*0, from, 0);
  vst1_lane_u16(to + stride*1, from, 1);
  vst1_lane_u16(to + stride*2, from, 2);
  vst1_lane_u16(to + stride*3, from, 3);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, Index stride)
{
  vst1q_lane_u16(to + stride*0, from, 0);
  vst1q_lane_u16(to + stride*1, from, 1);
  vst1q_lane_u16(to + stride*2, from, 2);
  vst1q_lane_u16(to + stride*3, from, 3);
  vst1q_lane_u16(to + stride*4, from, 4);
  vst1q_lane_u16(to + stride*5, from, 5);
  vst1q_lane_u16(to + stride*6, from, 6);
  vst1q_lane_u16(to + stride*7, from, 7);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from, Index stride)
{
  vst1_lane_s32(to + stride*0, from, 0);
  vst1_lane_s32(to + stride*1, from, 1);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
{
  vst1q_lane_s32(to + stride*0, from, 0);
  vst1q_lane_s32(to + stride*1, from, 1);
  vst1q_lane_s32(to + stride*2, from, 2);
  vst1q_lane_s32(to + stride*3, from, 3);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from, Index stride)
{
  vst1_lane_u32(to + stride*0, from, 0);
  vst1_lane_u32(to + stride*1, from, 1);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
{
  vst1q_lane_u32(to + stride*0, from, 0);
  vst1q_lane_u32(to + stride*1, from, 1);
  vst1q_lane_u32(to + stride*2, from, 2);
  vst1q_lane_u32(to + stride*3, from, 3);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride)
{
  vst1q_lane_s64(to + stride*0, from, 0);
  vst1q_lane_s64(to + stride*1, from, 1);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, Index stride)
{
  vst1q_lane_u64(to + stride*0, from, 0);
  vst1q_lane_u64(to + stride*1, from, 1);
}
2268
+
2269
// Software prefetch hints: all scalar types forward to the EIGEN_ARM_PREFETCH
// macro (defined elsewhere in this file), which may expand to a no-op on
// targets without a prefetch instruction.
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2278
+
2279
// pfirst: extract lane 0 of a packet as a scalar (vget_lane / vgetq_lane).
// Packet4c / Packet4uc hold four 8-bit lanes packed in a 32-bit integer, so
// lane 0 is extracted by masking off the low byte.
template<> EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) { return vget_lane_f32(a,0); }
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return vgetq_lane_f32(a,0); }
template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) { return static_cast<int8_t>(a & 0xff); }
template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) { return vget_lane_s8(a,0); }
template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) { return vgetq_lane_s8(a,0); }
template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) { return static_cast<uint8_t>(a & 0xff); }
template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) { return vget_lane_u8(a,0); }
template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) { return vgetq_lane_u8(a,0); }
template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) { return vget_lane_s16(a,0); }
template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) { return vgetq_lane_s16(a,0); }
template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) { return vget_lane_u16(a,0); }
template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) { return vgetq_lane_u16(a,0); }
template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) { return vget_lane_s32(a,0); }
template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { return vgetq_lane_s32(a,0); }
template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(a,0); }
template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return vgetq_lane_u32(a,0); }
template<> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) { return vgetq_lane_s64(a,0); }
template<> EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) { return vgetq_lane_u64(a,0); }
2297
+
2298
// preverse: reverse the lane order of a packet.
// 64-bit (d-register) packets use a single vrev64 instruction.  128-bit
// (q-register) packets need two steps, because vrev64q only reverses lanes
// within each 64-bit half: reverse within halves, then swap the halves with
// vget_high/vget_low + vcombine.  Packet4c/Packet4uc are first widened to a
// d-register via vdup_n_*32 + vreinterpret, reversed, and packed back to a
// 32-bit scalar.
template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
  const float32x4_t a_r64 = vrev64q_f32(a);
  return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
}
template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a)
{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); }
template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
{
  const int8x16_t a_r64 = vrev64q_s8(a);
  return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
}
template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a)
{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); }
template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); }
template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
{
  const uint8x16_t a_r64 = vrev64q_u8(a);
  return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
}
template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); }
template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
{
  const int16x8_t a_r64 = vrev64q_s16(a);
  return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
}
template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); }
template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
{
  const uint16x8_t a_r64 = vrev64q_u16(a);
  return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
}
template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{
  const int32x4_t a_r64 = vrev64q_s32(a);
  return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
}
template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); }
template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a)
{
  const uint32x4_t a_r64 = vrev64q_u32(a);
  return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
}
// Two 64-bit lanes: reversing is just swapping the register halves.
template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a)
{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); }
template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a)
{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); }
2348
+
2349
// pabs: element-wise absolute value.  Unsigned packet types are returned
// unchanged (|x| == x).  Packet4c widens to a d-register, takes vabs_s8, and
// packs back to a 32-bit scalar.
template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a)
{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); }
template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); }
template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); }
template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); }
template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
#if EIGEN_ARCH_ARM64
  // AArch64 has a native 64-bit vector abs.
  return vabsq_s64(a);
#else
  // 32-bit ARM NEON lacks vabsq_s64: fall back to scalar std::abs per lane.
  return vcombine_s64(
      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))),
      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
#endif
}
template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; }
2376
+
2377
// pfrexp / pldexp: mantissa/exponent decomposition and scaling by a power of
// two.  Both delegate to the generic bit-twiddling implementations
// (pfrexp_generic / pldexp_generic, defined elsewhere in Eigen).
template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent)
{ return pfrexp_generic(a,exponent); }
template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
{ return pfrexp_generic(a,exponent); }

template<> EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent)
{ return pldexp_generic(a,exponent); }
template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent)
{ return pldexp_generic(a,exponent); }
2386
+
2387
// predux: horizontal sum of all lanes.  128-bit packets first fold the high
// half onto the low half (vget_low + vget_high + vadd), then pairwise-add
// (vpadd) log2(lanes) times until lane 0 holds the total.  Packet4c/Packet4uc
// are widened to a d-register first.  64-bit-lane packets just add the two
// extracted scalars.
template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); }
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
  return vget_lane_f32(vpadd_f32(sum, sum), 0);
}
template<> EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a)
{
  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
  int8x8_t sum = vpadd_s8(a_dup, a_dup);
  sum = vpadd_s8(sum, sum);
  return vget_lane_s8(sum, 0);
}
template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a)
{
  int8x8_t sum = vpadd_s8(a,a);
  sum = vpadd_s8(sum, sum);
  sum = vpadd_s8(sum, sum);
  return vget_lane_s8(sum, 0);
}
template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a)
{
  int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
  sum = vpadd_s8(sum, sum);
  sum = vpadd_s8(sum, sum);
  sum = vpadd_s8(sum, sum);
  return vget_lane_s8(sum, 0);
}
template<> EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a)
{
  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
  uint8x8_t sum = vpadd_u8(a_dup, a_dup);
  sum = vpadd_u8(sum, sum);
  return vget_lane_u8(sum, 0);
}
template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a)
{
  uint8x8_t sum = vpadd_u8(a,a);
  sum = vpadd_u8(sum, sum);
  sum = vpadd_u8(sum, sum);
  return vget_lane_u8(sum, 0);
}
template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a)
{
  uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
  sum = vpadd_u8(sum, sum);
  sum = vpadd_u8(sum, sum);
  sum = vpadd_u8(sum, sum);
  return vget_lane_u8(sum, 0);
}
template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a)
{
  const int16x4_t sum = vpadd_s16(a,a);
  return vget_lane_s16(vpadd_s16(sum, sum), 0);
}
template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a)
{
  int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
  sum = vpadd_s16(sum, sum);
  sum = vpadd_s16(sum, sum);
  return vget_lane_s16(sum, 0);
}
template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a)
{
  const uint16x4_t sum = vpadd_u16(a,a);
  return vget_lane_u16(vpadd_u16(sum, sum), 0);
}
template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a)
{
  uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
  sum = vpadd_u16(sum, sum);
  sum = vpadd_u16(sum, sum);
  return vget_lane_u16(sum, 0);
}
template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); }
template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
{
  const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
  return vget_lane_s32(vpadd_s32(sum, sum), 0);
}
template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); }
template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
{
  const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
  return vget_lane_u32(vpadd_u32(sum, sum), 0);
}
template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a)
{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); }
template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a)
{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); }
2477
+
2478
// predux_half_dowto4: add the two halves of a packet together, producing a
// packet with half the lane count.  (The "dowto" spelling is the upstream
// Eigen API name and must be kept for compatibility.)  For the scalar-packed
// Packet8c/Packet8uc case the halves are swapped via a 32-bit lane reverse
// and added, then the low 32 bits are extracted as the 4-lane result.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a)
{
  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a,
      vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a)
{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a)
{
  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a,
      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a)
{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a)
{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a)
{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); }
2496
+
2497
// Other reduction functions:
// mul
// predux_mul: horizontal product of all lanes.  The vector cases reduce by
// repeatedly multiplying a register by a lane-reversed copy of itself
// (vrev16/vrev32), halving the number of distinct partial products each
// step, and finish with one scalar multiply of the two remaining lanes.
template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
{
  int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
  prod = vmul_s8(prod, vrev16_s8(prod));
  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
}
template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
{
  int8x8_t prod = vmul_s8(a, vrev16_s8(a));
  prod = vmul_s8(prod, vrev32_s8(prod));
  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
}
template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
{
  uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
  prod = vmul_u8(prod, vrev16_u8(prod));
  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
}
template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
{
  uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
  prod = vmul_u8(prod, vrev32_u8(prod));
  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
}
template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
{
  const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
}
template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a)
{
  int16x4_t prod;

  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
  prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
  prod = vmul_s16(prod, vrev32_s16(prod));
  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
}
template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a)
{
  const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
}
template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a)
{
  uint16x4_t prod;

  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
  prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
  prod = vmul_u16(prod, vrev32_u16(prod));
  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
}
template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); }
2575
+
2576
// min
// predux_min: horizontal minimum of all lanes.  128-bit packets first take
// the element-wise min of the two halves, then pairwise-min (vpmin)
// log2(lanes) times until lane 0 holds the result.  Packet4c/Packet4uc are
// widened to a d-register first; 64-bit-lane packets use scalar std::min.
template<> EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a)
{ return vget_lane_f32(vpmin_f32(a,a), 0); }
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
  return vget_lane_f32(vpmin_f32(min, min), 0);
}
template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a)
{
  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
  int8x8_t min = vpmin_s8(a_dup, a_dup);
  min = vpmin_s8(min, min);
  return vget_lane_s8(min, 0);
}
template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a)
{
  int8x8_t min = vpmin_s8(a,a);
  min = vpmin_s8(min, min);
  min = vpmin_s8(min, min);
  return vget_lane_s8(min, 0);
}
template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a)
{
  int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
  min = vpmin_s8(min, min);
  min = vpmin_s8(min, min);
  min = vpmin_s8(min, min);
  return vget_lane_s8(min, 0);
}
template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a)
{
  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
  uint8x8_t min = vpmin_u8(a_dup, a_dup);
  min = vpmin_u8(min, min);
  return vget_lane_u8(min, 0);
}
template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a)
{
  uint8x8_t min = vpmin_u8(a,a);
  min = vpmin_u8(min, min);
  min = vpmin_u8(min, min);
  return vget_lane_u8(min, 0);
}
template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a)
{
  uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
  min = vpmin_u8(min, min);
  min = vpmin_u8(min, min);
  min = vpmin_u8(min, min);
  return vget_lane_u8(min, 0);
}
template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a)
{
  const int16x4_t min = vpmin_s16(a,a);
  return vget_lane_s16(vpmin_s16(min, min), 0);
}
template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a)
{
  int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
  min = vpmin_s16(min, min);
  min = vpmin_s16(min, min);
  return vget_lane_s16(min, 0);
}
template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a)
{
  const uint16x4_t min = vpmin_u16(a,a);
  return vget_lane_u16(vpmin_u16(min, min), 0);
}
template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a)
{
  uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
  min = vpmin_u16(min, min);
  min = vpmin_u16(min, min);
  return vget_lane_u16(min, 0);
}
template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a)
{ return vget_lane_s32(vpmin_s32(a,a), 0); }
template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
{
  const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
  return vget_lane_s32(vpmin_s32(min, min), 0);
}
template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a)
{ return vget_lane_u32(vpmin_u32(a,a), 0); }
template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
{
  const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
  return vget_lane_u32(vpmin_u32(min, min), 0);
}
// No 64-bit vector min on NEON: compare the two extracted scalars.
template<> EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a)
{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
template<> EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a)
{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
2670
+
2671
// Horizontal maximum reductions (predux_max): mirror of predux_min above,
// using pairwise maxima (vpmax_*) until the overall maximum reaches lane 0.
template<> EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a)
{ return vget_lane_f32(vpmax_f32(a,a), 0); }
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
  return vget_lane_f32(vpmax_f32(max, max), 0);
}
template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a)
{
  // Packet4c is stored in a 32-bit scalar: broadcast it into a NEON
  // register first, then fold pairwise.
  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
  int8x8_t max = vpmax_s8(a_dup, a_dup);
  max = vpmax_s8(max, max);
  return vget_lane_s8(max, 0);
}
template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a)
{
  int8x8_t max = vpmax_s8(a,a);
  max = vpmax_s8(max, max);
  max = vpmax_s8(max, max);
  return vget_lane_s8(max, 0);
}
template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a)
{
  // Fold 128-bit packet to 64 bits, then 3 pairwise folds.
  int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
  max = vpmax_s8(max, max);
  max = vpmax_s8(max, max);
  max = vpmax_s8(max, max);
  return vget_lane_s8(max, 0);
}
template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a)
{
  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
  uint8x8_t max = vpmax_u8(a_dup, a_dup);
  max = vpmax_u8(max, max);
  return vget_lane_u8(max, 0);
}
template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a)
{
  uint8x8_t max = vpmax_u8(a,a);
  max = vpmax_u8(max, max);
  max = vpmax_u8(max, max);
  return vget_lane_u8(max, 0);
}
template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a)
{
  uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
  max = vpmax_u8(max, max);
  max = vpmax_u8(max, max);
  max = vpmax_u8(max, max);
  return vget_lane_u8(max, 0);
}
template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a)
{
  const int16x4_t max = vpmax_s16(a,a);
  return vget_lane_s16(vpmax_s16(max, max), 0);
}
template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a)
{
  int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
  max = vpmax_s16(max, max);
  max = vpmax_s16(max, max);
  return vget_lane_s16(max, 0);
}
template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a)
{
  const uint16x4_t max = vpmax_u16(a,a);
  return vget_lane_u16(vpmax_u16(max, max), 0);
}
template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a)
{
  uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
  max = vpmax_u16(max, max);
  max = vpmax_u16(max, max);
  return vget_lane_u16(max, 0);
}
template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a)
{ return vget_lane_s32(vpmax_s32(a,a), 0); }
template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
{
  const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
  return vget_lane_s32(vpmax_s32(max, max), 0);
}
template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a)
{ return vget_lane_u32(vpmax_u32(a,a), 0); }
template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
{
  const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
  return vget_lane_u32(vpmax_u32(max, max), 0);
}
// No 64-bit NEON max instruction: compare the two lanes in scalar code.
template<> EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a)
{ return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
template<> EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a)
{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }

// Returns true if any lane of x has a non-zero bit pattern. Works on the
// raw bits (vreinterpretq), so it treats the float packet as a mask: OR the
// two halves together, then take the pairwise max so any set bit survives
// into lane 0.
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
{
  uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
                            vget_high_u32(vreinterpretq_u32_f32(x)));
  return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
}

// Helpers for ptranspose.
namespace detail {

// Interleaves the lanes of p1 and p2 in place: after the call p1 holds the
// zipped low lanes (vzip result .val[0]) and p2 the zipped high lanes
// (.val[1]). Specialized below for each NEON vector type; repeated
// applications implement a full packet transpose.
template<typename Packet>
void zip_in_place(Packet& p1, Packet& p2);

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
  const float32x2x2_t tmp = vzip_f32(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
  const float32x4x2_t tmp = vzipq_f32(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
  const int8x8x2_t tmp = vzip_s8(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
  const int8x16x2_t tmp = vzipq_s8(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
  const uint8x8x2_t tmp = vzip_u8(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
  const uint8x16x2_t tmp = vzipq_u8(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
  const int32x2x2_t tmp = vzip_s32(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
  const int32x4x2_t tmp = vzipq_s32(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
  const uint32x2x2_t tmp = vzip_u32(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
  const uint32x4x2_t tmp = vzipq_u32(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
  const int16x4x2_t tmp = vzip_s16(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
  const int16x8x2_t tmp = vzipq_s16(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
  const uint16x4x2_t tmp = vzip_u16(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
  const uint16x8x2_t tmp = vzipq_u16(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}

// Transposes a 2x2 block of packets: a single zip suffices.
template<typename Packet>
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
  zip_in_place(kernel.packet[0], kernel.packet[1]);
}

// Transposes a 4x4 block: two rounds of zips, pairing packets at distance
// 2 then at distance 1 (a log2(4)-stage butterfly network).
template<typename Packet>
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
  zip_in_place(kernel.packet[0], kernel.packet[2]);
  zip_in_place(kernel.packet[1], kernel.packet[3]);
  zip_in_place(kernel.packet[0], kernel.packet[1]);
  zip_in_place(kernel.packet[2], kernel.packet[3]);
}

// Transposes an 8x8 block: three butterfly stages with strides 4, 2, 1.
template<typename Packet>
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
  zip_in_place(kernel.packet[0], kernel.packet[4]);
  zip_in_place(kernel.packet[1], kernel.packet[5]);
  zip_in_place(kernel.packet[2], kernel.packet[6]);
  zip_in_place(kernel.packet[3], kernel.packet[7]);

  zip_in_place(kernel.packet[0], kernel.packet[2]);
  zip_in_place(kernel.packet[1], kernel.packet[3]);
  zip_in_place(kernel.packet[4], kernel.packet[6]);
  zip_in_place(kernel.packet[5], kernel.packet[7]);

  zip_in_place(kernel.packet[0], kernel.packet[1]);
  zip_in_place(kernel.packet[2], kernel.packet[3]);
  zip_in_place(kernel.packet[4], kernel.packet[5]);
  zip_in_place(kernel.packet[6], kernel.packet[7]);
}

// Transposes a 16x16 block: the same butterfly pattern as above, written as
// a loop over the 4 stages (stride n = 8, 4, 2, 1), fully unrolled.
template<typename Packet>
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
  EIGEN_UNROLL_LOOP
  for (int i=0; i<4; ++i) {
    const int m = (1 << i);
    EIGEN_UNROLL_LOOP
    for (int j=0; j<m; ++j) {
      const int n = (1 << (3-i));
      EIGEN_UNROLL_LOOP
      for (int k=0; k<n; ++k) {
        const int idx = 2*j*n+k;
        zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
      }
    }
  }
}

} // namespace detail

// ptranspose overloads: transpose a PacketBlock in place. Most forward to
// the generic butterfly implementation in detail::ptranspose_impl; the
// Packet4c/Packet4uc and 64-bit variants need bespoke code.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
  detail::ptranspose_impl(kernel);
}

// Packet4c packs 4 int8 lanes into one int32 scalar, so the 4x4 transpose
// first materializes the rows into NEON registers (two packets per 64-bit
// register), zips at 8-bit then 16-bit granularity, and extracts the four
// transposed rows back out as int32 lanes.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel)
{
  const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
  const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));

  const int8x8x2_t zip8 = vzip_s8(a,b);
  const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));

  kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
  kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
  kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
  kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
  detail::ptranspose_impl(kernel);
}

// Unsigned counterpart of the Packet4c transpose above.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
{
  const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
  const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));

  const uint8x8x2_t zip8 = vzip_u8(a,b);
  const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));

  kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
  kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
  kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
  kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
  detail::ptranspose_impl(kernel);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
  detail::ptranspose_impl(kernel);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
  detail::ptranspose_impl(kernel);
}

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
  detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
  // 2x2 transpose is a single zip; call the helper directly.
  detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
  detail::ptranspose_impl(kernel);
}

// 64-bit lanes: AArch64 has vzip1q/vzip2q; on 32-bit ARM the halves are
// split out and recombined manually.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet2l, 2>& kernel)
{
#if EIGEN_ARCH_ARM64
  const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
  kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = tmp1;
#else
  const int64x1_t tmp[2][2] = {
    { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
    { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) }
  };

  kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
  kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet2ul, 2>& kernel)
{
#if EIGEN_ARCH_ARM64
  const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
  kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = tmp1;
#else
  const uint64x1_t tmp[2][2] = {
    { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
    { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) }
  };

  kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
  kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
#endif
}

// pselect: lane-wise select via NEON bit-select (vbsl*). For each bit,
// picks a where the mask bit is set and b otherwise; signed/float masks are
// reinterpreted to the unsigned type vbsl expects.
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b)
{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b)
{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b)
{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b)
{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b)
{ return vbsl_u8(mask, a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b)
{ return vbslq_u8(mask, a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b)
{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b)
{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b)
{ return vbsl_u16(mask, a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b)
{ return vbslq_u16(mask, a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b)
{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b)
{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b)
{ return vbsl_u32(mask, a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b)
{ return vbslq_u32(mask, a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b)
{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
{ return vbslq_u64(mask, a, b); }

+ // Use armv8 rounding intinsics if available.
3094
+ #if EIGEN_ARCH_ARMV8
3095
+ template<> EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a)
3096
+ { return vrndn_f32(a); }
3097
+
3098
+ template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
3099
+ { return vrndnq_f32(a); }
3100
+
3101
+ template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
3102
+ { return vrndm_f32(a); }
3103
+
3104
+ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
3105
+ { return vrndmq_f32(a); }
3106
+
3107
+ template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
3108
+ { return vrndp_f32(a); }
3109
+
3110
+ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
3111
+ { return vrndpq_f32(a); }
3112
+
3113
+ #else
3114
+
3115
+ template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
3116
+ // Adds and subtracts signum(a) * 2^23 to force rounding.
3117
+ const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
3118
+ const Packet4f abs_a = pabs(a);
3119
+ Packet4f r = padd(abs_a, limit);
3120
+ // Don't compile-away addition and subtraction.
3121
+ EIGEN_OPTIMIZATION_BARRIER(r);
3122
+ r = psub(r, limit);
3123
+ // If greater than limit, simply return a. Otherwise, account for sign.
3124
+ r = pselect(pcmp_lt(abs_a, limit),
3125
+ pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
3126
+ return r;
3127
+ }
3128
+
3129
+ template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
3130
+ // Adds and subtracts signum(a) * 2^23 to force rounding.
3131
+ const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
3132
+ const Packet2f abs_a = pabs(a);
3133
+ Packet2f r = padd(abs_a, limit);
3134
+ // Don't compile-away addition and subtraction.
3135
+ EIGEN_OPTIMIZATION_BARRIER(r);
3136
+ r = psub(r, limit);
3137
+ // If greater than limit, simply return a. Otherwise, account for sign.
3138
+ r = pselect(pcmp_lt(abs_a, limit),
3139
+ pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
3140
+ return r;
3141
+ }
3142
+
3143
+ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
3144
+ {
3145
+ const Packet4f cst_1 = pset1<Packet4f>(1.0f);
3146
+ Packet4f tmp = print<Packet4f>(a);
3147
+ // If greater, subtract one.
3148
+ Packet4f mask = pcmp_lt(a, tmp);
3149
+ mask = pand(mask, cst_1);
3150
+ return psub(tmp, mask);
3151
+ }
3152
+
3153
+ template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
3154
+ {
3155
+ const Packet2f cst_1 = pset1<Packet2f>(1.0f);
3156
+ Packet2f tmp = print<Packet2f>(a);
3157
+ // If greater, subtract one.
3158
+ Packet2f mask = pcmp_lt(a, tmp);
3159
+ mask = pand(mask, cst_1);
3160
+ return psub(tmp, mask);
3161
+ }
3162
+
3163
+ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
3164
+ {
3165
+ const Packet4f cst_1 = pset1<Packet4f>(1.0f);
3166
+ Packet4f tmp = print<Packet4f>(a);
3167
+ // If smaller, add one.
3168
+ Packet4f mask = pcmp_lt(tmp, a);
3169
+ mask = pand(mask, cst_1);
3170
+ return padd(tmp, mask);
3171
+ }
3172
+
3173
+ template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
3174
+ {
3175
+ const Packet2f cst_1 = pset1<Packet2f>(1.0);
3176
+ Packet2f tmp = print<Packet2f>(a);
3177
+ // If smaller, add one.
3178
+ Packet2f mask = pcmp_lt(tmp, a);
3179
+ mask = pand(mask, cst_1);
3180
+ return padd(tmp, mask);
3181
+ }
3182
+
3183
+ #endif
3184
+
3185
/**
 * Computes the integer square root
 * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result
 * and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument
 * value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf .
 *
 * Each specialization runs the same digit-by-digit loop: `add` starts at the
 * highest candidate result bit (0x8 for 8-bit, 0x80 for 16-bit, 0x8000 for
 * 32-bit lanes) and is shifted right each iteration; a candidate bit is kept
 * (via bit-select) only when candidate^2 <= a. The loop count equals the
 * number of result bits (4, 8, or 16).
 */
template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
  // Packet4uc is packed into a 32-bit scalar: widen into a NEON register,
  // compute, then extract the 4 result bytes back into a scalar.
  uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
  uint8x8_t res = vdup_n_u8(0);
  uint8x8_t add = vdup_n_u8(0x8);
  for (int i = 0; i < 4; i++)
  {
    const uint8x8_t temp = vorr_u8(res, add);
    res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
    add = vshr_n_u8(add, 1);
  }
  return vget_lane_u32(vreinterpret_u32_u8(res), 0);
}
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
  uint8x8_t res = vdup_n_u8(0);
  uint8x8_t add = vdup_n_u8(0x8);
  for (int i = 0; i < 4; i++)
  {
    const uint8x8_t temp = vorr_u8(res, add);
    res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
    add = vshr_n_u8(add, 1);
  }
  return res;
}
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
  uint8x16_t res = vdupq_n_u8(0);
  uint8x16_t add = vdupq_n_u8(0x8);
  for (int i = 0; i < 4; i++)
  {
    const uint8x16_t temp = vorrq_u8(res, add);
    res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
    add = vshrq_n_u8(add, 1);
  }
  return res;
}
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
  uint16x4_t res = vdup_n_u16(0);
  uint16x4_t add = vdup_n_u16(0x80);
  for (int i = 0; i < 8; i++)
  {
    const uint16x4_t temp = vorr_u16(res, add);
    res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
    add = vshr_n_u16(add, 1);
  }
  return res;
}
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
  uint16x8_t res = vdupq_n_u16(0);
  uint16x8_t add = vdupq_n_u16(0x80);
  for (int i = 0; i < 8; i++)
  {
    const uint16x8_t temp = vorrq_u16(res, add);
    res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
    add = vshrq_n_u16(add, 1);
  }
  return res;
}
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
  uint32x2_t res = vdup_n_u32(0);
  uint32x2_t add = vdup_n_u32(0x8000);
  for (int i = 0; i < 16; i++)
  {
    const uint32x2_t temp = vorr_u32(res, add);
    res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
    add = vshr_n_u32(add, 1);
  }
  return res;
}
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
  uint32x4_t res = vdupq_n_u32(0);
  uint32x4_t add = vdupq_n_u32(0x8000);
  for (int i = 0; i < 16; i++)
  {
    const uint32x4_t temp = vorrq_u32(res, add);
    res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
    add = vshrq_n_u32(add, 1);
  }
  return res;
}

// Reciprocal square root: hardware estimate (vrsqrte) refined with two
// Newton-Raphson steps (vrsqrts), with 1/sqrt(0) patched to +infinity.
template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
  // Compute approximate reciprocal sqrt.
  Packet4f x = vrsqrteq_f32(a);
  // Do Newton iterations for 1/sqrt(x).
  x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
  x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
  const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
  // Zero input would otherwise poison the iteration; force +inf there.
  return pselect(pcmp_eq(a, pzero(a)), infinity, x);
}

template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
  // Compute approximate reciprocal sqrt.
  Packet2f x = vrsqrte_f32(a);
  // Do Newton iterations for 1/sqrt(x).
  x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
  x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
  const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
  return pselect(pcmp_eq(a, pzero(a)), infinity, x);
}

// Unfortunately vsqrt_f32 is only available for A64.
#if EIGEN_ARCH_ARM64
template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);}
template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); }
#else
// 32-bit ARM fallback: sqrt(a) = a * rsqrt(a), returning a unchanged for
// 0 and +inf where the rsqrt-based product would produce NaN.
template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
  const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
  const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
  return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
}
template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
  const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
  const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
  return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
}
#endif

//---------- bfloat16 ----------
// TODO: Add support for native armv8.6-a bfloat16_t

// TODO: Guard if we have native bfloat16 support
// bfloat16 packet: 4 raw 16-bit bf16 payloads carried in a uint16x4_t.
// Arithmetic is implemented by widening to Packet4f and narrowing back
// (see the Bf16ToF32/F32ToBf16 helpers further down).
typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;

template<> struct is_arithmetic<Packet4bf> { enum { value = true }; };

// Capabilities advertised for the bfloat16 packet type; operations flagged
// here are dispatched to the Packet4bf specializations in this file.
template<> struct packet_traits<bfloat16> : default_packet_traits
{
  typedef Packet4bf type;
  typedef Packet4bf half;
  enum
  {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket = 0,

    HasCmp = 1,
    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 1,
    HasAbsDiff = 1,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasBlend = 0,
    HasDiv = 1,
    HasFloor = 1,
    HasCeil = 1,
    HasRint = 1,

    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 0,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasBessel = 0,  // Issues with accuracy.
    HasNdtri = 0
  };
};

// Static metadata about Packet4bf (scalar type, lane count, alignment).
template<> struct unpacket_traits<Packet4bf>
{
  typedef bfloat16 type;
  typedef Packet4bf half;
  enum
  {
    size = 4,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

namespace detail {
// zip_in_place specialization for the bfloat16 packet: Packet4bf wraps a
// uint16x4_t, so the 16-bit unsigned zip applies directly.
template<>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
  const uint16x4x2_t tmp = vzip_u16(p1, p2);
  p1 = tmp.val[0];
  p2 = tmp.val[1];
}
} // namespace detail

// Converts 4 floats to 4 bfloat16 values with round-to-nearest-even,
// mapping any float NaN to the canonical bf16 NaN (0x7fc0).
EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
{
  // See the scalar implemention in BFloat16.h for a comprehensible explanation
  // of this fast rounding algorithm
  Packet4ui input = reinterpret_cast<Packet4ui>(p);

  // lsb = (input >> 16) & 1
  Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));

  // rounding_bias = 0x7fff + lsb
  Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));

  // input += rounding_bias
  input = vaddq_u32(input, rounding_bias);

  // input = input >> 16
  input = vshrq_n_u32(input, 16);

  // Replace float-nans by bfloat16-nans, that is 0x7fc0
  // (vceqq_f32(p, p) is false exactly for NaN lanes).
  const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
  const Packet4ui mask = vceqq_f32(p, p);
  input = vbslq_u32(mask, input, bf16_nan);

  // output = static_cast<uint16_t>(input)
  return vmovn_u32(input);
}

// Widens 4 bfloat16 values to 4 floats: a bf16 payload is exactly the top
// 16 bits of the equivalent float, so widen and shift left by 16.
EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
{
  return reinterpret_cast<Packet4f>(vshlq_n_u32(vmovl_u16(p), 16));
}

// Narrows a float comparison mask (all-ones / all-zeros lanes) to a
// 16-bit bf16 mask by truncating each lane.
EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
  return vmovn_u32(vreinterpretq_u32_f32(p));
}

// bfloat16 broadcast/load/store primitives. Packet4bf shares its storage
// layout with Packet4us, so these delegate to the uint16 versions with the
// bfloat16 pointers reinterpreted as uint16_t pointers.
template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
  return pset1<Packet4us>(from.value);
}

template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
  // Rebuild a bfloat16 from the raw 16-bit payload of lane 0.
  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(from)));
}

template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
{
  return pload<Packet4us>(reinterpret_cast<const uint16_t*>(from));
}

template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
{
  return ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from));
}

template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
{
  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
}

template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
}

template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
{
  return ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from));
}

+ template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
3457
+ return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
3458
+ }
3459
+
3460
+ template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf &a,
3461
+ const Packet4bf &b)
3462
+ {
3463
+ return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3464
+ }
3465
+ template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf &a,
3466
+ const Packet4bf &b)
3467
+ {
3468
+ return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3469
+ }
3470
+
3471
+ template <> EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf &a,
3472
+ const Packet4bf &b)
3473
+ {
3474
+ return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3475
+ }
3476
+
3477
+ template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf &a,
3478
+ const Packet4bf &b)
3479
+ {
3480
+ return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3481
+ }
3482
+ template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf &a,
3483
+ const Packet4bf &b)
3484
+ {
3485
+ return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3486
+ }
3487
+
3488
+ template <> EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf &a,
3489
+ const Packet4bf &b)
3490
+ {
3491
+ return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3492
+ }
3493
+
3494
+ template<> EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a)
3495
+ {
3496
+ return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
3497
+ }
3498
+
3499
+ template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
3500
+ return por<Packet4us>(a, b);
3501
+ }
3502
+
3503
+ template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
3504
+ return pxor<Packet4us>(a, b);
3505
+ }
3506
+
3507
+ template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
3508
+ return pand<Packet4us>(a, b);
3509
+ }
3510
+
3511
+ template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
3512
+ return pandnot<Packet4us>(a, b);
3513
+ }
3514
+
3515
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
3516
+ const Packet4bf& b)
3517
+ {
3518
+ return pselect<Packet4us>(mask, a, b);
3519
+ }
3520
+
3521
+ template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
3522
+ {
3523
+ return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
3524
+ }
3525
+
3526
+ template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a)
3527
+ {
3528
+ return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
3529
+ }
3530
+
3531
+ template<> EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a)
3532
+ {
3533
+ return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
3534
+ }
3535
+
3536
+ template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; }
3537
+
3538
+ template<> EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
3539
+ return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3540
+ }
3541
+
3542
+ template<> EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
3543
+ return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3544
+ }
3545
+
3546
+ template<> EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
3547
+ return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3548
+ }
3549
+
3550
+ template<> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
3551
+ return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3552
+ }
3553
+
3554
+ template<>
3555
+ EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
3556
+ {
3557
+ return pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride);
3558
+ }
3559
+
3560
+ template<>
3561
+ EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
3562
+ {
3563
+ pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), from, stride);
3564
+ }
3565
+
3566
+ template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
3567
+ {
3568
+ return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
3569
+ }
3570
+
3571
+ template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a)
3572
+ {
3573
+ return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
3574
+ }
3575
+
3576
+ template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a)
3577
+ {
3578
+ return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
3579
+ }
3580
+
3581
+ template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a)
3582
+ {
3583
+ return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
3584
+ }
3585
+
3586
+ template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
3587
+ {
3588
+ return preverse<Packet4us>(a);
3589
+ }
3590
+
3591
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
3592
+ {
3593
+ detail::ptranspose_impl(kernel);
3594
+ }
3595
+
3596
+ template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
3597
+ {
3598
+ return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3599
+ }
3600
+
3601
+ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
3602
+ {
3603
+ return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3604
+ }
3605
+
3606
+ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
3607
+ {
3608
+ return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3609
+ }
3610
+
3611
+ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
3612
+ {
3613
+ return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3614
+ }
3615
+
3616
+ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
3617
+ {
3618
+ return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3619
+ }
3620
+
3621
+ template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
3622
+ {
3623
+ return pxor<Packet4us>(a, pset1<Packet4us>(static_cast<uint16_t>(0x8000)));
3624
+ }
3625
+
3626
//---------- double ----------

// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
// Confirmed at least with __apple_build_version__ = 6000054.
#ifdef __apple_build_version__
// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
// major toolchain updates.
#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
#else
#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
#endif

#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG

// Bug 907: workaround missing declarations of the following two functions in the ADK
// Defining these functions as templates ensures that if these intrinsics are
// already defined in arm_neon.h, then our workaround doesn't cause a conflict
// and has lower priority in overload resolution.
template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }

template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }

typedef float64x2_t Packet2d;
typedef float64x1_t Packet1d;

// functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
// for fast inversion of matrices of size 4.
// Result: { m[mask & 1], n[(mask >> 1) & 1] }.
EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
{
  const double* a = reinterpret_cast<const double*>(&m);
  const double* b = reinterpret_cast<const double*>(&n);
  Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
  return res;
}

EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask)
{
  return shuffle(a, b, mask);
}
// mask 0 selects the low lane of each operand: { a[0], b[0] }.
EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b)
{
  return shuffle(a, b, 0);
}
// mask 3 selects the high lane of each operand: { a[1], b[1] }.
EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
{
  return shuffle(a, b, 3);
}
// Broadcast lane p of a to both lanes.
#define vec2d_duplane(a, p) \
  vdupq_laneq_f64(a, p)
3677
+
3678
// Advertises which packet operations are vectorized for double on ARM64 NEON.
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet2d type;
  typedef Packet2d half;  // no genuine half packet: falls back to the full one
  enum
  {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket = 0,

    HasCmp = 1,
    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 1,
    HasAbsDiff = 1,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasBlend = 0,

    HasDiv = 1,
    HasFloor = 1,
    HasCeil = 1,
    HasRint = 1,

    HasSin = 0,
    HasCos = 0,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh = 0,
    HasErf = 0
  };
};
3720
+
3721
// Reverse mapping: properties of the Packet2d type itself.
template<> struct unpacket_traits<Packet2d>
{
  typedef double type;
  typedef Packet2d half;
  typedef Packet2l integer_packet;  // same-width integer packet for bit tricks
  enum
  {
    size = 2,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
3734
+ };
3735
+
3736
+ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
3737
+
3738
+ template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
3739
+ {
3740
+ const double c[] = {0.0,1.0};
3741
+ return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
3742
+ }
3743
+
3744
+ template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
3745
+
3746
+ template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
3747
+
3748
+ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
3749
+ template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
3750
+ const Packet2d mask = {numext::bit_cast<double>(0x8000000000000000ull),0.0};
3751
+ return padd(a, pxor(mask, b));
3752
+ }
3753
+
3754
+ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
3755
+
3756
+ template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
3757
+
3758
+ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
3759
+
3760
+ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
3761
+
3762
+ #ifdef __ARM_FEATURE_FMA
3763
+ // See bug 936. See above comment about FMA for float.
3764
+ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
3765
+ { return vfmaq_f64(c,a,b); }
3766
+ #else
3767
+ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
3768
+ { return vmlaq_f64(c,a,b); }
3769
+ #endif
3770
+
3771
+ template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
3772
+
3773
+ #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
3774
+ // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
3775
+ template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); }
3776
+ template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); }
3777
+
3778
+ #endif
3779
+
3780
+ template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmin<Packet2d>(a, b); }
3781
+
3782
+ template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
3783
+
3784
+
3785
+ template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmax<Packet2d>(a, b); }
3786
+
3787
+ // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
3788
+ template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
3789
+ { return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3790
+
3791
+ template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
3792
+ { return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3793
+
3794
+ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
3795
+ { return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3796
+
3797
+ template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
3798
+ { return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3799
+
3800
+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b)
3801
+ { return vreinterpretq_f64_u64(vcleq_f64(a,b)); }
3802
+
3803
+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b)
3804
+ { return vreinterpretq_f64_u64(vcltq_f64(a,b)); }
3805
+
3806
+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b)
3807
+ { return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); }
3808
+
3809
+ template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b)
3810
+ { return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
3811
+
3812
+ template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
3813
+ { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
3814
+
3815
+ template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
3816
+ { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
3817
+
3818
+ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { return vld1q_dup_f64(from); }
3819
+ template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
3820
+ { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); }
3821
+
3822
+ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
3823
+ { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); }
3824
+
3825
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
3826
+ {
3827
+ Packet2d res = pset1<Packet2d>(0.0);
3828
+ res = vld1q_lane_f64(from + 0*stride, res, 0);
3829
+ res = vld1q_lane_f64(from + 1*stride, res, 1);
3830
+ return res;
3831
+ }
3832
+
3833
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
3834
+ {
3835
+ vst1q_lane_f64(to + stride*0, from, 0);
3836
+ vst1q_lane_f64(to + stride*1, from, 1);
3837
+ }
3838
+
3839
+ template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
3840
+
3841
+ // FIXME only store the 2 first elements ?
3842
+ template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a,0); }
3843
+
3844
+ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
3845
+ { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
3846
+
3847
+ template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
3848
+
3849
+ #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
3850
+ // workaround ICE, see bug 907
3851
+ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
3852
+ { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
3853
+ #else
3854
+ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
3855
+ { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
3856
+ #endif
3857
+
3858
+ // Other reduction functions:
3859
+ // mul
3860
+ #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
3861
+ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
3862
+ { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
3863
+ #else
3864
+ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
3865
+ { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
3866
+ #endif
3867
+
3868
+ // min
3869
+ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
3870
+ { return vgetq_lane_f64(vpminq_f64(a,a), 0); }
3871
+
3872
+ // max
3873
+ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
3874
+ { return vgetq_lane_f64(vpmaxq_f64(a,a), 0); }
3875
+
3876
+
3877
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
3878
+ ptranspose(PacketBlock<Packet2d, 2>& kernel)
3879
+ {
3880
+ const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
3881
+ const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
3882
+
3883
+ kernel.packet[0] = tmp1;
3884
+ kernel.packet[1] = tmp2;
3885
+ }
3886
+
3887
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
3888
+ { return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
3889
+
3890
+ template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
3891
+ { return vrndnq_f64(a); }
3892
+
3893
+ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
3894
+ { return vrndmq_f64(a); }
3895
+
3896
+ template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
3897
+ { return vrndpq_f64(a); }
3898
+
3899
+ template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
3900
+ { return pldexp_generic(a, exponent); }
3901
+
3902
+ template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent)
3903
+ { return pfrexp_generic(a,exponent); }
3904
+
3905
+ template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from)
3906
+ { return vreinterpretq_f64_u64(vdupq_n_u64(from)); }
3907
+
3908
+ template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
3909
+ // Compute approximate reciprocal sqrt.
3910
+ Packet2d x = vrsqrteq_f64(a);
3911
+ // Do Newton iterations for 1/sqrt(x).
3912
+ x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
3913
+ x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
3914
+ x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
3915
+ const Packet2d infinity = pset1<Packet2d>(NumTraits<double>::infinity());
3916
+ return pselect(pcmp_eq(a, pzero(a)), infinity, x);
3917
+ }
3918
+
3919
+ template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
3920
+
3921
+ #endif // EIGEN_ARCH_ARM64
3922
+
3923
// Do we have fp16 types and supporting Neon intrinsics?
#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
typedef float16x4_t Packet4hf;
typedef float16x8_t Packet8hf;

// Advertises which packet operations are vectorized for Eigen::half with
// native ARMv8.2 fp16 arithmetic; the half packet is the 64-bit Packet4hf.
template <>
struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet8hf type;
  typedef Packet4hf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,
    HasHalfPacket = 1,

    HasCmp = 1,
    HasCast = 1,
    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 1,
    HasAbsDiff = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasBlend = 0,
    HasInsert = 1,
    HasReduxp = 1,
    HasDiv = 1,
    HasFloor = 1,
    HasCeil = 1,
    HasRint = 1,
    HasSin = 0,
    HasCos = 0,
    HasLog = 0,
    HasExp = 0,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasErf = EIGEN_FAST_MATH,
    HasBessel = 0,  // Issues with accuracy.
    HasNdtri = 0
  };
};
3971
+
3972
// Properties of the 4-lane fp16 packet.
template <>
struct unpacket_traits<Packet4hf> {
  typedef Eigen::half type;
  typedef Packet4hf half;
  enum {
    size = 4,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

// Properties of the 8-lane fp16 packet; its half packet is Packet4hf.
template <>
struct unpacket_traits<Packet8hf> {
  typedef Eigen::half type;
  typedef Packet4hf half;
  enum {
    size = 8,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
3997
+
3998
+ template<>
3999
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
4000
+ return vadd_f16(vget_low_f16(a), vget_high_f16(a));
4001
+ }
4002
+
4003
+ template <>
4004
+ EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
4005
+ return vdupq_n_f16(from.x);
4006
+ }
4007
+
4008
+ template <>
4009
+ EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(const Eigen::half& from) {
4010
+ return vdup_n_f16(from.x);
4011
+ }
4012
+
4013
+ template <>
4014
+ EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(const Eigen::half& a) {
4015
+ const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
4016
+ Packet8hf countdown = vld1q_f16(f);
4017
+ return vaddq_f16(pset1<Packet8hf>(a), countdown);
4018
+ }
4019
+
4020
+ template <>
4021
+ EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(const Eigen::half& a) {
4022
+ const float16_t f[] = {0, 1, 2, 3};
4023
+ Packet4hf countdown = vld1_f16(f);
4024
+ return vadd_f16(pset1<Packet4hf>(a), countdown);
4025
+ }
4026
+
4027
+ template <>
4028
+ EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4029
+ return vaddq_f16(a, b);
4030
+ }
4031
+
4032
+ template <>
4033
+ EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4034
+ return vadd_f16(a, b);
4035
+ }
4036
+
4037
+ template <>
4038
+ EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4039
+ return vsubq_f16(a, b);
4040
+ }
4041
+
4042
+ template <>
4043
+ EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4044
+ return vsub_f16(a, b);
4045
+ }
4046
+
4047
+ template <>
4048
+ EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) {
4049
+ return vnegq_f16(a);
4050
+ }
4051
+
4052
+ template <>
4053
+ EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) {
4054
+ return vneg_f16(a);
4055
+ }
4056
+
4057
+ template <>
4058
+ EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) {
4059
+ return a;
4060
+ }
4061
+
4062
+ template <>
4063
+ EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) {
4064
+ return a;
4065
+ }
4066
+
4067
+ template <>
4068
+ EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4069
+ return vmulq_f16(a, b);
4070
+ }
4071
+
4072
+ template <>
4073
+ EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4074
+ return vmul_f16(a, b);
4075
+ }
4076
+
4077
+ template <>
4078
+ EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4079
+ return vdivq_f16(a, b);
4080
+ }
4081
+
4082
+ template <>
4083
+ EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4084
+ return vdiv_f16(a, b);
4085
+ }
4086
+
4087
+ template <>
4088
+ EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
4089
+ return vfmaq_f16(c, a, b);
4090
+ }
4091
+
4092
+ template <>
4093
+ EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
4094
+ return vfma_f16(c, a, b);
4095
+ }
4096
+
4097
+ template <>
4098
+ EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4099
+ return vminq_f16(a, b);
4100
+ }
4101
+
4102
+ template <>
4103
+ EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4104
+ return vmin_f16(a, b);
4105
+ }
4106
+
4107
+ #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
4108
+ // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
4109
+ template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); }
4110
+ template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); }
4111
+ #endif
4112
+
4113
+ template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmin<Packet4hf>(a, b); }
4114
+
4115
+ template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmin<Packet8hf>(a, b); }
4116
+
4117
+ template <>
4118
+ EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4119
+ return vmaxq_f16(a, b);
4120
+ }
4121
+
4122
+ template <>
4123
+ EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4124
+ return vmax_f16(a, b);
4125
+ }
4126
+
4127
+ #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
4128
+ // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
4129
+ template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); }
4130
+ template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); }
4131
+ #endif
4132
+
4133
+ template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmax<Packet4hf>(a, b); }
4134
+
4135
+ template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmax<Packet8hf>(a, b); }
4136
+
4137
+ #define EIGEN_MAKE_ARM_FP16_CMP_8(name) \
4138
+ template <> \
4139
+ EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
4140
+ return vreinterpretq_f16_u16(vc##name##q_f16(a, b)); \
4141
+ }
4142
+
4143
+ #define EIGEN_MAKE_ARM_FP16_CMP_4(name) \
4144
+ template <> \
4145
+ EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
4146
+ return vreinterpret_f16_u16(vc##name##_f16(a, b)); \
4147
+ }
4148
+
4149
+ EIGEN_MAKE_ARM_FP16_CMP_8(eq)
4150
+ EIGEN_MAKE_ARM_FP16_CMP_8(lt)
4151
+ EIGEN_MAKE_ARM_FP16_CMP_8(le)
4152
+
4153
+ EIGEN_MAKE_ARM_FP16_CMP_4(eq)
4154
+ EIGEN_MAKE_ARM_FP16_CMP_4(lt)
4155
+ EIGEN_MAKE_ARM_FP16_CMP_4(le)
4156
+
4157
+ #undef EIGEN_MAKE_ARM_FP16_CMP_8
4158
+ #undef EIGEN_MAKE_ARM_FP16_CMP_4
4159
+
4160
+ template <>
4161
+ EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4162
+ return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
4163
+ }
4164
+
4165
+ template <>
4166
+ EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4167
+ return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
4168
+ }
4169
+
4170
+ template <>
4171
+ EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a)
4172
+ { return vrndnq_f16(a); }
4173
+
4174
+ template <>
4175
+ EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a)
4176
+ { return vrndn_f16(a); }
4177
+
4178
+ template <>
4179
+ EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a)
4180
+ { return vrndmq_f16(a); }
4181
+
4182
+ template <>
4183
+ EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a)
4184
+ { return vrndm_f16(a); }
4185
+
4186
+ template <>
4187
+ EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a)
4188
+ { return vrndpq_f16(a); }
4189
+
4190
+ template <>
4191
+ EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a)
4192
+ { return vrndp_f16(a); }
4193
+
4194
+ template <>
4195
+ EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
4196
+ return vsqrtq_f16(a);
4197
+ }
4198
+
4199
+ template <>
4200
+ EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(const Packet4hf& a) {
4201
+ return vsqrt_f16(a);
4202
+ }
4203
+
// Bitwise logical ops for fp16 packets.  NEON has no logical
// instructions on float16 lanes, so each operand is reinterpreted as
// u16 lanes, combined, and reinterpreted back; the bit patterns are
// passed through untouched.

template <>
EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
  return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
}

template <>
EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
  return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
}

template <>
EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
  return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
}

template <>
EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
  return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
}

template <>
EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
  return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
}

template <>
EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
  return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
}

// pandnot computes a & ~b, which maps onto the BIC (bit clear)
// instruction; note the operand order of vbic matches that convention.
template <>
EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
  return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
}

template <>
EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
  return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
}

// Packet loads from Eigen::half arrays.  The pointer is reinterpreted
// as float16_t, which shares Eigen::half's 16-bit storage.  NEON's
// vld1 has no alignment precondition, so the aligned and unaligned
// variants emit the same instruction; only the debug macro differs.

template <>
EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
}

template <>
EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
}

template <>
EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(const Eigen::half* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
}

template <>
EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(const Eigen::half* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
}

// Load four halves and duplicate each into an adjacent lane pair:
// result = { a, a, b, b, c, c, d, d }.  Lanes are written with the
// GCC/Clang vector-extension subscript operator on the NEON type.
template <>
EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(const Eigen::half* from) {
  Packet8hf packet;
  packet[0] = from[0].x;
  packet[1] = from[0].x;
  packet[2] = from[1].x;
  packet[3] = from[1].x;
  packet[4] = from[2].x;
  packet[5] = from[2].x;
  packet[6] = from[3].x;
  packet[7] = from[3].x;
  return packet;
}

4278
+ template <>
4279
+ EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(const Eigen::half* from) {
4280
+ float16x4_t packet;
4281
+ float16_t* tmp;
4282
+ tmp = (float16_t*)&packet;
4283
+ tmp[0] = from[0].x;
4284
+ tmp[1] = from[0].x;
4285
+ tmp[2] = from[1].x;
4286
+ tmp[3] = from[1].x;
4287
+ return packet;
4288
+ }
4289
+
// Load two halves and broadcast each across four lanes:
// result = { a, a, a, a, b, b, b, b }.  Each half is duplicated with
// vld1_dup into a 4-lane vector, and the two halves are concatenated.
template <>
EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
  Packet4hf lo, hi;
  lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
  hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from+1));
  return vcombine_f16(lo, hi);
}

// Replace the first lane of the packet with scalar b.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); }

// Lane-wise blend via the BSL (bitwise select) instruction: each result
// bit comes from `a` where the corresponding mask bit is set, from `b`
// where it is clear.  Callers are expected to pass masks whose lanes
// are all-ones or all-zeros.
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
  return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
  return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
}

// Replace the last lane (index 7 resp. 3) of the packet with scalar b.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); }

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); }

// Packet stores to Eigen::half arrays.  As with the loads, vst1 has no
// alignment precondition, so aligned and unaligned variants differ
// only in the debug macro.  The two pstore overloads are disambiguated
// by the packet argument type (Packet8hf vs Packet4hf).

template <>
EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
  EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
}

template <>
EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
  EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
  EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
}

// Strided gather: lane i is loaded from from[i * stride].  The pset1(0)
// only provides a fully initialized packet for the subsequent lane
// inserts; every lane is overwritten before the result is returned.
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
  Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
  res = vsetq_lane_f16(from[0 * stride].x, res, 0);
  res = vsetq_lane_f16(from[1 * stride].x, res, 1);
  res = vsetq_lane_f16(from[2 * stride].x, res, 2);
  res = vsetq_lane_f16(from[3 * stride].x, res, 3);
  res = vsetq_lane_f16(from[4 * stride].x, res, 4);
  res = vsetq_lane_f16(from[5 * stride].x, res, 5);
  res = vsetq_lane_f16(from[6 * stride].x, res, 6);
  res = vsetq_lane_f16(from[7 * stride].x, res, 7);
  return res;
}

template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
  Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
  res = vset_lane_f16(from[0 * stride].x, res, 0);
  res = vset_lane_f16(from[1 * stride].x, res, 1);
  res = vset_lane_f16(from[2 * stride].x, res, 2);
  res = vset_lane_f16(from[3 * stride].x, res, 3);
  return res;
}

// Strided scatter: lane i of the packet is written to to[stride * i],
// one lane-extract + scalar store per element.
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from, Index stride) {
  to[stride * 0].x = vgetq_lane_f16(from, 0);
  to[stride * 1].x = vgetq_lane_f16(from, 1);
  to[stride * 2].x = vgetq_lane_f16(from, 2);
  to[stride * 3].x = vgetq_lane_f16(from, 3);
  to[stride * 4].x = vgetq_lane_f16(from, 4);
  to[stride * 5].x = vgetq_lane_f16(from, 5);
  to[stride * 6].x = vgetq_lane_f16(from, 6);
  to[stride * 7].x = vgetq_lane_f16(from, 7);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from, Index stride) {
  to[stride * 0].x = vget_lane_f16(from, 0);
  to[stride * 1].x = vget_lane_f16(from, 1);
  to[stride * 2].x = vget_lane_f16(from, 2);
  to[stride * 3].x = vget_lane_f16(from, 3);
}

// Hint the hardware prefetcher at an Eigen::half address; expands to
// the platform's prefetch instruction (or to nothing) via the macro.
template <>
EIGEN_STRONG_INLINE void prefetch<Eigen::half>(const Eigen::half* addr) {
  EIGEN_ARM_PREFETCH(addr);
}

4385
+ template <>
4386
+ EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(const Packet8hf& a) {
4387
+ float16_t x[8];
4388
+ vst1q_f16(x, a);
4389
+ Eigen::half h;
4390
+ h.x = x[0];
4391
+ return h;
4392
+ }
4393
+
4394
+ template <>
4395
+ EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(const Packet4hf& a) {
4396
+ float16_t x[4];
4397
+ vst1_f16(x, a);
4398
+ Eigen::half h;
4399
+ h.x = x[0];
4400
+ return h;
4401
+ }
4402
+
// Reverse the lane order of the packet.  vrev64 reverses lanes within
// each 64-bit half; for the 8-lane packet the two reversed halves are
// then swapped with vget_low/vget_high + vcombine, while the 4-lane
// packet fits in one 64-bit register and needs vrev64 alone.
template<> EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
  float16x4_t a_lo, a_hi;
  Packet8hf a_r64;

  a_r64 = vrev64q_f16(a);
  a_lo = vget_low_f16(a_r64);
  a_hi = vget_high_f16(a_r64);
  return vcombine_f16(a_hi, a_lo);
}

template <>
EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(const Packet4hf& a) {
  return vrev64_f16(a);
}

// Element-wise absolute value, mapped onto the fp16 FABS vector
// instructions.

template <>
EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(const Packet8hf& a) {
  return vabsq_f16(a);
}

template <>
EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
  return vabs_f16(a);
}

// Horizontal sum of all lanes.  Each vpadd_f16 adds adjacent lane
// pairs, so log2(lanes) rounds leave the total in lane 0, which is
// then extracted into a scalar Eigen::half.
template <>
EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
  float16x4_t a_lo, a_hi, sum;

  // First pairwise add folds the 8 lanes down to 4 partial sums.
  a_lo = vget_low_f16(a);
  a_hi = vget_high_f16(a);
  sum = vpadd_f16(a_lo, a_hi);
  sum = vpadd_f16(sum, sum);
  sum = vpadd_f16(sum, sum);

  Eigen::half h;
  h.x = vget_lane_f16(sum, 0);
  return h;
}

template <>
EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(const Packet4hf& a) {
  float16x4_t sum;

  sum = vpadd_f16(a, a);
  sum = vpadd_f16(sum, sum);
  Eigen::half h;
  h.x = vget_lane_f16(sum, 0);
  return h;
}

// Horizontal product of all lanes.  The lanes are folded with
// element-wise multiplies against permuted copies; the last two
// partial products are combined with the scalar fp16 multiply
// vmulh_f16.
template <>
EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(const Packet8hf& a) {
  float16x4_t a_lo, a_hi, prod;

  // Fold 8 lanes to 4 (low*high), then 4 to 2 via a pair-swapped copy.
  a_lo = vget_low_f16(a);
  a_hi = vget_high_f16(a);
  prod = vmul_f16(a_lo, a_hi);
  prod = vmul_f16(prod, vrev64_f16(prod));

  Eigen::half h;
  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
  return h;
}

template <>
EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(const Packet4hf& a) {
  float16x4_t prod;
  // Multiplying by the lane-reversed copy leaves the product of all
  // four lanes split across lanes 0 and 1.
  prod = vmul_f16(a, vrev64_f16(a));
  Eigen::half h;
  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
  return h;
}

// Horizontal minimum of all lanes via pairwise-min folding (vpmin);
// after log2(lanes) rounds the minimum sits in lane 0.
template <>
EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(const Packet8hf& a) {
  float16x4_t a_lo, a_hi, min;

  a_lo = vget_low_f16(a);
  a_hi = vget_high_f16(a);
  min = vpmin_f16(a_lo, a_hi);
  min = vpmin_f16(min, min);
  min = vpmin_f16(min, min);

  Eigen::half h;
  h.x = vget_lane_f16(min, 0);
  return h;
}

template <>
EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(const Packet4hf& a) {
  Packet4hf tmp;
  tmp = vpmin_f16(a, a);
  tmp = vpmin_f16(tmp, tmp);
  Eigen::half h;
  h.x = vget_lane_f16(tmp, 0);
  return h;
}

// Horizontal maximum of all lanes via pairwise-max folding (vpmax);
// mirror image of predux_min above.
template <>
EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(const Packet8hf& a) {
  float16x4_t a_lo, a_hi, max;

  a_lo = vget_low_f16(a);
  a_hi = vget_high_f16(a);
  max = vpmax_f16(a_lo, a_hi);
  max = vpmax_f16(max, max);
  max = vpmax_f16(max, max);

  Eigen::half h;
  h.x = vget_lane_f16(max, 0);
  return h;
}

template <>
EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
  Packet4hf tmp;
  tmp = vpmax_f16(a, a);
  tmp = vpmax_f16(tmp, tmp);
  Eigen::half h;
  h.x = vget_lane_f16(tmp, 0);
  return h;
}

// In-register transpose of a PacketBlock of four 8-half packets,
// done with two rounds of interleaves: first vzipq at half (16-bit)
// granularity, then vzipq at pair-of-halves granularity by
// reinterpreting the vectors as float32 lanes.  The bit patterns pass
// through the f32 reinterpret untouched.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
{
  const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
  const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);

  const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
  const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));

  kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
  kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
  kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
  kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
}

// 4x4 transpose of four 4-half packets.  The kernel is re-read through
// a float16_t pointer with vld4_f16, whose de-interleaving load is
// exactly a 4x4 transpose.  NOTE(review): this relies on the four
// packets of PacketBlock being laid out contiguously in memory —
// confirm against PacketBlock's definition if it ever changes.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
  EIGEN_ALIGN16 float16x4x4_t tmp_x4;
  float16_t* tmp = (float16_t*)&kernel;
  tmp_x4 = vld4_f16(tmp);

  kernel.packet[0] = tmp_x4.val[0];
  kernel.packet[1] = tmp_x4.val[1];
  kernel.packet[2] = tmp_x4.val[2];
  kernel.packet[3] = tmp_x4.val[3];
}

// 8x8 in-register transpose of eight 8-half packets via three rounds
// of vuzpq_f16 (each round de-interleaves at a doubled period); the
// permutation the unzips introduce is undone by the order of the final
// assignments, leaving kernel.packet in transposed order.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
  float16x8x2_t T_1[4];

  T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
  T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
  T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
  T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);

  float16x8x2_t T_2[4];
  T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
  T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
  T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
  T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);

  float16x8x2_t T_3[4];
  T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
  T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
  T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
  T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);

  // Reassemble in the order that inverts the unzip permutation.
  kernel.packet[0] = T_3[0].val[0];
  kernel.packet[1] = T_3[2].val[0];
  kernel.packet[2] = T_3[1].val[0];
  kernel.packet[3] = T_3[3].val[0];
  kernel.packet[4] = T_3[0].val[1];
  kernel.packet[5] = T_3[2].val[1];
  kernel.packet[6] = T_3[1].val[1];
  kernel.packet[7] = T_3[3].val[1];
}

4581
+ #endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
4582
+
4583
+ } // end namespace internal
4584
+
4585
+ } // end namespace Eigen
4586
+
4587
+ #endif // EIGEN_PACKET_MATH_NEON_H