tomoto 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (369) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/README.md +8 -10
  4. data/ext/tomoto/ct.cpp +11 -11
  5. data/ext/tomoto/dmr.cpp +14 -13
  6. data/ext/tomoto/dt.cpp +14 -14
  7. data/ext/tomoto/extconf.rb +7 -5
  8. data/ext/tomoto/gdmr.cpp +7 -7
  9. data/ext/tomoto/hdp.cpp +9 -9
  10. data/ext/tomoto/hlda.cpp +13 -13
  11. data/ext/tomoto/hpa.cpp +5 -5
  12. data/ext/tomoto/lda.cpp +42 -39
  13. data/ext/tomoto/llda.cpp +6 -6
  14. data/ext/tomoto/mglda.cpp +15 -15
  15. data/ext/tomoto/pa.cpp +6 -6
  16. data/ext/tomoto/plda.cpp +6 -6
  17. data/ext/tomoto/slda.cpp +8 -8
  18. data/ext/tomoto/{ext.cpp → tomoto.cpp} +8 -8
  19. data/ext/tomoto/utils.h +16 -70
  20. data/lib/tomoto/version.rb +1 -1
  21. data/lib/tomoto.rb +5 -1
  22. data/vendor/EigenRand/EigenRand/Core.h +10 -10
  23. data/vendor/EigenRand/EigenRand/Dists/Basic.h +208 -9
  24. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +52 -31
  25. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +9 -8
  26. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +28 -21
  27. data/vendor/EigenRand/EigenRand/EigenRand +11 -6
  28. data/vendor/EigenRand/EigenRand/Macro.h +13 -7
  29. data/vendor/EigenRand/EigenRand/MorePacketMath.h +348 -740
  30. data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +5 -3
  31. data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +9 -3
  32. data/vendor/EigenRand/EigenRand/PacketFilter.h +11 -253
  33. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +21 -47
  34. data/vendor/EigenRand/EigenRand/RandUtils.h +50 -344
  35. data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +619 -0
  36. data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +149 -0
  37. data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +228 -0
  38. data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +473 -0
  39. data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +142 -0
  40. data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +126 -0
  41. data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +501 -0
  42. data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +133 -0
  43. data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +120 -0
  44. data/vendor/EigenRand/EigenRand/doc.h +24 -12
  45. data/vendor/EigenRand/README.md +57 -4
  46. data/vendor/eigen/COPYING.APACHE +203 -0
  47. data/vendor/eigen/COPYING.BSD +1 -1
  48. data/vendor/eigen/COPYING.MINPACK +51 -52
  49. data/vendor/eigen/Eigen/Cholesky +0 -1
  50. data/vendor/eigen/Eigen/Core +112 -265
  51. data/vendor/eigen/Eigen/Eigenvalues +2 -3
  52. data/vendor/eigen/Eigen/Geometry +5 -8
  53. data/vendor/eigen/Eigen/Householder +0 -1
  54. data/vendor/eigen/Eigen/Jacobi +0 -1
  55. data/vendor/eigen/Eigen/KLUSupport +41 -0
  56. data/vendor/eigen/Eigen/LU +2 -5
  57. data/vendor/eigen/Eigen/OrderingMethods +0 -3
  58. data/vendor/eigen/Eigen/PaStiXSupport +1 -0
  59. data/vendor/eigen/Eigen/PardisoSupport +0 -0
  60. data/vendor/eigen/Eigen/QR +2 -3
  61. data/vendor/eigen/Eigen/QtAlignedMalloc +0 -1
  62. data/vendor/eigen/Eigen/SVD +0 -1
  63. data/vendor/eigen/Eigen/Sparse +0 -2
  64. data/vendor/eigen/Eigen/SparseCholesky +0 -8
  65. data/vendor/eigen/Eigen/SparseLU +4 -0
  66. data/vendor/eigen/Eigen/SparseQR +0 -1
  67. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
  68. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +39 -23
  69. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
  70. data/vendor/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
  71. data/vendor/eigen/Eigen/src/Core/Array.h +99 -11
  72. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +3 -3
  73. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
  74. data/vendor/eigen/Eigen/src/Core/Assign.h +1 -1
  75. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
  76. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
  77. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +16 -16
  78. data/vendor/eigen/Eigen/src/Core/Block.h +56 -60
  79. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
  80. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
  81. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
  82. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +5 -0
  83. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
  84. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
  85. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
  86. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +14 -10
  87. data/vendor/eigen/Eigen/src/Core/DenseBase.h +132 -42
  88. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
  89. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +153 -71
  90. data/vendor/eigen/Eigen/src/Core/Diagonal.h +21 -23
  91. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
  92. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
  93. data/vendor/eigen/Eigen/src/Core/Dot.h +10 -10
  94. data/vendor/eigen/Eigen/src/Core/EigenBase.h +10 -9
  95. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
  96. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +3 -3
  97. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
  98. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +599 -152
  99. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
  100. data/vendor/eigen/Eigen/src/Core/IO.h +40 -7
  101. data/vendor/eigen/Eigen/src/Core/IndexedView.h +237 -0
  102. data/vendor/eigen/Eigen/src/Core/Inverse.h +9 -10
  103. data/vendor/eigen/Eigen/src/Core/Map.h +7 -7
  104. data/vendor/eigen/Eigen/src/Core/MapBase.h +10 -3
  105. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +767 -125
  106. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
  107. data/vendor/eigen/Eigen/src/Core/Matrix.h +131 -25
  108. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +21 -3
  109. data/vendor/eigen/Eigen/src/Core/NestByValue.h +25 -50
  110. data/vendor/eigen/Eigen/src/Core/NoAlias.h +4 -3
  111. data/vendor/eigen/Eigen/src/Core/NumTraits.h +107 -20
  112. data/vendor/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  113. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +3 -31
  114. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +152 -59
  115. data/vendor/eigen/Eigen/src/Core/Product.h +30 -25
  116. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +192 -125
  117. data/vendor/eigen/Eigen/src/Core/Random.h +37 -1
  118. data/vendor/eigen/Eigen/src/Core/Redux.h +180 -170
  119. data/vendor/eigen/Eigen/src/Core/Ref.h +121 -23
  120. data/vendor/eigen/Eigen/src/Core/Replicate.h +8 -8
  121. data/vendor/eigen/Eigen/src/Core/Reshaped.h +454 -0
  122. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
  123. data/vendor/eigen/Eigen/src/Core/Reverse.h +18 -12
  124. data/vendor/eigen/Eigen/src/Core/Select.h +8 -6
  125. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
  126. data/vendor/eigen/Eigen/src/Core/Solve.h +14 -14
  127. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +16 -16
  128. data/vendor/eigen/Eigen/src/Core/SolverBase.h +41 -3
  129. data/vendor/eigen/Eigen/src/Core/StableNorm.h +100 -70
  130. data/vendor/eigen/Eigen/src/Core/StlIterators.h +463 -0
  131. data/vendor/eigen/Eigen/src/Core/Stride.h +9 -4
  132. data/vendor/eigen/Eigen/src/Core/Swap.h +5 -4
  133. data/vendor/eigen/Eigen/src/Core/Transpose.h +88 -27
  134. data/vendor/eigen/Eigen/src/Core/Transpositions.h +26 -47
  135. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +93 -75
  136. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +5 -5
  137. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
  138. data/vendor/eigen/Eigen/src/Core/Visitor.h +137 -29
  139. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
  140. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
  141. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
  142. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
  143. data/vendor/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +207 -236
  145. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1482 -495
  146. data/vendor/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
  148. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
  149. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
  153. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
  154. data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  155. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
  156. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  157. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  158. data/vendor/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
  160. data/vendor/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  161. data/vendor/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
  162. data/vendor/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  163. data/vendor/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  164. data/vendor/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  165. data/vendor/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  166. data/vendor/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  167. data/vendor/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  168. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
  169. data/vendor/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  170. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
  171. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
  172. data/vendor/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  173. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
  174. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
  175. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
  176. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
  177. data/vendor/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  178. data/vendor/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  179. data/vendor/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  180. data/vendor/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  181. data/vendor/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  182. data/vendor/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  183. data/vendor/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  184. data/vendor/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  185. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
  186. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
  187. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
  188. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
  189. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
  190. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
  191. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
  192. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +355 -16
  193. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1075 -586
  194. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +49 -24
  195. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +41 -35
  196. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +6 -6
  197. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +4 -2
  198. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
  199. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +22 -5
  200. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +53 -30
  201. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +16 -8
  202. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
  203. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +4 -4
  204. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
  205. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +33 -27
  206. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +14 -12
  207. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +36 -34
  208. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +8 -4
  209. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
  210. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +304 -119
  211. data/vendor/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  212. data/vendor/eigen/Eigen/src/Core/util/Constants.h +25 -9
  213. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +26 -3
  214. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +29 -9
  215. data/vendor/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  216. data/vendor/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
  217. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
  218. data/vendor/eigen/Eigen/src/Core/util/Macros.h +709 -246
  219. data/vendor/eigen/Eigen/src/Core/util/Memory.h +222 -52
  220. data/vendor/eigen/Eigen/src/Core/util/Meta.h +355 -77
  221. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +5 -1
  222. data/vendor/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  223. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
  224. data/vendor/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  225. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +65 -30
  226. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
  227. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +7 -4
  228. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
  229. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
  230. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
  231. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
  232. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
  233. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +21 -9
  234. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +77 -43
  235. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
  236. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
  237. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
  238. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
  239. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
  240. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
  241. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
  242. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
  243. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +70 -14
  244. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
  245. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +23 -5
  246. data/vendor/eigen/Eigen/src/Geometry/Transform.h +88 -67
  247. data/vendor/eigen/Eigen/src/Geometry/Translation.h +6 -12
  248. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +1 -1
  249. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  250. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
  251. data/vendor/eigen/Eigen/src/Householder/Householder.h +8 -4
  252. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
  253. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
  254. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
  255. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
  256. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
  257. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
  258. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
  259. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
  260. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
  261. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
  262. data/vendor/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  263. data/vendor/eigen/Eigen/src/LU/Determinant.h +35 -19
  264. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +29 -43
  265. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +25 -8
  266. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +71 -58
  267. data/vendor/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
  268. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
  269. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
  270. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
  271. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
  272. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +9 -7
  273. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
  274. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
  275. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
  276. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
  277. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
  278. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +183 -63
  279. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
  280. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +83 -22
  281. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
  282. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +17 -9
  283. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +12 -37
  284. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +3 -2
  285. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
  286. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
  287. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
  288. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
  289. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
  290. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
  291. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +4 -2
  292. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
  293. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +126 -11
  294. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
  295. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
  296. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
  297. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +5 -2
  298. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
  299. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
  300. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +1 -0
  301. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +162 -12
  302. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
  303. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
  304. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
  305. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
  306. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
  307. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
  308. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +2 -12
  309. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +2 -2
  310. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
  311. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
  312. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
  313. data/vendor/eigen/Eigen/src/misc/lapacke.h +5 -4
  314. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +28 -2
  315. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
  316. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
  317. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
  318. data/vendor/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  319. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
  320. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
  321. data/vendor/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
  322. data/vendor/eigen/README.md +2 -0
  323. data/vendor/eigen/bench/btl/README +1 -1
  324. data/vendor/eigen/bench/tensors/README +6 -7
  325. data/vendor/eigen/ci/README.md +56 -0
  326. data/vendor/eigen/demos/mix_eigen_and_c/README +1 -1
  327. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +213 -158
  328. data/vendor/eigen/unsupported/README.txt +1 -1
  329. data/vendor/tomotopy/README.kr.rst +78 -0
  330. data/vendor/tomotopy/README.rst +75 -0
  331. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +2 -2
  332. data/vendor/tomotopy/src/Labeling/Phraser.hpp +4 -4
  333. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +7 -3
  334. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +7 -3
  335. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +6 -3
  336. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +2 -2
  337. data/vendor/tomotopy/src/TopicModel/HDP.h +1 -0
  338. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +57 -6
  339. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +6 -3
  340. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +3 -2
  341. data/vendor/tomotopy/src/TopicModel/LDA.h +3 -3
  342. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +5 -5
  343. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +50 -19
  344. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +6 -2
  345. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +3 -2
  346. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +1 -1
  347. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +6 -2
  348. data/vendor/tomotopy/src/TopicModel/PT.h +3 -1
  349. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +36 -3
  350. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +6 -3
  351. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +55 -26
  352. data/vendor/tomotopy/src/Utils/AliasMethod.hpp +5 -4
  353. data/vendor/tomotopy/src/Utils/Dictionary.h +2 -2
  354. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +36 -1
  355. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -1
  356. data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +1 -1
  357. data/vendor/tomotopy/src/Utils/exception.h +6 -0
  358. data/vendor/tomotopy/src/Utils/math.h +2 -2
  359. data/vendor/tomotopy/src/Utils/sample.hpp +14 -12
  360. data/vendor/tomotopy/src/Utils/serializer.hpp +30 -5
  361. data/vendor/tomotopy/src/Utils/sse_gamma.h +0 -3
  362. metadata +64 -18
  363. data/vendor/eigen/Eigen/CMakeLists.txt +0 -19
  364. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -674
  365. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  366. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  367. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  368. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  369. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
@@ -10,26 +10,20 @@
10
10
  #ifndef EIGEN_PACKET_MATH_ZVECTOR_H
11
11
  #define EIGEN_PACKET_MATH_ZVECTOR_H
12
12
 
13
- #include <stdint.h>
14
-
15
13
  namespace Eigen {
16
14
 
17
15
  namespace internal {
18
16
 
19
17
  #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
20
- #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
18
+ #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
21
19
  #endif
22
20
 
23
21
  #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
24
22
  #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
25
23
  #endif
26
24
 
27
- #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
28
- #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
29
- #endif
30
-
31
25
  #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
32
- #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
26
+ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
33
27
  #endif
34
28
 
35
29
  typedef __vector int Packet4i;
@@ -41,21 +35,30 @@ typedef __vector double Packet2d;
41
35
  typedef __vector unsigned long long Packet2ul;
42
36
  typedef __vector long long Packet2l;
43
37
 
38
+ // Z14 has builtin support for float vectors
39
+ #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
40
+ typedef __vector float Packet4f;
41
+ #else
44
42
  typedef struct {
45
43
  Packet2d v4f[2];
46
44
  } Packet4f;
45
+ #endif
47
46
 
48
47
  typedef union {
49
- int32_t i[4];
50
- uint32_t ui[4];
51
- int64_t l[2];
52
- uint64_t ul[2];
48
+ numext::int32_t i[4];
49
+ numext::uint32_t ui[4];
50
+ numext::int64_t l[2];
51
+ numext::uint64_t ul[2];
53
52
  double d[2];
53
+ float f[4];
54
54
  Packet4i v4i;
55
55
  Packet4ui v4ui;
56
56
  Packet2l v2l;
57
57
  Packet2ul v2ul;
58
58
  Packet2d v2d;
59
+ #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
60
+ Packet4f v4f;
61
+ #endif
59
62
  } Packet;
60
63
 
61
64
  // We don't want to write the same code all the time, but we need to reuse the constants
@@ -80,15 +83,31 @@ typedef union {
80
83
  Packet2l p2l_##NAME = pset1<Packet2l>(X)
81
84
 
82
85
  // These constants are endian-agnostic
83
- //static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
86
+ static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
84
87
  static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
85
88
 
86
89
  static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
87
90
  static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
88
91
  static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
89
92
 
90
- static Packet2d p2d_ONE = { 1.0, 1.0 };
91
- static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
93
+ static Packet2d p2d_ONE = { 1.0, 1.0 };
94
+ static Packet2d p2d_ZERO_ = { numext::bit_cast<double>0x8000000000000000ull),
95
+ numext::bit_cast<double>0x8000000000000000ull) };
96
+
97
+ #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
98
+ #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
99
+ Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
100
+
101
+ #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
102
+ Packet4f p4f_##NAME = pset1<Packet4f>(X)
103
+
104
+ #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
105
+ const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
106
+
107
+ static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
108
+ static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
109
+ static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};
110
+ #endif
92
111
 
93
112
  static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
94
113
  static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
@@ -120,9 +139,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0
120
139
  static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
121
140
  static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
122
141
 
123
- //static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
142
+ static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
124
143
 
125
- //static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
144
+ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
126
145
 
127
146
 
128
147
  #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
@@ -149,29 +168,31 @@ template<> struct packet_traits<int> : default_packet_traits
149
168
  };
150
169
  };
151
170
 
152
- template<> struct packet_traits<float> : default_packet_traits
153
- {
171
+ template <>
172
+ struct packet_traits<float> : default_packet_traits {
154
173
  typedef Packet4f type;
155
174
  typedef Packet4f half;
156
175
  enum {
157
176
  Vectorizable = 1,
158
177
  AlignedOnScalar = 1,
159
- size=4,
178
+ size = 4,
160
179
  HasHalfPacket = 0,
161
180
 
162
- HasAdd = 1,
163
- HasSub = 1,
164
- HasMul = 1,
165
- HasDiv = 1,
166
- HasMin = 1,
167
- HasMax = 1,
168
- HasAbs = 1,
169
- HasSin = 0,
170
- HasCos = 0,
171
- HasLog = 0,
172
- HasExp = 1,
181
+ HasAdd = 1,
182
+ HasSub = 1,
183
+ HasMul = 1,
184
+ HasDiv = 1,
185
+ HasMin = 1,
186
+ HasMax = 1,
187
+ HasAbs = 1,
188
+ HasSin = 0,
189
+ HasCos = 0,
190
+ HasLog = 0,
191
+ HasExp = 1,
173
192
  HasSqrt = 1,
174
193
  HasRsqrt = 1,
194
+ HasTanh = 1,
195
+ HasErf = 1,
175
196
  HasRound = 1,
176
197
  HasFloor = 1,
177
198
  HasCeil = 1,
@@ -211,9 +232,9 @@ template<> struct packet_traits<double> : default_packet_traits
211
232
  };
212
233
  };
213
234
 
214
- template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
215
- template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
216
- template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
235
+ template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; };
236
+ template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; };
237
+ template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
217
238
 
218
239
  /* Forward declaration */
219
240
  EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
@@ -258,82 +279,15 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
258
279
  return s;
259
280
  }
260
281
 
261
- /* Helper function to simulate a vec_splat_packet4f
262
- */
263
- template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
282
+ #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
283
+ inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
264
284
  {
265
- Packet4f splat;
266
- switch (element) {
267
- case 0:
268
- splat.v4f[0] = vec_splat(from.v4f[0], 0);
269
- splat.v4f[1] = splat.v4f[0];
270
- break;
271
- case 1:
272
- splat.v4f[0] = vec_splat(from.v4f[0], 1);
273
- splat.v4f[1] = splat.v4f[0];
274
- break;
275
- case 2:
276
- splat.v4f[0] = vec_splat(from.v4f[1], 0);
277
- splat.v4f[1] = splat.v4f[0];
278
- break;
279
- case 3:
280
- splat.v4f[0] = vec_splat(from.v4f[1], 1);
281
- splat.v4f[1] = splat.v4f[0];
282
- break;
283
- }
284
- return splat;
285
+ Packet vt;
286
+ vt.v4f = v;
287
+ s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
288
+ return s;
285
289
  }
286
-
287
- template<int Offset>
288
- struct palign_impl<Offset,Packet4i>
289
- {
290
- static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
291
- {
292
- switch (Offset % 4) {
293
- case 1:
294
- first = vec_sld(first, second, 4); break;
295
- case 2:
296
- first = vec_sld(first, second, 8); break;
297
- case 3:
298
- first = vec_sld(first, second, 12); break;
299
- }
300
- }
301
- };
302
-
303
- /* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
304
- */
305
- template<int Offset>
306
- struct palign_impl<Offset,Packet4f>
307
- {
308
- static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
309
- {
310
- switch (Offset % 4) {
311
- case 1:
312
- first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
313
- first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
314
- break;
315
- case 2:
316
- first.v4f[0] = first.v4f[1];
317
- first.v4f[1] = second.v4f[0];
318
- break;
319
- case 3:
320
- first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8);
321
- first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
322
- break;
323
- }
324
- }
325
- };
326
-
327
-
328
- template<int Offset>
329
- struct palign_impl<Offset,Packet2d>
330
- {
331
- static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
332
- {
333
- if (Offset == 1)
334
- first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(first), reinterpret_cast<Packet4i>(second), 8));
335
- }
336
- };
290
+ #endif
337
291
 
338
292
  template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
339
293
  {
@@ -344,16 +298,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
344
298
  return vfrom->v4i;
345
299
  }
346
300
 
347
- template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
348
- {
349
- // FIXME: No intrinsic yet
350
- EIGEN_DEBUG_ALIGNED_LOAD
351
- Packet4f vfrom;
352
- vfrom.v4f[0] = vec_ld2f(&from[0]);
353
- vfrom.v4f[1] = vec_ld2f(&from[2]);
354
- return vfrom;
355
- }
356
-
357
301
  template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
358
302
  {
359
303
  // FIXME: No intrinsic yet
@@ -372,15 +316,6 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& f
372
316
  vto->v4i = from;
373
317
  }
374
318
 
375
- template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
376
- {
377
- // FIXME: No intrinsic yet
378
- EIGEN_DEBUG_ALIGNED_STORE
379
- vec_st2f(from.v4f[0], &to[0]);
380
- vec_st2f(from.v4f[1], &to[2]);
381
- }
382
-
383
-
384
319
  template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
385
320
  {
386
321
  // FIXME: No intrinsic yet
@@ -397,13 +332,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)
397
332
  template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
398
333
  return vec_splats(from);
399
334
  }
400
- template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
401
- {
402
- Packet4f to;
403
- to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
404
- to.v4f[1] = to.v4f[0];
405
- return to;
406
- }
407
335
 
408
336
  template<> EIGEN_STRONG_INLINE void
409
337
  pbroadcast4<Packet4i>(const int *a,
@@ -416,17 +344,6 @@ pbroadcast4<Packet4i>(const int *a,
416
344
  a3 = vec_splat(a3, 3);
417
345
  }
418
346
 
419
- template<> EIGEN_STRONG_INLINE void
420
- pbroadcast4<Packet4f>(const float *a,
421
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
422
- {
423
- a3 = pload<Packet4f>(a);
424
- a0 = vec_splat_packet4f<0>(a3);
425
- a1 = vec_splat_packet4f<1>(a3);
426
- a2 = vec_splat_packet4f<2>(a3);
427
- a3 = vec_splat_packet4f<3>(a3);
428
- }
429
-
430
347
  template<> EIGEN_STRONG_INLINE void
431
348
  pbroadcast4<Packet2d>(const double *a,
432
349
  Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
@@ -449,16 +366,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
449
366
  return pload<Packet4i>(ai);
450
367
  }
451
368
 
452
- template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
453
- {
454
- float EIGEN_ALIGN16 ai[4];
455
- ai[0] = from[0*stride];
456
- ai[1] = from[1*stride];
457
- ai[2] = from[2*stride];
458
- ai[3] = from[3*stride];
459
- return pload<Packet4f>(ai);
460
- }
461
-
462
369
  template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
463
370
  {
464
371
  double EIGEN_ALIGN16 af[2];
@@ -477,16 +384,6 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
477
384
  to[3*stride] = ai[3];
478
385
  }
479
386
 
480
- template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
481
- {
482
- float EIGEN_ALIGN16 ai[4];
483
- pstore<float>((float *)ai, from);
484
- to[0*stride] = ai[0];
485
- to[1*stride] = ai[1];
486
- to[2*stride] = ai[2];
487
- to[3*stride] = ai[3];
488
- }
489
-
490
387
  template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
491
388
  {
492
389
  double EIGEN_ALIGN16 af[2];
@@ -496,160 +393,52 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
496
393
  }
497
394
 
498
395
  template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
499
- template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
500
- {
501
- Packet4f c;
502
- c.v4f[0] = a.v4f[0] + b.v4f[0];
503
- c.v4f[1] = a.v4f[1] + b.v4f[1];
504
- return c;
505
- }
506
396
  template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
507
397
 
508
398
  template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
509
- template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
510
- {
511
- Packet4f c;
512
- c.v4f[0] = a.v4f[0] - b.v4f[0];
513
- c.v4f[1] = a.v4f[1] - b.v4f[1];
514
- return c;
515
- }
516
399
  template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
517
400
 
518
401
  template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
519
- template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
520
- {
521
- Packet4f c;
522
- c.v4f[0] = a.v4f[0] * b.v4f[0];
523
- c.v4f[1] = a.v4f[1] * b.v4f[1];
524
- return c;
525
- }
526
402
  template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
527
403
 
528
404
  template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
529
- template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
530
- {
531
- Packet4f c;
532
- c.v4f[0] = a.v4f[0] / b.v4f[0];
533
- c.v4f[1] = a.v4f[1] / b.v4f[1];
534
- return c;
535
- }
536
405
  template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
537
406
 
538
407
  template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
539
- template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
540
- {
541
- Packet4f c;
542
- c.v4f[0] = -a.v4f[0];
543
- c.v4f[1] = -a.v4f[1];
544
- return c;
545
- }
546
408
  template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
547
409
 
548
410
  template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
549
- template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
550
411
  template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
551
412
 
552
413
  template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
553
- template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
554
- {
555
- Packet4f res;
556
- res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
557
- res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
558
- return res;
559
- }
560
414
  template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
561
415
 
562
416
  template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
563
- template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
564
417
  template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
565
418
 
566
419
  template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
567
420
  template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
568
- template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
569
- {
570
- Packet4f res;
571
- res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
572
- res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
573
- return res;
574
- }
575
421
 
576
422
  template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
577
423
  template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
578
- template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
579
- {
580
- Packet4f res;
581
- res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
582
- res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
583
- return res;
584
- }
585
424
 
586
425
  template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
587
426
  template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
588
- template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
589
- {
590
- Packet4f res;
591
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
592
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
593
- return res;
594
- }
595
427
 
596
428
  template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
597
429
  template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
598
- template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
599
- {
600
- Packet4f res;
601
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
602
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
603
- return res;
604
- }
605
430
 
606
431
  template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
607
432
  template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
608
- template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
609
- {
610
- Packet4f res;
611
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
612
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
613
- return res;
614
- }
615
433
 
616
434
  template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
617
435
  template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
618
- template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
619
- {
620
- Packet4f res;
621
- res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
622
- res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
623
- return res;
624
- }
625
436
 
626
- template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
627
- {
628
- Packet4f res;
629
- res.v4f[0] = vec_round(a.v4f[0]);
630
- res.v4f[1] = vec_round(a.v4f[1]);
631
- return res;
632
- }
633
437
  template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
634
- template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
635
- {
636
- Packet4f res;
637
- res.v4f[0] = vec_ceil(a.v4f[0]);
638
- res.v4f[1] = vec_ceil(a.v4f[1]);
639
- return res;
640
- }
641
438
  template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
642
- template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
643
- {
644
- Packet4f res;
645
- res.v4f[0] = vec_floor(a.v4f[0]);
646
- res.v4f[1] = vec_floor(a.v4f[1]);
647
- return res;
648
- }
649
439
  template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
650
440
 
651
441
  template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); }
652
- template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { return pload<Packet4f>(from); }
653
442
  template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); }
654
443
 
655
444
 
@@ -659,14 +448,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
659
448
  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
660
449
  }
661
450
 
662
- template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
663
- {
664
- Packet4f p = pload<Packet4f>(from);
665
- p.v4f[1] = vec_splat(p.v4f[0], 1);
666
- p.v4f[0] = vec_splat(p.v4f[0], 0);
667
- return p;
668
- }
669
-
670
451
  template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
671
452
  {
672
453
  Packet2d p = pload<Packet2d>(from);
@@ -674,15 +455,12 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
674
455
  }
675
456
 
676
457
  template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { pstore<int>(to, from); }
677
- template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
678
458
  template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); }
679
459
 
680
460
  template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
681
- template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
682
461
  template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
683
462
 
684
463
  template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
685
- template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
686
464
  template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
687
465
 
688
466
  template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
@@ -695,23 +473,8 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
695
473
  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
696
474
  }
697
475
 
698
- template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
699
- {
700
- Packet4f rev;
701
- rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
702
- rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
703
- return rev;
704
- }
705
-
706
476
  template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
707
477
  template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
708
- template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
709
- {
710
- Packet4f res;
711
- res.v4f[0] = pabs(a.v4f[0]);
712
- res.v4f[1] = pabs(a.v4f[1]);
713
- return res;
714
- }
715
478
 
716
479
  template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
717
480
  {
@@ -730,71 +493,10 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
730
493
  sum = padd<Packet2d>(a, b);
731
494
  return pfirst(sum);
732
495
  }
733
- template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
734
- {
735
- Packet2d sum;
736
- sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
737
- double first = predux<Packet2d>(sum);
738
- return static_cast<float>(first);
739
- }
740
-
741
- template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
742
- {
743
- Packet4i v[4], sum[4];
744
-
745
- // It's easier and faster to transpose then add as columns
746
- // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
747
- // Do the transpose, first set of moves
748
- v[0] = vec_mergeh(vecs[0], vecs[2]);
749
- v[1] = vec_mergel(vecs[0], vecs[2]);
750
- v[2] = vec_mergeh(vecs[1], vecs[3]);
751
- v[3] = vec_mergel(vecs[1], vecs[3]);
752
- // Get the resulting vectors
753
- sum[0] = vec_mergeh(v[0], v[2]);
754
- sum[1] = vec_mergel(v[0], v[2]);
755
- sum[2] = vec_mergeh(v[1], v[3]);
756
- sum[3] = vec_mergel(v[1], v[3]);
757
-
758
- // Now do the summation:
759
- // Lines 0+1
760
- sum[0] = padd<Packet4i>(sum[0], sum[1]);
761
- // Lines 2+3
762
- sum[1] = padd<Packet4i>(sum[2], sum[3]);
763
- // Add the results
764
- sum[0] = padd<Packet4i>(sum[0], sum[1]);
765
-
766
- return sum[0];
767
- }
768
-
769
- template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
770
- {
771
- Packet2d v[2], sum;
772
- v[0] = padd<Packet2d>(vecs[0], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[0]), reinterpret_cast<Packet4ui>(vecs[0]), 8)));
773
- v[1] = padd<Packet2d>(vecs[1], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[1]), reinterpret_cast<Packet4ui>(vecs[1]), 8)));
774
-
775
- sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v[0]), reinterpret_cast<Packet4ui>(v[1]), 8));
776
-
777
- return sum;
778
- }
779
-
780
- template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
781
- {
782
- PacketBlock<Packet4f,4> transpose;
783
- transpose.packet[0] = vecs[0];
784
- transpose.packet[1] = vecs[1];
785
- transpose.packet[2] = vecs[2];
786
- transpose.packet[3] = vecs[3];
787
- ptranspose(transpose);
788
-
789
- Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
790
- sum = padd(sum, transpose.packet[2]);
791
- sum = padd(sum, transpose.packet[3]);
792
- return sum;
793
- }
794
-
795
- // Other reduction functions:
796
- // mul
797
- template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
496
+
497
+ // Other reduction functions:
498
+ // mul
499
+ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
798
500
  {
799
501
  EIGEN_ALIGN16 int aux[4];
800
502
  pstore(aux, a);
@@ -806,12 +508,6 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
806
508
  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
807
509
  }
808
510
 
809
- template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
810
- {
811
- // Return predux_mul<Packet2d> of the subvectors product
812
- return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
813
- }
814
-
815
511
  // min
816
512
  template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
817
513
  {
@@ -826,14 +522,6 @@ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
826
522
  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
827
523
  }
828
524
 
829
- template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
830
- {
831
- Packet2d b, res;
832
- b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
833
- res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
834
- return static_cast<float>(pfirst(res));
835
- }
836
-
837
525
  // max
838
526
  template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
839
527
  {
@@ -849,14 +537,6 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
849
537
  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
850
538
  }
851
539
 
852
- template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
853
- {
854
- Packet2d b, res;
855
- b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
856
- res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
857
- return static_cast<float>(pfirst(res));
858
- }
859
-
860
540
  EIGEN_DEVICE_FUNC inline void
861
541
  ptranspose(PacketBlock<Packet4i,4>& kernel) {
862
542
  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
@@ -877,6 +557,282 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
877
557
  kernel.packet[1] = t1;
878
558
  }
879
559
 
560
+ template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
561
+ Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
562
+ Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
563
+ return vec_sel(elsePacket, thenPacket, mask);
564
+ }
565
+
566
+
567
+ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
568
+ Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
569
+ Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
570
+ return vec_sel(elsePacket, thenPacket, mask);
571
+ }
572
+
573
+ /* z13 has no vector float support so we emulate that with double
574
+ z14 has proper vector float support.
575
+ */
576
+ #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
577
+ /* Helper function to simulate a vec_splat_packet4f
578
+ */
579
+ template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
580
+ {
581
+ Packet4f splat;
582
+ switch (element) {
583
+ case 0:
584
+ splat.v4f[0] = vec_splat(from.v4f[0], 0);
585
+ splat.v4f[1] = splat.v4f[0];
586
+ break;
587
+ case 1:
588
+ splat.v4f[0] = vec_splat(from.v4f[0], 1);
589
+ splat.v4f[1] = splat.v4f[0];
590
+ break;
591
+ case 2:
592
+ splat.v4f[0] = vec_splat(from.v4f[1], 0);
593
+ splat.v4f[1] = splat.v4f[0];
594
+ break;
595
+ case 3:
596
+ splat.v4f[0] = vec_splat(from.v4f[1], 1);
597
+ splat.v4f[1] = splat.v4f[0];
598
+ break;
599
+ }
600
+ return splat;
601
+ }
602
+
603
+ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
604
+ {
605
+ // FIXME: No intrinsic yet
606
+ EIGEN_DEBUG_ALIGNED_LOAD
607
+ Packet4f vfrom;
608
+ vfrom.v4f[0] = vec_ld2f(&from[0]);
609
+ vfrom.v4f[1] = vec_ld2f(&from[2]);
610
+ return vfrom;
611
+ }
612
+
613
+ template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
614
+ {
615
+ // FIXME: No intrinsic yet
616
+ EIGEN_DEBUG_ALIGNED_STORE
617
+ vec_st2f(from.v4f[0], &to[0]);
618
+ vec_st2f(from.v4f[1], &to[2]);
619
+ }
620
+
621
+ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
622
+ {
623
+ Packet4f to;
624
+ to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
625
+ to.v4f[1] = to.v4f[0];
626
+ return to;
627
+ }
628
+
629
+ template<> EIGEN_STRONG_INLINE void
630
+ pbroadcast4<Packet4f>(const float *a,
631
+ Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
632
+ {
633
+ a3 = pload<Packet4f>(a);
634
+ a0 = vec_splat_packet4f<0>(a3);
635
+ a1 = vec_splat_packet4f<1>(a3);
636
+ a2 = vec_splat_packet4f<2>(a3);
637
+ a3 = vec_splat_packet4f<3>(a3);
638
+ }
639
+
640
+ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
641
+ {
642
+ float EIGEN_ALIGN16 ai[4];
643
+ ai[0] = from[0*stride];
644
+ ai[1] = from[1*stride];
645
+ ai[2] = from[2*stride];
646
+ ai[3] = from[3*stride];
647
+ return pload<Packet4f>(ai);
648
+ }
649
+
650
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
651
+ {
652
+ float EIGEN_ALIGN16 ai[4];
653
+ pstore<float>((float *)ai, from);
654
+ to[0*stride] = ai[0];
655
+ to[1*stride] = ai[1];
656
+ to[2*stride] = ai[2];
657
+ to[3*stride] = ai[3];
658
+ }
659
+
660
+ template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
661
+ {
662
+ Packet4f c;
663
+ c.v4f[0] = a.v4f[0] + b.v4f[0];
664
+ c.v4f[1] = a.v4f[1] + b.v4f[1];
665
+ return c;
666
+ }
667
+
668
+ template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
669
+ {
670
+ Packet4f c;
671
+ c.v4f[0] = a.v4f[0] - b.v4f[0];
672
+ c.v4f[1] = a.v4f[1] - b.v4f[1];
673
+ return c;
674
+ }
675
+
676
+ template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
677
+ {
678
+ Packet4f c;
679
+ c.v4f[0] = a.v4f[0] * b.v4f[0];
680
+ c.v4f[1] = a.v4f[1] * b.v4f[1];
681
+ return c;
682
+ }
683
+
684
+ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
685
+ {
686
+ Packet4f c;
687
+ c.v4f[0] = a.v4f[0] / b.v4f[0];
688
+ c.v4f[1] = a.v4f[1] / b.v4f[1];
689
+ return c;
690
+ }
691
+
692
+ template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
693
+ {
694
+ Packet4f c;
695
+ c.v4f[0] = -a.v4f[0];
696
+ c.v4f[1] = -a.v4f[1];
697
+ return c;
698
+ }
699
+
700
+ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
701
+ {
702
+ Packet4f res;
703
+ res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
704
+ res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
705
+ return res;
706
+ }
707
+
708
+ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
709
+ {
710
+ Packet4f res;
711
+ res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
712
+ res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
713
+ return res;
714
+ }
715
+
716
+ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
717
+ {
718
+ Packet4f res;
719
+ res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
720
+ res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
721
+ return res;
722
+ }
723
+
724
+ template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
725
+ {
726
+ Packet4f res;
727
+ res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
728
+ res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
729
+ return res;
730
+ }
731
+
732
+ template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
733
+ {
734
+ Packet4f res;
735
+ res.v4f[0] = por(a.v4f[0], b.v4f[0]);
736
+ res.v4f[1] = por(a.v4f[1], b.v4f[1]);
737
+ return res;
738
+ }
739
+
740
+ template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
741
+ {
742
+ Packet4f res;
743
+ res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
744
+ res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
745
+ return res;
746
+ }
747
+
748
+ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
749
+ {
750
+ Packet4f res;
751
+ res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
752
+ res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
753
+ return res;
754
+ }
755
+
756
+ template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
757
+ {
758
+ Packet4f res;
759
+ res.v4f[0] = vec_round(a.v4f[0]);
760
+ res.v4f[1] = vec_round(a.v4f[1]);
761
+ return res;
762
+ }
763
+
764
+ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
765
+ {
766
+ Packet4f res;
767
+ res.v4f[0] = vec_ceil(a.v4f[0]);
768
+ res.v4f[1] = vec_ceil(a.v4f[1]);
769
+ return res;
770
+ }
771
+
772
+ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
773
+ {
774
+ Packet4f res;
775
+ res.v4f[0] = vec_floor(a.v4f[0]);
776
+ res.v4f[1] = vec_floor(a.v4f[1]);
777
+ return res;
778
+ }
779
+
780
+ template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
781
+ {
782
+ Packet4f p = pload<Packet4f>(from);
783
+ p.v4f[1] = vec_splat(p.v4f[0], 1);
784
+ p.v4f[0] = vec_splat(p.v4f[0], 0);
785
+ return p;
786
+ }
787
+
788
+ template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
789
+
790
+ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
791
+ {
792
+ Packet4f rev;
793
+ rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
794
+ rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
795
+ return rev;
796
+ }
797
+
798
+ template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
799
+ {
800
+ Packet4f res;
801
+ res.v4f[0] = pabs(a.v4f[0]);
802
+ res.v4f[1] = pabs(a.v4f[1]);
803
+ return res;
804
+ }
805
+
806
+ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
807
+ {
808
+ Packet2d sum;
809
+ sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
810
+ double first = predux<Packet2d>(sum);
811
+ return static_cast<float>(first);
812
+ }
813
+
814
+ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
815
+ {
816
+ // Return predux_mul<Packet2d> of the subvectors product
817
+ return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
818
+ }
819
+
820
+ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
821
+ {
822
+ Packet2d b, res;
823
+ b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
824
+ res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
825
+ return static_cast<float>(pfirst(res));
826
+ }
827
+
828
+ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
829
+ {
830
+ Packet2d b, res;
831
+ b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
832
+ res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
833
+ return static_cast<float>(pfirst(res));
834
+ }
835
+
880
836
  /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
881
837
  */
882
838
  EIGEN_DEVICE_FUNC inline void
@@ -915,12 +871,6 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
915
871
  kernel.packet[3].v4f[1] = t3.packet[1];
916
872
  }
917
873
 
918
- template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
919
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
920
- Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
921
- return vec_sel(elsePacket, thenPacket, mask);
922
- }
923
-
924
874
  template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
925
875
  Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
926
876
  Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
@@ -932,12 +882,177 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
932
882
  return result;
933
883
  }
934
884
 
935
- template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
936
- Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
937
- Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
885
+ template<> Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
886
+ {
887
+ Packet4f res;
888
+ res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
889
+ res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
890
+ return res;
891
+ }
892
+
893
+ template<> Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
894
+ {
895
+ Packet4f res;
896
+ res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
897
+ res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
898
+ return res;
899
+ }
900
+
901
+ template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
902
+ {
903
+ Packet4f res;
904
+ res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
905
+ res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
906
+ return res;
907
+ }
908
+
909
+ #else
910
+ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
911
+ {
912
+ // FIXME: No intrinsic yet
913
+ EIGEN_DEBUG_ALIGNED_LOAD
914
+ Packet *vfrom;
915
+ vfrom = (Packet *) from;
916
+ return vfrom->v4f;
917
+ }
918
+
919
+ template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
920
+ {
921
+ // FIXME: No intrinsic yet
922
+ EIGEN_DEBUG_ALIGNED_STORE
923
+ Packet *vto;
924
+ vto = (Packet *) to;
925
+ vto->v4f = from;
926
+ }
927
+
928
+ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
929
+ {
930
+ return vec_splats(from);
931
+ }
932
+
933
+ template<> EIGEN_STRONG_INLINE void
934
+ pbroadcast4<Packet4f>(const float *a,
935
+ Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
936
+ {
937
+ a3 = pload<Packet4f>(a);
938
+ a0 = vec_splat(a3, 0);
939
+ a1 = vec_splat(a3, 1);
940
+ a2 = vec_splat(a3, 2);
941
+ a3 = vec_splat(a3, 3);
942
+ }
943
+
944
+ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
945
+ {
946
+ float EIGEN_ALIGN16 af[4];
947
+ af[0] = from[0*stride];
948
+ af[1] = from[1*stride];
949
+ af[2] = from[2*stride];
950
+ af[3] = from[3*stride];
951
+ return pload<Packet4f>(af);
952
+ }
953
+
954
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
955
+ {
956
+ float EIGEN_ALIGN16 af[4];
957
+ pstore<float>((float*)af, from);
958
+ to[0*stride] = af[0];
959
+ to[1*stride] = af[1];
960
+ to[2*stride] = af[2];
961
+ to[3*stride] = af[3];
962
+ }
963
+
964
+ template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a + b); }
965
+ template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a - b); }
966
+ template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a * b); }
967
+ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a / b); }
968
+ template<> EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) { return (-a); }
969
+ template<> EIGEN_STRONG_INLINE Packet4f pconj<Packet4f> (const Packet4f& a) { return a; }
970
+ template<> EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f> (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
971
+ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
972
+ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
973
+ template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
974
+ template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
975
+ template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
976
+ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
977
+ template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { return vec_round(a); }
978
+ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f> (const Packet4f& a) { return vec_ceil(a); }
979
+ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
980
+ template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f> (const Packet4f& a) { return vec_abs(a); }
981
+ template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
982
+
983
+ template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
984
+ {
985
+ Packet4f p = pload<Packet4f>(from);
986
+ return vec_perm(p, p, p16uc_DUPLICATE32_HI);
987
+ }
988
+
989
+ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
990
+ {
991
+ return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
992
+ }
993
+
994
+ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
995
+ {
996
+ Packet4f b, sum;
997
+ b = vec_sld(a, a, 8);
998
+ sum = padd<Packet4f>(a, b);
999
+ b = vec_sld(sum, sum, 4);
1000
+ sum = padd<Packet4f>(sum, b);
1001
+ return pfirst(sum);
1002
+ }
1003
+
1004
+ // Other reduction functions:
1005
+ // mul
1006
+ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
1007
+ {
1008
+ Packet4f prod;
1009
+ prod = pmul(a, vec_sld(a, a, 8));
1010
+ return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
1011
+ }
1012
+
1013
+ // min
1014
+ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
1015
+ {
1016
+ Packet4f b, res;
1017
+ b = pmin<Packet4f>(a, vec_sld(a, a, 8));
1018
+ res = pmin<Packet4f>(b, vec_sld(b, b, 4));
1019
+ return pfirst(res);
1020
+ }
1021
+
1022
+ // max
1023
+ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
1024
+ {
1025
+ Packet4f b, res;
1026
+ b = pmax<Packet4f>(a, vec_sld(a, a, 8));
1027
+ res = pmax<Packet4f>(b, vec_sld(b, b, 4));
1028
+ return pfirst(res);
1029
+ }
1030
+
1031
+ EIGEN_DEVICE_FUNC inline void
1032
+ ptranspose(PacketBlock<Packet4f,4>& kernel) {
1033
+ Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1034
+ Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1035
+ Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1036
+ Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1037
+ kernel.packet[0] = vec_mergeh(t0, t2);
1038
+ kernel.packet[1] = vec_mergel(t0, t2);
1039
+ kernel.packet[2] = vec_mergeh(t1, t3);
1040
+ kernel.packet[3] = vec_mergel(t1, t3);
1041
+ }
1042
+
1043
+ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
1044
+ Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
1045
+ Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
938
1046
  return vec_sel(elsePacket, thenPacket, mask);
939
1047
  }
940
1048
 
1049
+ #endif
1050
+
1051
+ template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
1052
+ template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f> (const float* from) { return pload<Packet4f>(from); }
1053
+ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
1054
+ template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f> (const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
1055
+
941
1056
  } // end namespace internal
942
1057
 
943
1058
  } // end namespace Eigen