xtgeo 4.10.0__cp310-cp310-macosx_11_0_arm64.whl → 4.10.1__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xtgeo might be problematic. Click here for more details.

Files changed (561) hide show
  1. xtgeo/_internal.cpython-310-darwin.so +0 -0
  2. xtgeo/common/version.py +2 -2
  3. xtgeo/include/fmt/args.h +220 -0
  4. xtgeo/include/fmt/base.h +2989 -0
  5. xtgeo/include/fmt/chrono.h +2330 -0
  6. xtgeo/include/fmt/color.h +637 -0
  7. xtgeo/include/fmt/compile.h +539 -0
  8. xtgeo/include/fmt/core.h +5 -0
  9. xtgeo/include/fmt/format-inl.h +1948 -0
  10. xtgeo/include/fmt/format.h +4244 -0
  11. xtgeo/include/fmt/os.h +427 -0
  12. xtgeo/include/fmt/ostream.h +167 -0
  13. xtgeo/include/fmt/printf.h +633 -0
  14. xtgeo/include/fmt/ranges.h +850 -0
  15. xtgeo/include/fmt/std.h +728 -0
  16. xtgeo/include/fmt/xchar.h +369 -0
  17. xtgeo/lib/cmake/fmt/fmt-config-version.cmake +43 -0
  18. xtgeo/lib/cmake/fmt/fmt-config.cmake +31 -0
  19. xtgeo/lib/cmake/fmt/fmt-targets-release.cmake +19 -0
  20. xtgeo/{share/eigen3/cmake/Eigen3Targets.cmake → lib/cmake/fmt/fmt-targets.cmake} +16 -6
  21. xtgeo/lib/libfmt.a +0 -0
  22. xtgeo/lib/pkgconfig/fmt.pc +11 -0
  23. {xtgeo-4.10.0.dist-info → xtgeo-4.10.1.dist-info}/METADATA +1 -1
  24. xtgeo-4.10.1.dist-info/RECORD +137 -0
  25. xtgeo/include/eigen3/Eigen/Cholesky +0 -45
  26. xtgeo/include/eigen3/Eigen/CholmodSupport +0 -48
  27. xtgeo/include/eigen3/Eigen/Core +0 -384
  28. xtgeo/include/eigen3/Eigen/Dense +0 -7
  29. xtgeo/include/eigen3/Eigen/Eigen +0 -2
  30. xtgeo/include/eigen3/Eigen/Eigenvalues +0 -60
  31. xtgeo/include/eigen3/Eigen/Geometry +0 -59
  32. xtgeo/include/eigen3/Eigen/Householder +0 -29
  33. xtgeo/include/eigen3/Eigen/IterativeLinearSolvers +0 -48
  34. xtgeo/include/eigen3/Eigen/Jacobi +0 -32
  35. xtgeo/include/eigen3/Eigen/KLUSupport +0 -41
  36. xtgeo/include/eigen3/Eigen/LU +0 -47
  37. xtgeo/include/eigen3/Eigen/MetisSupport +0 -35
  38. xtgeo/include/eigen3/Eigen/OrderingMethods +0 -70
  39. xtgeo/include/eigen3/Eigen/PaStiXSupport +0 -49
  40. xtgeo/include/eigen3/Eigen/PardisoSupport +0 -35
  41. xtgeo/include/eigen3/Eigen/QR +0 -50
  42. xtgeo/include/eigen3/Eigen/QtAlignedMalloc +0 -39
  43. xtgeo/include/eigen3/Eigen/SPQRSupport +0 -34
  44. xtgeo/include/eigen3/Eigen/SVD +0 -50
  45. xtgeo/include/eigen3/Eigen/Sparse +0 -34
  46. xtgeo/include/eigen3/Eigen/SparseCholesky +0 -37
  47. xtgeo/include/eigen3/Eigen/SparseCore +0 -69
  48. xtgeo/include/eigen3/Eigen/SparseLU +0 -50
  49. xtgeo/include/eigen3/Eigen/SparseQR +0 -36
  50. xtgeo/include/eigen3/Eigen/StdDeque +0 -27
  51. xtgeo/include/eigen3/Eigen/StdList +0 -26
  52. xtgeo/include/eigen3/Eigen/StdVector +0 -27
  53. xtgeo/include/eigen3/Eigen/SuperLUSupport +0 -64
  54. xtgeo/include/eigen3/Eigen/UmfPackSupport +0 -40
  55. xtgeo/include/eigen3/Eigen/src/Cholesky/LDLT.h +0 -688
  56. xtgeo/include/eigen3/Eigen/src/Cholesky/LLT.h +0 -558
  57. xtgeo/include/eigen3/Eigen/src/Cholesky/LLT_LAPACKE.h +0 -99
  58. xtgeo/include/eigen3/Eigen/src/CholmodSupport/CholmodSupport.h +0 -682
  59. xtgeo/include/eigen3/Eigen/src/Core/ArithmeticSequence.h +0 -413
  60. xtgeo/include/eigen3/Eigen/src/Core/Array.h +0 -417
  61. xtgeo/include/eigen3/Eigen/src/Core/ArrayBase.h +0 -226
  62. xtgeo/include/eigen3/Eigen/src/Core/ArrayWrapper.h +0 -209
  63. xtgeo/include/eigen3/Eigen/src/Core/Assign.h +0 -90
  64. xtgeo/include/eigen3/Eigen/src/Core/AssignEvaluator.h +0 -1010
  65. xtgeo/include/eigen3/Eigen/src/Core/Assign_MKL.h +0 -178
  66. xtgeo/include/eigen3/Eigen/src/Core/BandMatrix.h +0 -353
  67. xtgeo/include/eigen3/Eigen/src/Core/Block.h +0 -448
  68. xtgeo/include/eigen3/Eigen/src/Core/BooleanRedux.h +0 -162
  69. xtgeo/include/eigen3/Eigen/src/Core/CommaInitializer.h +0 -164
  70. xtgeo/include/eigen3/Eigen/src/Core/ConditionEstimator.h +0 -175
  71. xtgeo/include/eigen3/Eigen/src/Core/CoreEvaluators.h +0 -1741
  72. xtgeo/include/eigen3/Eigen/src/Core/CoreIterators.h +0 -132
  73. xtgeo/include/eigen3/Eigen/src/Core/CwiseBinaryOp.h +0 -183
  74. xtgeo/include/eigen3/Eigen/src/Core/CwiseNullaryOp.h +0 -1001
  75. xtgeo/include/eigen3/Eigen/src/Core/CwiseTernaryOp.h +0 -197
  76. xtgeo/include/eigen3/Eigen/src/Core/CwiseUnaryOp.h +0 -103
  77. xtgeo/include/eigen3/Eigen/src/Core/CwiseUnaryView.h +0 -132
  78. xtgeo/include/eigen3/Eigen/src/Core/DenseBase.h +0 -701
  79. xtgeo/include/eigen3/Eigen/src/Core/DenseCoeffsBase.h +0 -685
  80. xtgeo/include/eigen3/Eigen/src/Core/DenseStorage.h +0 -652
  81. xtgeo/include/eigen3/Eigen/src/Core/Diagonal.h +0 -258
  82. xtgeo/include/eigen3/Eigen/src/Core/DiagonalMatrix.h +0 -391
  83. xtgeo/include/eigen3/Eigen/src/Core/DiagonalProduct.h +0 -28
  84. xtgeo/include/eigen3/Eigen/src/Core/Dot.h +0 -318
  85. xtgeo/include/eigen3/Eigen/src/Core/EigenBase.h +0 -160
  86. xtgeo/include/eigen3/Eigen/src/Core/ForceAlignedAccess.h +0 -150
  87. xtgeo/include/eigen3/Eigen/src/Core/Fuzzy.h +0 -155
  88. xtgeo/include/eigen3/Eigen/src/Core/GeneralProduct.h +0 -465
  89. xtgeo/include/eigen3/Eigen/src/Core/GenericPacketMath.h +0 -1040
  90. xtgeo/include/eigen3/Eigen/src/Core/GlobalFunctions.h +0 -194
  91. xtgeo/include/eigen3/Eigen/src/Core/IO.h +0 -258
  92. xtgeo/include/eigen3/Eigen/src/Core/IndexedView.h +0 -237
  93. xtgeo/include/eigen3/Eigen/src/Core/Inverse.h +0 -117
  94. xtgeo/include/eigen3/Eigen/src/Core/Map.h +0 -171
  95. xtgeo/include/eigen3/Eigen/src/Core/MapBase.h +0 -310
  96. xtgeo/include/eigen3/Eigen/src/Core/MathFunctions.h +0 -2057
  97. xtgeo/include/eigen3/Eigen/src/Core/MathFunctionsImpl.h +0 -200
  98. xtgeo/include/eigen3/Eigen/src/Core/Matrix.h +0 -565
  99. xtgeo/include/eigen3/Eigen/src/Core/MatrixBase.h +0 -547
  100. xtgeo/include/eigen3/Eigen/src/Core/NestByValue.h +0 -85
  101. xtgeo/include/eigen3/Eigen/src/Core/NoAlias.h +0 -109
  102. xtgeo/include/eigen3/Eigen/src/Core/NumTraits.h +0 -335
  103. xtgeo/include/eigen3/Eigen/src/Core/PartialReduxEvaluator.h +0 -232
  104. xtgeo/include/eigen3/Eigen/src/Core/PermutationMatrix.h +0 -605
  105. xtgeo/include/eigen3/Eigen/src/Core/PlainObjectBase.h +0 -1128
  106. xtgeo/include/eigen3/Eigen/src/Core/Product.h +0 -191
  107. xtgeo/include/eigen3/Eigen/src/Core/ProductEvaluators.h +0 -1179
  108. xtgeo/include/eigen3/Eigen/src/Core/Random.h +0 -218
  109. xtgeo/include/eigen3/Eigen/src/Core/Redux.h +0 -515
  110. xtgeo/include/eigen3/Eigen/src/Core/Ref.h +0 -381
  111. xtgeo/include/eigen3/Eigen/src/Core/Replicate.h +0 -142
  112. xtgeo/include/eigen3/Eigen/src/Core/Reshaped.h +0 -454
  113. xtgeo/include/eigen3/Eigen/src/Core/ReturnByValue.h +0 -119
  114. xtgeo/include/eigen3/Eigen/src/Core/Reverse.h +0 -217
  115. xtgeo/include/eigen3/Eigen/src/Core/Select.h +0 -164
  116. xtgeo/include/eigen3/Eigen/src/Core/SelfAdjointView.h +0 -365
  117. xtgeo/include/eigen3/Eigen/src/Core/SelfCwiseBinaryOp.h +0 -47
  118. xtgeo/include/eigen3/Eigen/src/Core/Solve.h +0 -188
  119. xtgeo/include/eigen3/Eigen/src/Core/SolveTriangular.h +0 -235
  120. xtgeo/include/eigen3/Eigen/src/Core/SolverBase.h +0 -168
  121. xtgeo/include/eigen3/Eigen/src/Core/StableNorm.h +0 -251
  122. xtgeo/include/eigen3/Eigen/src/Core/StlIterators.h +0 -463
  123. xtgeo/include/eigen3/Eigen/src/Core/Stride.h +0 -116
  124. xtgeo/include/eigen3/Eigen/src/Core/Swap.h +0 -68
  125. xtgeo/include/eigen3/Eigen/src/Core/Transpose.h +0 -464
  126. xtgeo/include/eigen3/Eigen/src/Core/Transpositions.h +0 -386
  127. xtgeo/include/eigen3/Eigen/src/Core/TriangularMatrix.h +0 -1001
  128. xtgeo/include/eigen3/Eigen/src/Core/VectorBlock.h +0 -96
  129. xtgeo/include/eigen3/Eigen/src/Core/VectorwiseOp.h +0 -784
  130. xtgeo/include/eigen3/Eigen/src/Core/Visitor.h +0 -381
  131. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX/Complex.h +0 -372
  132. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX/MathFunctions.h +0 -228
  133. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX/PacketMath.h +0 -1574
  134. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX/TypeCasting.h +0 -115
  135. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX512/Complex.h +0 -422
  136. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX512/MathFunctions.h +0 -362
  137. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX512/PacketMath.h +0 -2303
  138. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX512/TypeCasting.h +0 -89
  139. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h +0 -417
  140. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/MathFunctions.h +0 -90
  141. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +0 -2937
  142. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +0 -221
  143. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +0 -629
  144. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h +0 -2711
  145. xtgeo/include/eigen3/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
  146. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/BFloat16.h +0 -700
  147. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/ConjHelper.h +0 -117
  148. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +0 -1649
  149. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +0 -110
  150. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/Half.h +0 -942
  151. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/Settings.h +0 -49
  152. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
  153. xtgeo/include/eigen3/Eigen/src/Core/arch/GPU/MathFunctions.h +0 -103
  154. xtgeo/include/eigen3/Eigen/src/Core/arch/GPU/PacketMath.h +0 -1685
  155. xtgeo/include/eigen3/Eigen/src/Core/arch/GPU/TypeCasting.h +0 -80
  156. xtgeo/include/eigen3/Eigen/src/Core/arch/HIP/hcc/math_constants.h +0 -23
  157. xtgeo/include/eigen3/Eigen/src/Core/arch/MSA/Complex.h +0 -648
  158. xtgeo/include/eigen3/Eigen/src/Core/arch/MSA/MathFunctions.h +0 -387
  159. xtgeo/include/eigen3/Eigen/src/Core/arch/MSA/PacketMath.h +0 -1233
  160. xtgeo/include/eigen3/Eigen/src/Core/arch/NEON/Complex.h +0 -584
  161. xtgeo/include/eigen3/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +0 -183
  162. xtgeo/include/eigen3/Eigen/src/Core/arch/NEON/MathFunctions.h +0 -75
  163. xtgeo/include/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h +0 -4587
  164. xtgeo/include/eigen3/Eigen/src/Core/arch/NEON/TypeCasting.h +0 -1419
  165. xtgeo/include/eigen3/Eigen/src/Core/arch/SSE/Complex.h +0 -351
  166. xtgeo/include/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h +0 -199
  167. xtgeo/include/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h +0 -1505
  168. xtgeo/include/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h +0 -142
  169. xtgeo/include/eigen3/Eigen/src/Core/arch/SVE/MathFunctions.h +0 -44
  170. xtgeo/include/eigen3/Eigen/src/Core/arch/SVE/PacketMath.h +0 -752
  171. xtgeo/include/eigen3/Eigen/src/Core/arch/SVE/TypeCasting.h +0 -49
  172. xtgeo/include/eigen3/Eigen/src/Core/arch/SYCL/InteropHeaders.h +0 -232
  173. xtgeo/include/eigen3/Eigen/src/Core/arch/SYCL/MathFunctions.h +0 -301
  174. xtgeo/include/eigen3/Eigen/src/Core/arch/SYCL/PacketMath.h +0 -670
  175. xtgeo/include/eigen3/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
  176. xtgeo/include/eigen3/Eigen/src/Core/arch/SYCL/TypeCasting.h +0 -85
  177. xtgeo/include/eigen3/Eigen/src/Core/arch/ZVector/Complex.h +0 -426
  178. xtgeo/include/eigen3/Eigen/src/Core/arch/ZVector/MathFunctions.h +0 -233
  179. xtgeo/include/eigen3/Eigen/src/Core/arch/ZVector/PacketMath.h +0 -1060
  180. xtgeo/include/eigen3/Eigen/src/Core/functors/AssignmentFunctors.h +0 -177
  181. xtgeo/include/eigen3/Eigen/src/Core/functors/BinaryFunctors.h +0 -541
  182. xtgeo/include/eigen3/Eigen/src/Core/functors/NullaryFunctors.h +0 -189
  183. xtgeo/include/eigen3/Eigen/src/Core/functors/StlFunctors.h +0 -166
  184. xtgeo/include/eigen3/Eigen/src/Core/functors/TernaryFunctors.h +0 -25
  185. xtgeo/include/eigen3/Eigen/src/Core/functors/UnaryFunctors.h +0 -1131
  186. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h +0 -2645
  187. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h +0 -517
  188. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +0 -317
  189. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +0 -145
  190. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +0 -124
  191. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h +0 -518
  192. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +0 -136
  193. xtgeo/include/eigen3/Eigen/src/Core/products/Parallelizer.h +0 -180
  194. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +0 -544
  195. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +0 -295
  196. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h +0 -262
  197. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +0 -118
  198. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointProduct.h +0 -133
  199. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h +0 -94
  200. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h +0 -472
  201. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +0 -317
  202. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h +0 -350
  203. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +0 -255
  204. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h +0 -337
  205. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +0 -167
  206. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularSolverVector.h +0 -148
  207. xtgeo/include/eigen3/Eigen/src/Core/util/BlasUtil.h +0 -583
  208. xtgeo/include/eigen3/Eigen/src/Core/util/ConfigureVectorization.h +0 -512
  209. xtgeo/include/eigen3/Eigen/src/Core/util/Constants.h +0 -563
  210. xtgeo/include/eigen3/Eigen/src/Core/util/DisableStupidWarnings.h +0 -106
  211. xtgeo/include/eigen3/Eigen/src/Core/util/ForwardDeclarations.h +0 -322
  212. xtgeo/include/eigen3/Eigen/src/Core/util/IndexedViewHelper.h +0 -186
  213. xtgeo/include/eigen3/Eigen/src/Core/util/IntegralConstant.h +0 -272
  214. xtgeo/include/eigen3/Eigen/src/Core/util/MKL_support.h +0 -137
  215. xtgeo/include/eigen3/Eigen/src/Core/util/Macros.h +0 -1464
  216. xtgeo/include/eigen3/Eigen/src/Core/util/Memory.h +0 -1163
  217. xtgeo/include/eigen3/Eigen/src/Core/util/Meta.h +0 -812
  218. xtgeo/include/eigen3/Eigen/src/Core/util/NonMPL2.h +0 -3
  219. xtgeo/include/eigen3/Eigen/src/Core/util/ReenableStupidWarnings.h +0 -31
  220. xtgeo/include/eigen3/Eigen/src/Core/util/ReshapedHelper.h +0 -51
  221. xtgeo/include/eigen3/Eigen/src/Core/util/StaticAssert.h +0 -221
  222. xtgeo/include/eigen3/Eigen/src/Core/util/SymbolicIndex.h +0 -293
  223. xtgeo/include/eigen3/Eigen/src/Core/util/XprHelper.h +0 -856
  224. xtgeo/include/eigen3/Eigen/src/Eigenvalues/ComplexEigenSolver.h +0 -346
  225. xtgeo/include/eigen3/Eigen/src/Eigenvalues/ComplexSchur.h +0 -462
  226. xtgeo/include/eigen3/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +0 -91
  227. xtgeo/include/eigen3/Eigen/src/Eigenvalues/EigenSolver.h +0 -622
  228. xtgeo/include/eigen3/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +0 -418
  229. xtgeo/include/eigen3/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +0 -226
  230. xtgeo/include/eigen3/Eigen/src/Eigenvalues/HessenbergDecomposition.h +0 -374
  231. xtgeo/include/eigen3/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +0 -158
  232. xtgeo/include/eigen3/Eigen/src/Eigenvalues/RealQZ.h +0 -657
  233. xtgeo/include/eigen3/Eigen/src/Eigenvalues/RealSchur.h +0 -558
  234. xtgeo/include/eigen3/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +0 -77
  235. xtgeo/include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +0 -904
  236. xtgeo/include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +0 -87
  237. xtgeo/include/eigen3/Eigen/src/Eigenvalues/Tridiagonalization.h +0 -561
  238. xtgeo/include/eigen3/Eigen/src/Geometry/AlignedBox.h +0 -486
  239. xtgeo/include/eigen3/Eigen/src/Geometry/AngleAxis.h +0 -247
  240. xtgeo/include/eigen3/Eigen/src/Geometry/EulerAngles.h +0 -114
  241. xtgeo/include/eigen3/Eigen/src/Geometry/Homogeneous.h +0 -501
  242. xtgeo/include/eigen3/Eigen/src/Geometry/Hyperplane.h +0 -282
  243. xtgeo/include/eigen3/Eigen/src/Geometry/OrthoMethods.h +0 -235
  244. xtgeo/include/eigen3/Eigen/src/Geometry/ParametrizedLine.h +0 -232
  245. xtgeo/include/eigen3/Eigen/src/Geometry/Quaternion.h +0 -870
  246. xtgeo/include/eigen3/Eigen/src/Geometry/Rotation2D.h +0 -199
  247. xtgeo/include/eigen3/Eigen/src/Geometry/RotationBase.h +0 -206
  248. xtgeo/include/eigen3/Eigen/src/Geometry/Scaling.h +0 -188
  249. xtgeo/include/eigen3/Eigen/src/Geometry/Transform.h +0 -1563
  250. xtgeo/include/eigen3/Eigen/src/Geometry/Translation.h +0 -202
  251. xtgeo/include/eigen3/Eigen/src/Geometry/Umeyama.h +0 -166
  252. xtgeo/include/eigen3/Eigen/src/Geometry/arch/Geometry_SIMD.h +0 -168
  253. xtgeo/include/eigen3/Eigen/src/Householder/BlockHouseholder.h +0 -110
  254. xtgeo/include/eigen3/Eigen/src/Householder/Householder.h +0 -176
  255. xtgeo/include/eigen3/Eigen/src/Householder/HouseholderSequence.h +0 -545
  256. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +0 -226
  257. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +0 -212
  258. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +0 -229
  259. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +0 -394
  260. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +0 -453
  261. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +0 -444
  262. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +0 -198
  263. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +0 -117
  264. xtgeo/include/eigen3/Eigen/src/Jacobi/Jacobi.h +0 -483
  265. xtgeo/include/eigen3/Eigen/src/KLUSupport/KLUSupport.h +0 -358
  266. xtgeo/include/eigen3/Eigen/src/LU/Determinant.h +0 -117
  267. xtgeo/include/eigen3/Eigen/src/LU/FullPivLU.h +0 -877
  268. xtgeo/include/eigen3/Eigen/src/LU/InverseImpl.h +0 -432
  269. xtgeo/include/eigen3/Eigen/src/LU/PartialPivLU.h +0 -624
  270. xtgeo/include/eigen3/Eigen/src/LU/PartialPivLU_LAPACKE.h +0 -83
  271. xtgeo/include/eigen3/Eigen/src/LU/arch/InverseSize4.h +0 -351
  272. xtgeo/include/eigen3/Eigen/src/MetisSupport/MetisSupport.h +0 -137
  273. xtgeo/include/eigen3/Eigen/src/OrderingMethods/Amd.h +0 -435
  274. xtgeo/include/eigen3/Eigen/src/OrderingMethods/Eigen_Colamd.h +0 -1863
  275. xtgeo/include/eigen3/Eigen/src/OrderingMethods/Ordering.h +0 -153
  276. xtgeo/include/eigen3/Eigen/src/PaStiXSupport/PaStiXSupport.h +0 -678
  277. xtgeo/include/eigen3/Eigen/src/PardisoSupport/PardisoSupport.h +0 -545
  278. xtgeo/include/eigen3/Eigen/src/QR/ColPivHouseholderQR.h +0 -674
  279. xtgeo/include/eigen3/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +0 -97
  280. xtgeo/include/eigen3/Eigen/src/QR/CompleteOrthogonalDecomposition.h +0 -635
  281. xtgeo/include/eigen3/Eigen/src/QR/FullPivHouseholderQR.h +0 -713
  282. xtgeo/include/eigen3/Eigen/src/QR/HouseholderQR.h +0 -434
  283. xtgeo/include/eigen3/Eigen/src/QR/HouseholderQR_LAPACKE.h +0 -68
  284. xtgeo/include/eigen3/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +0 -335
  285. xtgeo/include/eigen3/Eigen/src/SVD/BDCSVD.h +0 -1366
  286. xtgeo/include/eigen3/Eigen/src/SVD/JacobiSVD.h +0 -812
  287. xtgeo/include/eigen3/Eigen/src/SVD/JacobiSVD_LAPACKE.h +0 -91
  288. xtgeo/include/eigen3/Eigen/src/SVD/SVDBase.h +0 -376
  289. xtgeo/include/eigen3/Eigen/src/SVD/UpperBidiagonalization.h +0 -414
  290. xtgeo/include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky.h +0 -697
  291. xtgeo/include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +0 -174
  292. xtgeo/include/eigen3/Eigen/src/SparseCore/AmbiVector.h +0 -378
  293. xtgeo/include/eigen3/Eigen/src/SparseCore/CompressedStorage.h +0 -274
  294. xtgeo/include/eigen3/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +0 -352
  295. xtgeo/include/eigen3/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  296. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseAssign.h +0 -270
  297. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseBlock.h +0 -571
  298. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseColEtree.h +0 -206
  299. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseCompressedBase.h +0 -370
  300. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +0 -722
  301. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +0 -150
  302. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseDenseProduct.h +0 -342
  303. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseDiagonalProduct.h +0 -138
  304. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseDot.h +0 -98
  305. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseFuzzy.h +0 -29
  306. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseMap.h +0 -305
  307. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseMatrix.h +0 -1518
  308. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseMatrixBase.h +0 -398
  309. xtgeo/include/eigen3/Eigen/src/SparseCore/SparsePermutation.h +0 -178
  310. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseProduct.h +0 -181
  311. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseRedux.h +0 -49
  312. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseRef.h +0 -397
  313. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseSelfAdjointView.h +0 -659
  314. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseSolverBase.h +0 -124
  315. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +0 -198
  316. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseTranspose.h +0 -92
  317. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseTriangularView.h +0 -189
  318. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseUtil.h +0 -186
  319. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseVector.h +0 -478
  320. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseView.h +0 -254
  321. xtgeo/include/eigen3/Eigen/src/SparseCore/TriangularSolver.h +0 -315
  322. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU.h +0 -923
  323. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLUImpl.h +0 -66
  324. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_Memory.h +0 -226
  325. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_Structs.h +0 -110
  326. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +0 -375
  327. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_Utils.h +0 -80
  328. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_column_bmod.h +0 -181
  329. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_column_dfs.h +0 -179
  330. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +0 -107
  331. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  332. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +0 -126
  333. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +0 -130
  334. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_panel_bmod.h +0 -223
  335. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_panel_dfs.h +0 -258
  336. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_pivotL.h +0 -137
  337. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_pruneL.h +0 -136
  338. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_relax_snode.h +0 -83
  339. xtgeo/include/eigen3/Eigen/src/SparseQR/SparseQR.h +0 -758
  340. xtgeo/include/eigen3/Eigen/src/StlSupport/StdDeque.h +0 -116
  341. xtgeo/include/eigen3/Eigen/src/StlSupport/StdList.h +0 -106
  342. xtgeo/include/eigen3/Eigen/src/StlSupport/StdVector.h +0 -131
  343. xtgeo/include/eigen3/Eigen/src/StlSupport/details.h +0 -84
  344. xtgeo/include/eigen3/Eigen/src/SuperLUSupport/SuperLUSupport.h +0 -1025
  345. xtgeo/include/eigen3/Eigen/src/UmfPackSupport/UmfPackSupport.h +0 -642
  346. xtgeo/include/eigen3/Eigen/src/misc/Image.h +0 -82
  347. xtgeo/include/eigen3/Eigen/src/misc/Kernel.h +0 -79
  348. xtgeo/include/eigen3/Eigen/src/misc/RealSvd2x2.h +0 -55
  349. xtgeo/include/eigen3/Eigen/src/misc/blas.h +0 -440
  350. xtgeo/include/eigen3/Eigen/src/misc/lapack.h +0 -152
  351. xtgeo/include/eigen3/Eigen/src/misc/lapacke.h +0 -16292
  352. xtgeo/include/eigen3/Eigen/src/misc/lapacke_mangling.h +0 -17
  353. xtgeo/include/eigen3/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
  354. xtgeo/include/eigen3/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
  355. xtgeo/include/eigen3/Eigen/src/plugins/BlockMethods.h +0 -1442
  356. xtgeo/include/eigen3/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  357. xtgeo/include/eigen3/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -177
  358. xtgeo/include/eigen3/Eigen/src/plugins/IndexedViewMethods.h +0 -262
  359. xtgeo/include/eigen3/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  360. xtgeo/include/eigen3/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
  361. xtgeo/include/eigen3/Eigen/src/plugins/ReshapedMethods.h +0 -149
  362. xtgeo/include/eigen3/signature_of_eigen3_matrix_library +0 -1
  363. xtgeo/include/eigen3/unsupported/Eigen/AdolcForward +0 -159
  364. xtgeo/include/eigen3/unsupported/Eigen/AlignedVector3 +0 -234
  365. xtgeo/include/eigen3/unsupported/Eigen/ArpackSupport +0 -30
  366. xtgeo/include/eigen3/unsupported/Eigen/AutoDiff +0 -46
  367. xtgeo/include/eigen3/unsupported/Eigen/BVH +0 -95
  368. xtgeo/include/eigen3/unsupported/Eigen/CXX11/Tensor +0 -137
  369. xtgeo/include/eigen3/unsupported/Eigen/CXX11/TensorSymmetry +0 -42
  370. xtgeo/include/eigen3/unsupported/Eigen/CXX11/ThreadPool +0 -74
  371. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +0 -554
  372. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +0 -329
  373. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +0 -247
  374. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +0 -1176
  375. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +0 -1559
  376. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +0 -1093
  377. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +0 -518
  378. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +0 -377
  379. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +0 -1023
  380. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +0 -73
  381. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +0 -6
  382. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +0 -1413
  383. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +0 -575
  384. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +0 -1650
  385. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +0 -1679
  386. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +0 -456
  387. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +0 -1132
  388. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +0 -544
  389. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +0 -214
  390. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +0 -347
  391. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +0 -137
  392. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +0 -6
  393. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +0 -104
  394. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +0 -389
  395. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +0 -1048
  396. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +0 -409
  397. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h +0 -236
  398. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +0 -490
  399. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +0 -236
  400. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +0 -983
  401. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +0 -703
  402. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +0 -388
  403. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +0 -669
  404. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +0 -379
  405. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +0 -237
  406. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +0 -191
  407. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +0 -488
  408. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +0 -302
  409. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h +0 -33
  410. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +0 -99
  411. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h +0 -44
  412. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +0 -79
  413. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +0 -603
  414. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +0 -738
  415. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +0 -247
  416. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +0 -82
  417. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +0 -263
  418. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +0 -216
  419. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +0 -98
  420. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +0 -327
  421. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +0 -311
  422. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +0 -1102
  423. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +0 -708
  424. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +0 -291
  425. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +0 -322
  426. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +0 -998
  427. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +0 -6
  428. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +0 -966
  429. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +0 -582
  430. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +0 -454
  431. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +0 -465
  432. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +0 -528
  433. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h +0 -513
  434. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +0 -471
  435. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +0 -161
  436. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +0 -346
  437. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +0 -303
  438. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +0 -264
  439. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +0 -249
  440. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +0 -629
  441. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +0 -293
  442. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +0 -236
  443. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h +0 -338
  444. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h +0 -669
  445. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h +0 -67
  446. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +0 -249
  447. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +0 -486
  448. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +0 -236
  449. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h +0 -23
  450. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h +0 -40
  451. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h +0 -301
  452. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h +0 -48
  453. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h +0 -20
  454. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +0 -537
  455. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h +0 -88
  456. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/util/EmulateArray.h +0 -261
  457. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h +0 -158
  458. xtgeo/include/eigen3/unsupported/Eigen/EulerAngles +0 -43
  459. xtgeo/include/eigen3/unsupported/Eigen/FFT +0 -419
  460. xtgeo/include/eigen3/unsupported/Eigen/IterativeSolvers +0 -51
  461. xtgeo/include/eigen3/unsupported/Eigen/KroneckerProduct +0 -36
  462. xtgeo/include/eigen3/unsupported/Eigen/LevenbergMarquardt +0 -49
  463. xtgeo/include/eigen3/unsupported/Eigen/MPRealSupport +0 -213
  464. xtgeo/include/eigen3/unsupported/Eigen/MatrixFunctions +0 -504
  465. xtgeo/include/eigen3/unsupported/Eigen/MoreVectorization +0 -24
  466. xtgeo/include/eigen3/unsupported/Eigen/NonLinearOptimization +0 -140
  467. xtgeo/include/eigen3/unsupported/Eigen/NumericalDiff +0 -56
  468. xtgeo/include/eigen3/unsupported/Eigen/OpenGLSupport +0 -322
  469. xtgeo/include/eigen3/unsupported/Eigen/Polynomials +0 -137
  470. xtgeo/include/eigen3/unsupported/Eigen/Skyline +0 -39
  471. xtgeo/include/eigen3/unsupported/Eigen/SparseExtra +0 -54
  472. xtgeo/include/eigen3/unsupported/Eigen/SpecialFunctions +0 -103
  473. xtgeo/include/eigen3/unsupported/Eigen/Splines +0 -35
  474. xtgeo/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h +0 -108
  475. xtgeo/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +0 -730
  476. xtgeo/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h +0 -220
  477. xtgeo/include/eigen3/unsupported/Eigen/src/BVH/BVAlgorithms.h +0 -293
  478. xtgeo/include/eigen3/unsupported/Eigen/src/BVH/KdBVH.h +0 -223
  479. xtgeo/include/eigen3/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h +0 -790
  480. xtgeo/include/eigen3/unsupported/Eigen/src/EulerAngles/EulerAngles.h +0 -355
  481. xtgeo/include/eigen3/unsupported/Eigen/src/EulerAngles/EulerSystem.h +0 -305
  482. xtgeo/include/eigen3/unsupported/Eigen/src/FFT/ei_fftw_impl.h +0 -261
  483. xtgeo/include/eigen3/unsupported/Eigen/src/FFT/ei_kissfft_impl.h +0 -449
  484. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h +0 -187
  485. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/DGMRES.h +0 -511
  486. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/GMRES.h +0 -335
  487. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IDRS.h +0 -436
  488. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h +0 -90
  489. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IterationController.h +0 -154
  490. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/MINRES.h +0 -267
  491. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/Scaling.h +0 -193
  492. xtgeo/include/eigen3/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h +0 -305
  493. xtgeo/include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h +0 -84
  494. xtgeo/include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h +0 -202
  495. xtgeo/include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h +0 -160
  496. xtgeo/include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h +0 -188
  497. xtgeo/include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h +0 -396
  498. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +0 -441
  499. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +0 -569
  500. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +0 -373
  501. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +0 -705
  502. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +0 -368
  503. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/StemFunction.h +0 -117
  504. xtgeo/include/eigen3/unsupported/Eigen/src/MoreVectorization/MathFunctions.h +0 -95
  505. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h +0 -601
  506. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h +0 -657
  507. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/chkder.h +0 -66
  508. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/covar.h +0 -70
  509. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/dogleg.h +0 -107
  510. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h +0 -79
  511. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/lmpar.h +0 -298
  512. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h +0 -91
  513. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h +0 -30
  514. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1updt.h +0 -99
  515. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h +0 -49
  516. xtgeo/include/eigen3/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h +0 -130
  517. xtgeo/include/eigen3/unsupported/Eigen/src/Polynomials/Companion.h +0 -280
  518. xtgeo/include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +0 -428
  519. xtgeo/include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialUtils.h +0 -143
  520. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h +0 -352
  521. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrix.h +0 -862
  522. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h +0 -212
  523. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineProduct.h +0 -295
  524. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineStorage.h +0 -259
  525. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineUtil.h +0 -89
  526. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h +0 -122
  527. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h +0 -1079
  528. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h +0 -404
  529. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/MarketIO.h +0 -282
  530. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h +0 -247
  531. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/RandomSetter.h +0 -349
  532. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h +0 -286
  533. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h +0 -68
  534. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h +0 -357
  535. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h +0 -66
  536. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h +0 -1959
  537. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h +0 -118
  538. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h +0 -67
  539. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h +0 -167
  540. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h +0 -58
  541. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h +0 -330
  542. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h +0 -58
  543. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +0 -2045
  544. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h +0 -79
  545. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h +0 -46
  546. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h +0 -16
  547. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h +0 -46
  548. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h +0 -16
  549. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h +0 -369
  550. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h +0 -54
  551. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h +0 -34
  552. xtgeo/include/eigen3/unsupported/Eigen/src/Splines/Spline.h +0 -507
  553. xtgeo/include/eigen3/unsupported/Eigen/src/Splines/SplineFitting.h +0 -431
  554. xtgeo/include/eigen3/unsupported/Eigen/src/Splines/SplineFwd.h +0 -93
  555. xtgeo/share/eigen3/cmake/Eigen3Config.cmake +0 -37
  556. xtgeo/share/eigen3/cmake/Eigen3ConfigVersion.cmake +0 -65
  557. xtgeo/share/eigen3/cmake/UseEigen3.cmake +0 -6
  558. xtgeo/share/pkgconfig/eigen3.pc +0 -9
  559. xtgeo-4.10.0.dist-info/RECORD +0 -652
  560. {xtgeo-4.10.0.dist-info → xtgeo-4.10.1.dist-info}/WHEEL +0 -0
  561. {xtgeo-4.10.0.dist-info → xtgeo-4.10.1.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,1413 +0,0 @@
1
- // This file is part of Eigen, a lightweight C++ template library
2
- // for linear algebra.
3
- //
4
- // Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
- // Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
6
- // Copyright (C) 2014 Eric Martin <eric@ericmart.in>
7
- //
8
- // This Source Code Form is subject to the terms of the Mozilla
9
- // Public License v. 2.0. If a copy of the MPL was not distributed
10
- // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11
-
12
- #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
13
- #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
14
-
15
- #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
16
-
17
- namespace Eigen {
18
-
19
- template<typename Scalar, typename Index, typename LhsMapper,
20
- typename RhsMapper, typename OutputMapper, bool needs_edge_check>
21
- __device__ EIGEN_STRONG_INLINE void
22
- EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
23
- const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
24
- const Index m_size, const Index n_size, const Index k_size) {
25
-
26
- const Index m_block_idx = blockIdx.x;
27
- const Index n_block_idx = blockIdx.y;
28
-
29
- const Index base_m = 64 * m_block_idx;
30
- const Index base_n = 64 * n_block_idx;
31
-
32
- // declare and initialize 64 registers for output 8x8 block
33
-
34
- // prefetch registers
35
- Scalar lhs_pf0;
36
- Scalar lhs_pf1;
37
- Scalar lhs_pf2;
38
- Scalar lhs_pf3;
39
- Scalar lhs_pf4;
40
- Scalar lhs_pf5;
41
- Scalar lhs_pf6;
42
- Scalar lhs_pf7;
43
-
44
- Scalar rhs_pf0;
45
- Scalar rhs_pf1;
46
- Scalar rhs_pf2;
47
- Scalar rhs_pf3;
48
- Scalar rhs_pf4;
49
- Scalar rhs_pf5;
50
- Scalar rhs_pf6;
51
- Scalar rhs_pf7;
52
-
53
- // shared memory is formatted
54
- // (contract idx in block, nocontract idx in block, block idx)
55
- // where block idx is column major. This transposition limits the number of
56
- // bank conflicts when reading the LHS. The core idea is that since the contracting
57
- // index is shared by both sides, then the contracting index should be in threadIdx.x.
58
-
59
- // On the LHS, we pad each row inside of each block with an extra element. This makes
60
- // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
61
- // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
62
-
63
- // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
64
- // conflicts on writes and also none on reads.
65
-
66
- // storage indices
67
- const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
68
- const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
69
-
70
- const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
71
- const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
72
- const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
73
- const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
74
- const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
75
- const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
76
- const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
77
- const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
78
-
79
- const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
80
- const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
81
- const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
82
- const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
83
- const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
84
- const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
85
- const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
86
- const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
87
-
88
- // in the loading code, the following variables are important:
89
- // threadIdx.x: the vertical position in an 8x8 block
90
- // threadIdx.y: the vertical index of the 8x8 block in the grid
91
- // threadIdx.z: the horizontal position in an 8x8 block
92
- // k: the horizontal index of the 8x8 block in the grid
93
- //
94
- // The k parameter is implicit (it was the loop counter for a loop that went
95
- // from 0 to <8, but now that loop is unrolled in the below code.
96
-
97
- const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
98
- const Index lhs_vert = base_m + load_idx_vert;
99
-
100
- #define prefetchIntoRegisters(base_k) \
101
- { \
102
- lhs_pf0 = conv(0); \
103
- lhs_pf1 = conv(0); \
104
- lhs_pf2 = conv(0); \
105
- lhs_pf3 = conv(0); \
106
- lhs_pf4 = conv(0); \
107
- lhs_pf5 = conv(0); \
108
- lhs_pf6 = conv(0); \
109
- lhs_pf7 = conv(0); \
110
- \
111
- rhs_pf0 = conv(0); \
112
- rhs_pf1 = conv(0); \
113
- rhs_pf2 = conv(0); \
114
- rhs_pf3 = conv(0); \
115
- rhs_pf4 = conv(0); \
116
- rhs_pf5 = conv(0); \
117
- rhs_pf6 = conv(0); \
118
- rhs_pf7 = conv(0); \
119
- \
120
- if (!needs_edge_check || lhs_vert < m_size) { \
121
- const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \
122
- const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \
123
- const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \
124
- const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \
125
- const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \
126
- const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \
127
- const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \
128
- const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \
129
- \
130
- if (!needs_edge_check || lhs_horiz_7 < k_size) { \
131
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
132
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
133
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
134
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
135
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
136
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
137
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
138
- lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
139
- } else if (lhs_horiz_6 < k_size) { \
140
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
141
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
142
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
143
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
144
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
145
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
146
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
147
- } else if (lhs_horiz_5 < k_size) { \
148
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
149
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
150
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
151
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
152
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
153
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
154
- } else if (lhs_horiz_4 < k_size) { \
155
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
156
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
157
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
158
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
159
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
160
- } else if (lhs_horiz_3 < k_size) { \
161
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
162
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
163
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
164
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
165
- } else if (lhs_horiz_2 < k_size) { \
166
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
167
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
168
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
169
- } else if (lhs_horiz_1 < k_size) { \
170
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
171
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
172
- } else if (lhs_horiz_0 < k_size) { \
173
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
174
- } \
175
- } \
176
- \
177
- const Index rhs_vert = base_k + load_idx_vert; \
178
- if (!needs_edge_check || rhs_vert < k_size) { \
179
- const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \
180
- const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \
181
- const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \
182
- const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \
183
- const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \
184
- const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \
185
- const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \
186
- const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \
187
- \
188
- if (rhs_horiz_7 < n_size) { \
189
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
190
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
191
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
192
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
193
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
194
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
195
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
196
- rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
197
- } else if (rhs_horiz_6 < n_size) { \
198
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
199
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
200
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
201
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
202
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
203
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
204
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
205
- } else if (rhs_horiz_5 < n_size) { \
206
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
207
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
208
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
209
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
210
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
211
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
212
- } else if (rhs_horiz_4 < n_size) { \
213
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
214
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
215
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
216
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
217
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
218
- } else if (rhs_horiz_3 < n_size) { \
219
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
220
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
221
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
222
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
223
- } else if (rhs_horiz_2 < n_size) { \
224
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
225
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
226
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
227
- } else if (rhs_horiz_1 < n_size) { \
228
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
229
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
230
- } else if (rhs_horiz_0 < n_size) { \
231
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
232
- } \
233
- } \
234
- } \
235
-
236
- #define writeRegToShmem(_) \
237
- lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
238
- rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
239
- \
240
- lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
241
- rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
242
- \
243
- lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
244
- rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
245
- \
246
- lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
247
- rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
248
- \
249
- lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
250
- rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
251
- \
252
- lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
253
- rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
254
- \
255
- lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
256
- rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
257
- \
258
- lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
259
- rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
260
-
261
- // declare and initialize result array
262
- #define res(i, j) _res_##i##j
263
- #define initResultRow(i) \
264
- Scalar res(i, 0) = conv(0); \
265
- Scalar res(i, 1) = conv(0); \
266
- Scalar res(i, 2) = conv(0); \
267
- Scalar res(i, 3) = conv(0); \
268
- Scalar res(i, 4) = conv(0); \
269
- Scalar res(i, 5) = conv(0); \
270
- Scalar res(i, 6) = conv(0); \
271
- Scalar res(i, 7) = conv(0); \
272
-
273
- internal::scalar_cast_op<int, Scalar> conv;
274
- initResultRow(0);
275
- initResultRow(1);
276
- initResultRow(2);
277
- initResultRow(3);
278
- initResultRow(4);
279
- initResultRow(5);
280
- initResultRow(6);
281
- initResultRow(7);
282
- #undef initResultRow
283
-
284
- for (Index base_k = 0; base_k < k_size; base_k += 64) {
285
- // wait for previous iteration to finish with shmem. Despite common sense,
286
- // the code is a bit faster with this here then at bottom of loop
287
- __syncthreads();
288
-
289
- prefetchIntoRegisters(base_k);
290
- writeRegToShmem();
291
-
292
- #undef prefetchIntoRegisters
293
- #undef writeRegToShmem
294
-
295
- // wait for shared mem packing to be done before starting computation
296
- __syncthreads();
297
-
298
- // compute 8x8 matrix product by outer product. This involves packing one column
299
- // of LHS and one row of RHS into registers (takes 16 registers).
300
-
301
- #define lcol(i) _lcol##i
302
- Scalar lcol(0);
303
- Scalar lcol(1);
304
- Scalar lcol(2);
305
- Scalar lcol(3);
306
- Scalar lcol(4);
307
- Scalar lcol(5);
308
- Scalar lcol(6);
309
- Scalar lcol(7);
310
-
311
- #define rrow(j) _rrow##j
312
- Scalar rrow(0);
313
- Scalar rrow(1);
314
- Scalar rrow(2);
315
- Scalar rrow(3);
316
- Scalar rrow(4);
317
- Scalar rrow(5);
318
- Scalar rrow(6);
319
- Scalar rrow(7);
320
-
321
- // Now x corresponds to k, y to m, and z to n
322
- const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
323
- const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
324
-
325
- #define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
326
- #define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
327
-
328
- #define loadData(i, j) \
329
- lcol(0) = lhs_element(0, j); \
330
- rrow(0) = rhs_element(i, 0); \
331
- lcol(1) = lhs_element(1, j); \
332
- rrow(1) = rhs_element(i, 1); \
333
- lcol(2) = lhs_element(2, j); \
334
- rrow(2) = rhs_element(i, 2); \
335
- lcol(3) = lhs_element(3, j); \
336
- rrow(3) = rhs_element(i, 3); \
337
- lcol(4) = lhs_element(4, j); \
338
- rrow(4) = rhs_element(i, 4); \
339
- lcol(5) = lhs_element(5, j); \
340
- rrow(5) = rhs_element(i, 5); \
341
- lcol(6) = lhs_element(6, j); \
342
- rrow(6) = rhs_element(i, 6); \
343
- lcol(7) = lhs_element(7, j); \
344
- rrow(7) = rhs_element(i, 7); \
345
-
346
- #define computeCol(j) \
347
- res(0, j) += lcol(0) * rrow(j); \
348
- res(1, j) += lcol(1) * rrow(j); \
349
- res(2, j) += lcol(2) * rrow(j); \
350
- res(3, j) += lcol(3) * rrow(j); \
351
- res(4, j) += lcol(4) * rrow(j); \
352
- res(5, j) += lcol(5) * rrow(j); \
353
- res(6, j) += lcol(6) * rrow(j); \
354
- res(7, j) += lcol(7) * rrow(j); \
355
-
356
- #define computePass(i) \
357
- loadData(i, i); \
358
- \
359
- computeCol(0); \
360
- computeCol(1); \
361
- computeCol(2); \
362
- computeCol(3); \
363
- computeCol(4); \
364
- computeCol(5); \
365
- computeCol(6); \
366
- computeCol(7); \
367
-
368
- computePass(0);
369
- computePass(1);
370
- computePass(2);
371
- computePass(3);
372
- computePass(4);
373
- computePass(5);
374
- computePass(6);
375
- computePass(7);
376
-
377
- #undef lcol
378
- #undef rrow
379
- #undef lhs_element
380
- #undef rhs_element
381
- #undef loadData
382
- #undef computeCol
383
- #undef computePass
384
- } // end loop over k
385
-
386
- // we've now iterated over all of the large (ie width 64) k blocks and
387
- // accumulated results in registers. At this point thread (x, y, z) contains
388
- // the sum across all big k blocks of the product of little k block of index (x, y)
389
- // with block of index (y, z). To compute the final output, we need to reduce
390
- // the 8 threads over y by summation.
391
- #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
392
- #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
393
- #else
394
- #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
395
- #endif
396
-
397
- #define reduceRow(i, mask) \
398
- shuffleInc(i, 0, mask); \
399
- shuffleInc(i, 1, mask); \
400
- shuffleInc(i, 2, mask); \
401
- shuffleInc(i, 3, mask); \
402
- shuffleInc(i, 4, mask); \
403
- shuffleInc(i, 5, mask); \
404
- shuffleInc(i, 6, mask); \
405
- shuffleInc(i, 7, mask); \
406
-
407
- #define reduceMatrix(mask) \
408
- reduceRow(0, mask); \
409
- reduceRow(1, mask); \
410
- reduceRow(2, mask); \
411
- reduceRow(3, mask); \
412
- reduceRow(4, mask); \
413
- reduceRow(5, mask); \
414
- reduceRow(6, mask); \
415
- reduceRow(7, mask); \
416
-
417
- // actually perform the reduction, now each thread of index (_, y, z)
418
- // contains the correct values in its registers that belong in the output
419
- // block
420
- reduceMatrix(1);
421
- reduceMatrix(2);
422
- reduceMatrix(4);
423
-
424
- #undef shuffleInc
425
- #undef reduceRow
426
- #undef reduceMatrix
427
-
428
- // now we need to copy the 64 values into main memory. We can't split work
429
- // among threads because all variables are in registers. There's 2 ways
430
- // to do this:
431
- // (1) have 1 thread do 64 writes from registers into global memory
432
- // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
433
- // each do 8 writes into global memory. We can just overwrite the shared
434
- // memory from the problem we just solved.
435
- // (2) is slightly faster than (1) due to less branching and more ILP
436
-
437
- // TODO: won't yield much gain, but could just use currently unused shared mem
438
- // and then we won't have to sync
439
- // wait for shared mem to be out of use
440
- __syncthreads();
441
-
442
- #define writeResultShmem(i, j) \
443
- lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
444
-
445
- #define writeRow(i) \
446
- writeResultShmem(i, 0); \
447
- writeResultShmem(i, 1); \
448
- writeResultShmem(i, 2); \
449
- writeResultShmem(i, 3); \
450
- writeResultShmem(i, 4); \
451
- writeResultShmem(i, 5); \
452
- writeResultShmem(i, 6); \
453
- writeResultShmem(i, 7); \
454
-
455
- if (threadIdx.x == 0) {
456
- writeRow(0);
457
- writeRow(1);
458
- writeRow(2);
459
- writeRow(3);
460
- writeRow(4);
461
- writeRow(5);
462
- writeRow(6);
463
- writeRow(7);
464
- }
465
- #undef writeResultShmem
466
- #undef writeRow
467
-
468
- const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
469
- const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
470
-
471
- if (threadIdx.x < max_i_write) {
472
- if (max_j_write == 8) {
473
- // TODO: can i trade bank conflicts for coalesced writes?
474
- Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
475
- Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
476
- Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
477
- Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
478
- Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
479
- Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
480
- Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
481
- Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
482
-
483
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
484
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
485
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
486
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
487
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
488
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
489
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
490
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
491
- } else {
492
- #pragma unroll 7
493
- for (int j = 0; j < max_j_write; j++) {
494
- Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
495
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
496
- }
497
- }
498
- }
499
- #undef res
500
- }
501
-
502
-
503
- template<typename Scalar, typename Index, typename LhsMapper,
504
- typename RhsMapper, typename OutputMapper>
505
- __global__ void
506
- #if defined(EIGEN_HIPCC)
507
- __launch_bounds__(512, 1)
508
- #else
509
- __launch_bounds__(512)
510
- #endif
511
- EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
512
- const OutputMapper output,
513
- const Index m_size, const Index n_size, const Index k_size) {
514
- __shared__ Scalar lhs_shmem[72 * 64];
515
- __shared__ Scalar rhs_shmem[72 * 64];
516
-
517
- const Index m_block_idx = blockIdx.x;
518
- const Index n_block_idx = blockIdx.y;
519
-
520
- const Index base_m = 64 * m_block_idx;
521
- const Index base_n = 64 * n_block_idx;
522
-
523
- if (base_m + 63 < m_size && base_n + 63 < n_size) {
524
- EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
525
- } else {
526
- EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
527
- }
528
- }
529
-
530
-
531
- template<typename Index, typename LhsMapper,
532
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
533
- bool CHECK_RHS_BOUNDARY>
534
- __device__ __forceinline__ void
535
- EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
536
- const OutputMapper output, float2 lhs_shmem2[][16],
537
- float2 rhs_shmem2[][8], const Index m_size,
538
- const Index n_size, const Index k_size,
539
- const Index base_m, const Index base_n) {
540
-
541
- // prefetch registers
542
- float4 lhs_pf0, rhs_pf0;
543
-
544
- float4 results[4];
545
- for (int i=0; i < 4; i++) {
546
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
547
- }
548
-
549
- #define prefetch_lhs(reg, row, col) \
550
- if (!CHECK_LHS_BOUNDARY) { \
551
- if (col < k_size) { \
552
- reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
553
- } \
554
- } else { \
555
- if (col < k_size) { \
556
- if (row + 3 < m_size) { \
557
- reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
558
- } else if (row + 2 < m_size) { \
559
- reg.x =lhs(row + 0, col); \
560
- reg.y =lhs(row + 1, col); \
561
- reg.z =lhs(row + 2, col); \
562
- } else if (row + 1 < m_size) { \
563
- reg.x =lhs(row + 0, col); \
564
- reg.y =lhs(row + 1, col); \
565
- } else if (row < m_size) { \
566
- reg.x =lhs(row + 0, col); \
567
- } \
568
- } \
569
- } \
570
-
571
- Index lhs_vert = base_m+threadIdx.x*4;
572
-
573
- for (Index k = 0; k < k_size; k += 16) {
574
-
575
- lhs_pf0 = internal::pset1<float4>(0);
576
- rhs_pf0 = internal::pset1<float4>(0);
577
-
578
- Index lhs_horiz = threadIdx.y+k;
579
- prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
580
-
581
- Index rhs_vert = k+(threadIdx.x%4)*4;
582
- Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
583
-
584
- if (!CHECK_RHS_BOUNDARY) {
585
- if ((rhs_vert + 3) < k_size) {
586
- // just CHECK_RHS_BOUNDARY
587
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
588
- } else if (rhs_vert + 2 < k_size) {
589
- // just CHECK_RHS_BOUNDARY
590
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
591
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
592
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
593
- } else if (rhs_vert + 1 < k_size) {
594
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
595
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
596
- } else if (rhs_vert < k_size) {
597
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
598
- }
599
- } else {
600
- if (rhs_horiz0 < n_size) {
601
- if ((rhs_vert + 3) < k_size) {
602
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
603
- } else if ((rhs_vert + 2) < k_size) {
604
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
605
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
606
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
607
- } else if ((rhs_vert + 1) < k_size) {
608
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
609
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
610
- } else if (rhs_vert < k_size) {
611
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
612
- }
613
- }
614
- }
615
- float x1, x2 ;
616
- // the following can be a bitwise operation..... some day.
617
- if((threadIdx.x%8) < 4) {
618
- x1 = rhs_pf0.y;
619
- x2 = rhs_pf0.w;
620
- } else {
621
- x1 = rhs_pf0.x;
622
- x2 = rhs_pf0.z;
623
- }
624
- #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
625
- x1 = __shfl_xor(x1, 4);
626
- x2 = __shfl_xor(x2, 4);
627
- #else
628
- x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
629
- x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
630
- #endif
631
- if((threadIdx.x%8) < 4) {
632
- rhs_pf0.y = x1;
633
- rhs_pf0.w = x2;
634
- } else {
635
- rhs_pf0.x = x1;
636
- rhs_pf0.z = x2;
637
- }
638
-
639
- // We have 64 features.
640
- // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
641
- // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
642
- // ...
643
- // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
644
- // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
645
- // ...
646
- rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
647
- rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
648
-
649
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
650
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
651
- // ...
652
- // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
653
- // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
654
- // ...
655
-
656
- lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
657
- lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
658
-
659
-
660
- #define add_vals(fl1, fl2, fr1, fr2)\
661
- results[0].x += fl1.x * fr1.x;\
662
- results[0].y += fl1.y * fr1.x;\
663
- results[0].z += fl2.x * fr1.x;\
664
- results[0].w += fl2.y * fr1.x;\
665
- \
666
- results[1].x += fl1.x * fr1.y;\
667
- results[1].y += fl1.y * fr1.y;\
668
- results[1].z += fl2.x * fr1.y;\
669
- results[1].w += fl2.y * fr1.y;\
670
- \
671
- results[2].x += fl1.x * fr2.x;\
672
- results[2].y += fl1.y * fr2.x;\
673
- results[2].z += fl2.x * fr2.x;\
674
- results[2].w += fl2.y * fr2.x;\
675
- \
676
- results[3].x += fl1.x * fr2.y;\
677
- results[3].y += fl1.y * fr2.y;\
678
- results[3].z += fl2.x * fr2.y;\
679
- results[3].w += fl2.y * fr2.y;\
680
-
681
- __syncthreads();
682
-
683
- // Do the multiplies.
684
- #pragma unroll
685
- for (int koff = 0; koff < 16; koff ++) {
686
- // 32 x threads.
687
- float2 fl1 = lhs_shmem2[koff][threadIdx.x];
688
- float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
689
-
690
- int start_feature = threadIdx.y * 4;
691
- float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
692
- float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
693
-
694
- add_vals(fl1, fl2, fr1, fr2)
695
- }
696
- __syncthreads();
697
- }
698
-
699
- #undef prefetch_lhs
700
- #undef add_vals
701
-
702
- Index horiz_base = threadIdx.y*4+base_n;
703
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
704
- for (int i = 0; i < 4; i++) {
705
- output(lhs_vert, horiz_base + i) = results[i].x;
706
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
707
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
708
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
709
- }
710
- } else if (!CHECK_RHS_BOUNDARY) {
711
- // CHECK LHS
712
- if (lhs_vert + 3 < m_size) {
713
- for (int i = 0; i < 4; i++) {
714
- output(lhs_vert, horiz_base + i) = results[i].x;
715
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
716
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
717
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
718
- }
719
- } else if (lhs_vert + 2 < m_size) {
720
- for (int i = 0; i < 4; i++) {
721
- output(lhs_vert, horiz_base + i) = results[i].x;
722
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
723
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
724
- }
725
- } else if (lhs_vert + 1 < m_size) {
726
- for (int i = 0; i < 4; i++) {
727
- output(lhs_vert, horiz_base + i) = results[i].x;
728
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
729
- }
730
- } else if (lhs_vert < m_size) {
731
- for (int i = 0; i < 4; i++) {
732
- output(lhs_vert, horiz_base + i) = results[i].x;
733
- }
734
- }
735
- } else if (!CHECK_LHS_BOUNDARY) {
736
- // CHECK RHS
737
- /*
738
- int ncols_rem = fminf(n_size- horiz_base, 4);
739
- for (int i = 0; i < ncols_rem; i++) {
740
- output(lhs_vert, horiz_base + i) = results[i].x;
741
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
742
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
743
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
744
- }*/
745
- for (int i = 0; i < 4; i++) {
746
- if (horiz_base+i < n_size) {
747
- output(lhs_vert, horiz_base + i) = results[i].x;
748
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
749
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
750
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
751
- }
752
- }
753
- } else {
754
- // CHECK both boundaries.
755
- for (int i = 0; i < 4; i++) {
756
- if (horiz_base+i < n_size) {
757
- if (lhs_vert < m_size)
758
- output(lhs_vert, horiz_base + i) = results[i].x;
759
- if (lhs_vert + 1 < m_size)
760
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
761
- if (lhs_vert + 2 < m_size)
762
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
763
- if (lhs_vert + 3 < m_size)
764
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
765
- }
766
- }
767
- }
768
- }
769
-
770
-
771
- template<typename Index, typename LhsMapper,
772
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
773
- bool CHECK_RHS_BOUNDARY>
774
- __device__ __forceinline__ void
775
- EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
776
- const OutputMapper output, float2 lhs_shmem2[][32],
777
- float2 rhs_shmem2[][8], const Index m_size,
778
- const Index n_size, const Index k_size,
779
- const Index base_m, const Index base_n) {
780
-
781
- // prefetch registers
782
- float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
783
- float4 rhs_pf0, rhs_pf1;
784
-
785
- float4 results[8];
786
- for (int i=0; i < 8; i++) {
787
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
788
- }
789
-
790
- Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
791
- for (Index k = 0; k < k_size; k += 32) {
792
- lhs_pf0 = internal::pset1<float4>(0);
793
- lhs_pf1 = internal::pset1<float4>(0);
794
- lhs_pf2 = internal::pset1<float4>(0);
795
- lhs_pf3 = internal::pset1<float4>(0);
796
-
797
- rhs_pf0 = internal::pset1<float4>(0);
798
- rhs_pf1 = internal::pset1<float4>(0);
799
-
800
- if (!CHECK_LHS_BOUNDARY) {
801
- if ((threadIdx.y/4+k+24) < k_size) {
802
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
803
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
804
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
805
- lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
806
- } else if ((threadIdx.y/4+k+16) < k_size) {
807
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
808
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
809
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
810
- } else if ((threadIdx.y/4+k+8) < k_size) {
811
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
812
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
813
- } else if ((threadIdx.y/4+k) < k_size) {
814
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
815
- }
816
- } else {
817
- // just CHECK_LHS_BOUNDARY
818
- if (lhs_vert + 3 < m_size) {
819
- if ((threadIdx.y/4+k+24) < k_size) {
820
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
821
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
822
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
823
- lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
824
- } else if ((threadIdx.y/4+k+16) < k_size) {
825
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
826
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
827
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
828
- } else if ((threadIdx.y/4+k+8) < k_size) {
829
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
830
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
831
- } else if ((threadIdx.y/4+k) < k_size) {
832
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
833
- }
834
- } else if (lhs_vert + 2 < m_size) {
835
- if ((threadIdx.y/4+k+24) < k_size) {
836
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
837
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
838
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
839
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
840
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
841
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
842
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
843
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
844
- lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
845
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
846
- lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
847
- lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
848
- } else if ((threadIdx.y/4+k+16) < k_size) {
849
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
850
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
851
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
852
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
853
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
854
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
855
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
856
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
857
- lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
858
- } else if ((threadIdx.y/4+k+8) < k_size) {
859
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
860
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
861
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
862
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
863
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
864
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
865
- } else if ((threadIdx.y/4+k) < k_size) {
866
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
867
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
868
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
869
- }
870
- } else if (lhs_vert + 1 < m_size) {
871
- if ((threadIdx.y/4+k+24) < k_size) {
872
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
873
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
874
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
875
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
876
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
877
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
878
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
879
- lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
880
- } else if ((threadIdx.y/4+k+16) < k_size) {
881
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
882
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
883
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
884
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
885
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
886
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
887
- } else if ((threadIdx.y/4+k+8) < k_size) {
888
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
889
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
890
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
891
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
892
- } else if ((threadIdx.y/4+k) < k_size) {
893
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
894
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
895
- }
896
- } else if (lhs_vert < m_size) {
897
- if ((threadIdx.y/4+k+24) < k_size) {
898
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
899
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
900
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
901
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
902
- } else if ((threadIdx.y/4+k+16) < k_size) {
903
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
904
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
905
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
906
- } else if ((threadIdx.y/4+k+8) < k_size) {
907
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
908
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
909
- } else if ((threadIdx.y/4+k) < k_size) {
910
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
911
- }
912
- }
913
- }
914
- __syncthreads();
915
- Index rhs_vert = k+threadIdx.x*4;
916
- Index rhs_horiz0 = threadIdx.y*2+base_n;
917
- Index rhs_horiz1 = threadIdx.y*2+1+base_n;
918
- if (!CHECK_RHS_BOUNDARY) {
919
- if ((rhs_vert + 3) < k_size) {
920
- // just CHECK_RHS_BOUNDARY
921
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
922
- rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
923
- } else if (rhs_vert + 2 < k_size) {
924
- // just CHECK_RHS_BOUNDARY
925
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
926
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
927
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
928
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
929
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
930
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
931
- } else if (rhs_vert + 1 < k_size) {
932
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
933
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
934
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
935
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
936
- } else if (rhs_vert < k_size) {
937
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
938
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
939
- }
940
- } else {
941
- if (rhs_horiz1 < n_size) {
942
- if ((rhs_vert + 3) < k_size) {
943
- // just CHECK_RHS_BOUNDARY
944
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
945
- rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
946
- } else if (rhs_vert + 2 < k_size) {
947
- // just CHECK_RHS_BOUNDARY
948
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
949
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
950
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
951
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
952
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
953
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
954
- } else if (k+threadIdx.x*4 + 1 < k_size) {
955
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
956
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
957
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
958
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
959
- } else if (k+threadIdx.x*4 < k_size) {
960
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
961
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
962
- }
963
- } else if (rhs_horiz0 < n_size) {
964
- if ((rhs_vert + 3) < k_size) {
965
- // just CHECK_RHS_BOUNDARY
966
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
967
- } else if ((rhs_vert + 2) < k_size) {
968
- // just CHECK_RHS_BOUNDARY
969
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
970
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
971
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
972
- } else if ((rhs_vert + 1) < k_size) {
973
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
974
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
975
- } else if (rhs_vert < k_size) {
976
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
977
- }
978
- }
979
- }
980
- __syncthreads();
981
- // Loaded. Do computation
982
- // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
983
- // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
984
- // ..
985
- // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
986
- rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
987
- // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
988
- // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
989
- // ..
990
- rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
991
- // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
992
- // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
993
- rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
994
- // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
995
- // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
996
- rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
997
-
998
- // LHS.
999
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
1000
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
1001
- // ...
1002
- // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
1003
- // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
1004
-
1005
-
1006
- #define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
1007
- results[0].x += a_feat1.x * f1.x;\
1008
- results[1].x += a_feat1.x * f1.y;\
1009
- results[2].x += a_feat1.x * f2.x;\
1010
- results[3].x += a_feat1.x * f2.y;\
1011
- results[4].x += a_feat1.x * f3.x;\
1012
- results[5].x += a_feat1.x * f3.y;\
1013
- results[6].x += a_feat1.x * f4.x;\
1014
- results[7].x += a_feat1.x * f4.y;\
1015
- \
1016
- results[0].y += a_feat1.y * f1.x;\
1017
- results[1].y += a_feat1.y * f1.y;\
1018
- results[2].y += a_feat1.y * f2.x;\
1019
- results[3].y += a_feat1.y * f2.y;\
1020
- results[4].y += a_feat1.y * f3.x;\
1021
- results[5].y += a_feat1.y * f3.y;\
1022
- results[6].y += a_feat1.y * f4.x;\
1023
- results[7].y += a_feat1.y * f4.y;\
1024
- \
1025
- results[0].z += a_feat2.x * f1.x;\
1026
- results[1].z += a_feat2.x * f1.y;\
1027
- results[2].z += a_feat2.x * f2.x;\
1028
- results[3].z += a_feat2.x * f2.y;\
1029
- results[4].z += a_feat2.x * f3.x;\
1030
- results[5].z += a_feat2.x * f3.y;\
1031
- results[6].z += a_feat2.x * f4.x;\
1032
- results[7].z += a_feat2.x * f4.y;\
1033
- \
1034
- results[0].w += a_feat2.y * f1.x;\
1035
- results[1].w += a_feat2.y * f1.y;\
1036
- results[2].w += a_feat2.y * f2.x;\
1037
- results[3].w += a_feat2.y * f2.y;\
1038
- results[4].w += a_feat2.y * f3.x;\
1039
- results[5].w += a_feat2.y * f3.y;\
1040
- results[6].w += a_feat2.y * f4.x;\
1041
- results[7].w += a_feat2.y * f4.y;\
1042
-
1043
- lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
1044
- lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
1045
- lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
1046
- lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
1047
-
1048
- lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
1049
- lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
1050
- lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
1051
- lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
1052
-
1053
- __syncthreads();
1054
-
1055
- // Do the multiplies.
1056
- #pragma unroll
1057
- for (int koff = 0; koff < 32; koff ++) {
1058
- float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
1059
- float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
1060
-
1061
- // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
1062
- int start_feature = (threadIdx.y / 4) * 8;
1063
-
1064
- float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
1065
- float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
1066
- float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
1067
- float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
1068
-
1069
- add_vals(a3, a4, br1, br2, br3, br4)
1070
- }
1071
- __syncthreads();
1072
- } // end loop over k
1073
-
1074
- __syncthreads();
1075
- Index horiz_base = (threadIdx.y/4)*8+base_n;
1076
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
1077
- for (int i = 0; i < 8; i++) {
1078
- output(lhs_vert, horiz_base + i) = results[i].x;
1079
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1080
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1081
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1082
- }
1083
- } else if (!CHECK_RHS_BOUNDARY) {
1084
- if (lhs_vert + 3 < m_size) {
1085
- for (int i = 0; i < 8; i++) {
1086
- output(lhs_vert, horiz_base + i) = results[i].x;
1087
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1088
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1089
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1090
- }
1091
- } else if (lhs_vert + 2 < m_size) {
1092
- for (int i = 0; i < 8; i++) {
1093
- output(lhs_vert, horiz_base + i) = results[i].x;
1094
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1095
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1096
- }
1097
- } else if (lhs_vert + 1 < m_size) {
1098
- for (int i = 0; i < 8; i++) {
1099
- output(lhs_vert, horiz_base + i) = results[i].x;
1100
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1101
- }
1102
- } else if (lhs_vert < m_size) {
1103
- for (int i = 0; i < 8; i++) {
1104
- output(lhs_vert, horiz_base + i) = results[i].x;
1105
- }
1106
- }
1107
- } else if (!CHECK_LHS_BOUNDARY) {
1108
- // CHECK BOUNDARY_B
1109
- for (int i = 0; i < 8; i++) {
1110
- if (horiz_base + i < n_size) {
1111
- output(lhs_vert, horiz_base + i) = results[i].x;
1112
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1113
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1114
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1115
- }
1116
- }
1117
- } else {
1118
- // CHECK both boundaries.
1119
- for (int i = 0; i < 8; i++) {
1120
- if (horiz_base + i < n_size) {
1121
- if (lhs_vert < m_size)
1122
- output(lhs_vert, horiz_base + i) = results[i].x;
1123
- if (lhs_vert + 1 < m_size)
1124
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1125
- if (lhs_vert + 2 < m_size)
1126
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1127
- if (lhs_vert + 3 < m_size)
1128
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1129
- }
1130
- }
1131
- }
1132
- }
1133
-
1134
-
1135
- template<typename Index, typename LhsMapper,
1136
- typename RhsMapper, typename OutputMapper>
1137
- __global__ void
1138
- #if defined(EIGEN_HIPCC)
1139
- __launch_bounds__(256, 1)
1140
- #else
1141
- __launch_bounds__(256)
1142
- #endif
1143
- EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
1144
- const OutputMapper output,
1145
- const Index m_size, const Index n_size, const Index k_size) {
1146
- __shared__ float2 lhs_shmem[64*32];
1147
- __shared__ float2 rhs_shmem[128*8];
1148
-
1149
- typedef float2 LHS_MEM[64][32];
1150
- typedef float2 RHS_MEM[128][8];
1151
-
1152
- const Index m_block_idx = blockIdx.x;
1153
- const Index n_block_idx = blockIdx.y;
1154
-
1155
- const Index base_m = 128 * m_block_idx;
1156
- const Index base_n = 64 * n_block_idx;
1157
-
1158
- bool check_rhs = (base_n + 63) >= n_size;
1159
- bool check_lhs128 = (base_m + 127) >= m_size;
1160
-
1161
- if (!check_rhs) {
1162
- if (!check_lhs128) {
1163
- // >= 128 rows left
1164
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
1165
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1166
- } else {
1167
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
1168
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1169
- }
1170
- } else {
1171
- if (!check_lhs128) {
1172
- // >= 128 rows left
1173
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
1174
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1175
- } else {
1176
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
1177
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1178
- }
1179
- }
1180
- }
1181
-
1182
- template<typename Index, typename LhsMapper,
1183
- typename RhsMapper, typename OutputMapper>
1184
- __global__ void
1185
- #if defined(EIGEN_HIPCC)
1186
- __launch_bounds__(256, 1)
1187
- #else
1188
- __launch_bounds__(256)
1189
- #endif
1190
- EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
1191
- const OutputMapper output,
1192
- const Index m_size, const Index n_size, const Index k_size) {
1193
- __shared__ float2 lhs_shmem[32][16];
1194
- __shared__ float2 rhs_shmem[64][8];
1195
-
1196
- const Index m_block_idx = blockIdx.x;
1197
- const Index n_block_idx = blockIdx.y;
1198
-
1199
- const Index base_m = 64 * m_block_idx;
1200
- const Index base_n = 64 * n_block_idx;
1201
-
1202
- if (base_m + 63 < m_size) {
1203
- if (base_n + 63 < n_size) {
1204
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1205
- } else {
1206
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1207
- }
1208
- } else {
1209
- if (base_n + 63 < n_size) {
1210
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1211
- } else {
1212
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1213
- }
1214
- }
1215
- }
1216
-
1217
-
1218
- template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
1219
- struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
1220
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
1221
-
1222
- typedef GpuDevice Device;
1223
-
1224
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
1225
- typedef TensorContractionEvaluatorBase<Self> Base;
1226
-
1227
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
1228
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
1229
- typedef typename XprType::Index Index;
1230
- typedef typename XprType::CoeffReturnType CoeffReturnType;
1231
- typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
1232
-
1233
- enum {
1234
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
1235
- };
1236
-
1237
- // Most of the code is assuming that both input tensors are ColMajor. If the
1238
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
1239
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
1240
- // will pretend B is LHS and A is RHS.
1241
- typedef typename internal::conditional<
1242
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
1243
- typedef typename internal::conditional<
1244
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
1245
-
1246
- static const int LDims =
1247
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
1248
- static const int RDims =
1249
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
1250
- static const int ContractDims = internal::array_size<Indices>::value;
1251
-
1252
- typedef array<Index, LDims> left_dim_mapper_t;
1253
- typedef array<Index, RDims> right_dim_mapper_t;
1254
-
1255
- typedef array<Index, ContractDims> contract_t;
1256
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
1257
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
1258
-
1259
- static const int NumDims = LDims + RDims - 2 * ContractDims;
1260
-
1261
- typedef DSizes<Index, NumDims> Dimensions;
1262
-
1263
- // typedefs needed in evalTo
1264
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
1265
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
1266
-
1267
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
1268
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
1269
-
1270
- typedef typename LeftEvaluator::Dimensions LeftDimensions;
1271
- typedef typename RightEvaluator::Dimensions RightDimensions;
1272
-
1273
- TensorEvaluator(const XprType& op, const Device& device) :
1274
- Base(op, device)
1275
- {
1276
- EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
1277
- GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
1278
- }
1279
-
1280
- // We need to redefine this method to make nvcc happy
1281
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
1282
- this->m_leftImpl.evalSubExprsIfNeeded(NULL);
1283
- this->m_rightImpl.evalSubExprsIfNeeded(NULL);
1284
- if (data) {
1285
- evalTo(data);
1286
- return false;
1287
- } else {
1288
- this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
1289
- evalTo(this->m_result);
1290
- return true;
1291
- }
1292
- }
1293
-
1294
- void evalTo(Scalar* buffer) const {
1295
- if (this->m_lhs_inner_dim_contiguous) {
1296
- if (this->m_rhs_inner_dim_contiguous) {
1297
- if (this->m_rhs_inner_dim_reordered) {
1298
- evalTyped<true, true, true, Unaligned>(buffer);
1299
- }
1300
- else {
1301
- evalTyped<true, true, false, Unaligned>(buffer);
1302
- }
1303
- }
1304
- else {
1305
- if (this->m_rhs_inner_dim_reordered) {
1306
- evalTyped<true, false, true, Unaligned>(buffer);
1307
- }
1308
- else {
1309
- evalTyped<true, false, false, Unaligned>(buffer);
1310
- }
1311
- }
1312
- }
1313
- else {
1314
- if (this->m_rhs_inner_dim_contiguous) {
1315
- if (this->m_rhs_inner_dim_reordered) {
1316
- evalTyped<false, true, true, Unaligned>(buffer);
1317
- }
1318
- else {
1319
- evalTyped<false, true, false, Unaligned>(buffer);
1320
- }
1321
- }
1322
- else {
1323
- if (this->m_rhs_inner_dim_reordered) {
1324
- evalTyped<false, false, true, Unaligned>(buffer);
1325
- }
1326
- else {
1327
- evalTyped<false, false, false, Unaligned>(buffer);
1328
- }
1329
- }
1330
- }
1331
- }
1332
-
1333
- template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
1334
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
1335
- const Index m_blocks = (m + 63) / 64;
1336
- const Index n_blocks = (n + 63) / 64;
1337
- const dim3 num_blocks(m_blocks, n_blocks, 1);
1338
- const dim3 block_size(8, 8, 8);
1339
- LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
1340
- }
1341
- };
1342
-
1343
- template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
1344
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
1345
- if (m < 768 || n < 768) {
1346
- const Index m_blocks = (m + 63) / 64;
1347
- const Index n_blocks = (n + 63) / 64;
1348
- const dim3 num_blocks(m_blocks, n_blocks, 1);
1349
- const dim3 block_size(16, 16, 1);
1350
- LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
1351
- } else {
1352
- const Index m_blocks = (m + 127) / 128;
1353
- const Index n_blocks = (n + 63) / 64;
1354
- const dim3 num_blocks(m_blocks, n_blocks, 1);
1355
- const dim3 block_size(8, 32, 1);
1356
- LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
1357
- }
1358
- }
1359
- };
1360
-
1361
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
1362
- void evalTyped(Scalar* buffer) const {
1363
- // columns in left side, rows in right side
1364
- const Index k = this->m_k_size;
1365
- EIGEN_UNUSED_VARIABLE(k)
1366
-
1367
- // rows in left side
1368
- const Index m = this->m_i_size;
1369
-
1370
- // columns in right side
1371
- const Index n = this->m_j_size;
1372
-
1373
- // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
1374
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
1375
-
1376
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
1377
- LeftEvaluator, left_nocontract_t,
1378
- contract_t, 4,
1379
- lhs_inner_dim_contiguous,
1380
- false, Unaligned> LhsMapper;
1381
-
1382
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
1383
- RightEvaluator, right_nocontract_t,
1384
- contract_t, 4,
1385
- rhs_inner_dim_contiguous,
1386
- rhs_inner_dim_reordered, Unaligned> RhsMapper;
1387
-
1388
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
1389
-
1390
-
1391
- // initialize data mappers
1392
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
1393
- this->m_left_contracting_strides, this->m_k_strides);
1394
-
1395
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
1396
- this->m_right_contracting_strides, this->m_k_strides);
1397
-
1398
- OutputMapper output(buffer, m);
1399
-
1400
- #if defined(EIGEN_USE_HIP)
1401
- setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
1402
- #else
1403
- setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
1404
- #endif
1405
-
1406
- LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
1407
- }
1408
- };
1409
-
1410
- } // end namespace Eigen
1411
-
1412
- #endif // EIGEN_USE_GPU and EIGEN_GPUCC
1413
- #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H