xtgeo 4.10.0__cp310-cp310-macosx_11_0_arm64.whl → 4.11.0__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xtgeo might be problematic. Click here for more details.

Files changed (566):
  1. xtgeo/_cxtgeo.cpython-310-darwin.so +0 -0
  2. xtgeo/_internal.cpython-310-darwin.so +0 -0
  3. xtgeo/common/version.py +16 -3
  4. xtgeo/cube/_cube_window_attributes.py +13 -4
  5. xtgeo/grid3d/_grid_etc1.py +10 -5
  6. xtgeo/grid3d/grid.py +22 -6
  7. xtgeo/include/fmt/args.h +220 -0
  8. xtgeo/include/fmt/base.h +2989 -0
  9. xtgeo/include/fmt/chrono.h +2330 -0
  10. xtgeo/include/fmt/color.h +637 -0
  11. xtgeo/include/fmt/compile.h +539 -0
  12. xtgeo/include/fmt/core.h +5 -0
  13. xtgeo/include/fmt/format-inl.h +1948 -0
  14. xtgeo/include/fmt/format.h +4244 -0
  15. xtgeo/include/fmt/os.h +427 -0
  16. xtgeo/include/fmt/ostream.h +167 -0
  17. xtgeo/include/fmt/printf.h +633 -0
  18. xtgeo/include/fmt/ranges.h +850 -0
  19. xtgeo/include/fmt/std.h +728 -0
  20. xtgeo/include/fmt/xchar.h +369 -0
  21. xtgeo/lib/cmake/fmt/fmt-config-version.cmake +43 -0
  22. xtgeo/lib/cmake/fmt/fmt-config.cmake +31 -0
  23. xtgeo/lib/cmake/fmt/fmt-targets-release.cmake +19 -0
  24. xtgeo/{share/eigen3/cmake/Eigen3Targets.cmake → lib/cmake/fmt/fmt-targets.cmake} +16 -6
  25. xtgeo/lib/libfmt.a +0 -0
  26. xtgeo/lib/pkgconfig/fmt.pc +11 -0
  27. xtgeo/metadata/metadata.py +20 -13
  28. {xtgeo-4.10.0.dist-info → xtgeo-4.11.0.dist-info}/METADATA +1 -1
  29. xtgeo-4.11.0.dist-info/RECORD +137 -0
  30. {xtgeo-4.10.0.dist-info → xtgeo-4.11.0.dist-info}/WHEEL +1 -1
  31. xtgeo/include/eigen3/Eigen/Cholesky +0 -45
  32. xtgeo/include/eigen3/Eigen/CholmodSupport +0 -48
  33. xtgeo/include/eigen3/Eigen/Core +0 -384
  34. xtgeo/include/eigen3/Eigen/Dense +0 -7
  35. xtgeo/include/eigen3/Eigen/Eigen +0 -2
  36. xtgeo/include/eigen3/Eigen/Eigenvalues +0 -60
  37. xtgeo/include/eigen3/Eigen/Geometry +0 -59
  38. xtgeo/include/eigen3/Eigen/Householder +0 -29
  39. xtgeo/include/eigen3/Eigen/IterativeLinearSolvers +0 -48
  40. xtgeo/include/eigen3/Eigen/Jacobi +0 -32
  41. xtgeo/include/eigen3/Eigen/KLUSupport +0 -41
  42. xtgeo/include/eigen3/Eigen/LU +0 -47
  43. xtgeo/include/eigen3/Eigen/MetisSupport +0 -35
  44. xtgeo/include/eigen3/Eigen/OrderingMethods +0 -70
  45. xtgeo/include/eigen3/Eigen/PaStiXSupport +0 -49
  46. xtgeo/include/eigen3/Eigen/PardisoSupport +0 -35
  47. xtgeo/include/eigen3/Eigen/QR +0 -50
  48. xtgeo/include/eigen3/Eigen/QtAlignedMalloc +0 -39
  49. xtgeo/include/eigen3/Eigen/SPQRSupport +0 -34
  50. xtgeo/include/eigen3/Eigen/SVD +0 -50
  51. xtgeo/include/eigen3/Eigen/Sparse +0 -34
  52. xtgeo/include/eigen3/Eigen/SparseCholesky +0 -37
  53. xtgeo/include/eigen3/Eigen/SparseCore +0 -69
  54. xtgeo/include/eigen3/Eigen/SparseLU +0 -50
  55. xtgeo/include/eigen3/Eigen/SparseQR +0 -36
  56. xtgeo/include/eigen3/Eigen/StdDeque +0 -27
  57. xtgeo/include/eigen3/Eigen/StdList +0 -26
  58. xtgeo/include/eigen3/Eigen/StdVector +0 -27
  59. xtgeo/include/eigen3/Eigen/SuperLUSupport +0 -64
  60. xtgeo/include/eigen3/Eigen/UmfPackSupport +0 -40
  61. xtgeo/include/eigen3/Eigen/src/Cholesky/LDLT.h +0 -688
  62. xtgeo/include/eigen3/Eigen/src/Cholesky/LLT.h +0 -558
  63. xtgeo/include/eigen3/Eigen/src/Cholesky/LLT_LAPACKE.h +0 -99
  64. xtgeo/include/eigen3/Eigen/src/CholmodSupport/CholmodSupport.h +0 -682
  65. xtgeo/include/eigen3/Eigen/src/Core/ArithmeticSequence.h +0 -413
  66. xtgeo/include/eigen3/Eigen/src/Core/Array.h +0 -417
  67. xtgeo/include/eigen3/Eigen/src/Core/ArrayBase.h +0 -226
  68. xtgeo/include/eigen3/Eigen/src/Core/ArrayWrapper.h +0 -209
  69. xtgeo/include/eigen3/Eigen/src/Core/Assign.h +0 -90
  70. xtgeo/include/eigen3/Eigen/src/Core/AssignEvaluator.h +0 -1010
  71. xtgeo/include/eigen3/Eigen/src/Core/Assign_MKL.h +0 -178
  72. xtgeo/include/eigen3/Eigen/src/Core/BandMatrix.h +0 -353
  73. xtgeo/include/eigen3/Eigen/src/Core/Block.h +0 -448
  74. xtgeo/include/eigen3/Eigen/src/Core/BooleanRedux.h +0 -162
  75. xtgeo/include/eigen3/Eigen/src/Core/CommaInitializer.h +0 -164
  76. xtgeo/include/eigen3/Eigen/src/Core/ConditionEstimator.h +0 -175
  77. xtgeo/include/eigen3/Eigen/src/Core/CoreEvaluators.h +0 -1741
  78. xtgeo/include/eigen3/Eigen/src/Core/CoreIterators.h +0 -132
  79. xtgeo/include/eigen3/Eigen/src/Core/CwiseBinaryOp.h +0 -183
  80. xtgeo/include/eigen3/Eigen/src/Core/CwiseNullaryOp.h +0 -1001
  81. xtgeo/include/eigen3/Eigen/src/Core/CwiseTernaryOp.h +0 -197
  82. xtgeo/include/eigen3/Eigen/src/Core/CwiseUnaryOp.h +0 -103
  83. xtgeo/include/eigen3/Eigen/src/Core/CwiseUnaryView.h +0 -132
  84. xtgeo/include/eigen3/Eigen/src/Core/DenseBase.h +0 -701
  85. xtgeo/include/eigen3/Eigen/src/Core/DenseCoeffsBase.h +0 -685
  86. xtgeo/include/eigen3/Eigen/src/Core/DenseStorage.h +0 -652
  87. xtgeo/include/eigen3/Eigen/src/Core/Diagonal.h +0 -258
  88. xtgeo/include/eigen3/Eigen/src/Core/DiagonalMatrix.h +0 -391
  89. xtgeo/include/eigen3/Eigen/src/Core/DiagonalProduct.h +0 -28
  90. xtgeo/include/eigen3/Eigen/src/Core/Dot.h +0 -318
  91. xtgeo/include/eigen3/Eigen/src/Core/EigenBase.h +0 -160
  92. xtgeo/include/eigen3/Eigen/src/Core/ForceAlignedAccess.h +0 -150
  93. xtgeo/include/eigen3/Eigen/src/Core/Fuzzy.h +0 -155
  94. xtgeo/include/eigen3/Eigen/src/Core/GeneralProduct.h +0 -465
  95. xtgeo/include/eigen3/Eigen/src/Core/GenericPacketMath.h +0 -1040
  96. xtgeo/include/eigen3/Eigen/src/Core/GlobalFunctions.h +0 -194
  97. xtgeo/include/eigen3/Eigen/src/Core/IO.h +0 -258
  98. xtgeo/include/eigen3/Eigen/src/Core/IndexedView.h +0 -237
  99. xtgeo/include/eigen3/Eigen/src/Core/Inverse.h +0 -117
  100. xtgeo/include/eigen3/Eigen/src/Core/Map.h +0 -171
  101. xtgeo/include/eigen3/Eigen/src/Core/MapBase.h +0 -310
  102. xtgeo/include/eigen3/Eigen/src/Core/MathFunctions.h +0 -2057
  103. xtgeo/include/eigen3/Eigen/src/Core/MathFunctionsImpl.h +0 -200
  104. xtgeo/include/eigen3/Eigen/src/Core/Matrix.h +0 -565
  105. xtgeo/include/eigen3/Eigen/src/Core/MatrixBase.h +0 -547
  106. xtgeo/include/eigen3/Eigen/src/Core/NestByValue.h +0 -85
  107. xtgeo/include/eigen3/Eigen/src/Core/NoAlias.h +0 -109
  108. xtgeo/include/eigen3/Eigen/src/Core/NumTraits.h +0 -335
  109. xtgeo/include/eigen3/Eigen/src/Core/PartialReduxEvaluator.h +0 -232
  110. xtgeo/include/eigen3/Eigen/src/Core/PermutationMatrix.h +0 -605
  111. xtgeo/include/eigen3/Eigen/src/Core/PlainObjectBase.h +0 -1128
  112. xtgeo/include/eigen3/Eigen/src/Core/Product.h +0 -191
  113. xtgeo/include/eigen3/Eigen/src/Core/ProductEvaluators.h +0 -1179
  114. xtgeo/include/eigen3/Eigen/src/Core/Random.h +0 -218
  115. xtgeo/include/eigen3/Eigen/src/Core/Redux.h +0 -515
  116. xtgeo/include/eigen3/Eigen/src/Core/Ref.h +0 -381
  117. xtgeo/include/eigen3/Eigen/src/Core/Replicate.h +0 -142
  118. xtgeo/include/eigen3/Eigen/src/Core/Reshaped.h +0 -454
  119. xtgeo/include/eigen3/Eigen/src/Core/ReturnByValue.h +0 -119
  120. xtgeo/include/eigen3/Eigen/src/Core/Reverse.h +0 -217
  121. xtgeo/include/eigen3/Eigen/src/Core/Select.h +0 -164
  122. xtgeo/include/eigen3/Eigen/src/Core/SelfAdjointView.h +0 -365
  123. xtgeo/include/eigen3/Eigen/src/Core/SelfCwiseBinaryOp.h +0 -47
  124. xtgeo/include/eigen3/Eigen/src/Core/Solve.h +0 -188
  125. xtgeo/include/eigen3/Eigen/src/Core/SolveTriangular.h +0 -235
  126. xtgeo/include/eigen3/Eigen/src/Core/SolverBase.h +0 -168
  127. xtgeo/include/eigen3/Eigen/src/Core/StableNorm.h +0 -251
  128. xtgeo/include/eigen3/Eigen/src/Core/StlIterators.h +0 -463
  129. xtgeo/include/eigen3/Eigen/src/Core/Stride.h +0 -116
  130. xtgeo/include/eigen3/Eigen/src/Core/Swap.h +0 -68
  131. xtgeo/include/eigen3/Eigen/src/Core/Transpose.h +0 -464
  132. xtgeo/include/eigen3/Eigen/src/Core/Transpositions.h +0 -386
  133. xtgeo/include/eigen3/Eigen/src/Core/TriangularMatrix.h +0 -1001
  134. xtgeo/include/eigen3/Eigen/src/Core/VectorBlock.h +0 -96
  135. xtgeo/include/eigen3/Eigen/src/Core/VectorwiseOp.h +0 -784
  136. xtgeo/include/eigen3/Eigen/src/Core/Visitor.h +0 -381
  137. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX/Complex.h +0 -372
  138. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX/MathFunctions.h +0 -228
  139. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX/PacketMath.h +0 -1574
  140. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX/TypeCasting.h +0 -115
  141. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX512/Complex.h +0 -422
  142. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX512/MathFunctions.h +0 -362
  143. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX512/PacketMath.h +0 -2303
  144. xtgeo/include/eigen3/Eigen/src/Core/arch/AVX512/TypeCasting.h +0 -89
  145. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h +0 -417
  146. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/MathFunctions.h +0 -90
  147. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +0 -2937
  148. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +0 -221
  149. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +0 -629
  150. xtgeo/include/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h +0 -2711
  151. xtgeo/include/eigen3/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
  152. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/BFloat16.h +0 -700
  153. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/ConjHelper.h +0 -117
  154. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +0 -1649
  155. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +0 -110
  156. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/Half.h +0 -942
  157. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/Settings.h +0 -49
  158. xtgeo/include/eigen3/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
  159. xtgeo/include/eigen3/Eigen/src/Core/arch/GPU/MathFunctions.h +0 -103
  160. xtgeo/include/eigen3/Eigen/src/Core/arch/GPU/PacketMath.h +0 -1685
  161. xtgeo/include/eigen3/Eigen/src/Core/arch/GPU/TypeCasting.h +0 -80
  162. xtgeo/include/eigen3/Eigen/src/Core/arch/HIP/hcc/math_constants.h +0 -23
  163. xtgeo/include/eigen3/Eigen/src/Core/arch/MSA/Complex.h +0 -648
  164. xtgeo/include/eigen3/Eigen/src/Core/arch/MSA/MathFunctions.h +0 -387
  165. xtgeo/include/eigen3/Eigen/src/Core/arch/MSA/PacketMath.h +0 -1233
  166. xtgeo/include/eigen3/Eigen/src/Core/arch/NEON/Complex.h +0 -584
  167. xtgeo/include/eigen3/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +0 -183
  168. xtgeo/include/eigen3/Eigen/src/Core/arch/NEON/MathFunctions.h +0 -75
  169. xtgeo/include/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h +0 -4587
  170. xtgeo/include/eigen3/Eigen/src/Core/arch/NEON/TypeCasting.h +0 -1419
  171. xtgeo/include/eigen3/Eigen/src/Core/arch/SSE/Complex.h +0 -351
  172. xtgeo/include/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h +0 -199
  173. xtgeo/include/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h +0 -1505
  174. xtgeo/include/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h +0 -142
  175. xtgeo/include/eigen3/Eigen/src/Core/arch/SVE/MathFunctions.h +0 -44
  176. xtgeo/include/eigen3/Eigen/src/Core/arch/SVE/PacketMath.h +0 -752
  177. xtgeo/include/eigen3/Eigen/src/Core/arch/SVE/TypeCasting.h +0 -49
  178. xtgeo/include/eigen3/Eigen/src/Core/arch/SYCL/InteropHeaders.h +0 -232
  179. xtgeo/include/eigen3/Eigen/src/Core/arch/SYCL/MathFunctions.h +0 -301
  180. xtgeo/include/eigen3/Eigen/src/Core/arch/SYCL/PacketMath.h +0 -670
  181. xtgeo/include/eigen3/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
  182. xtgeo/include/eigen3/Eigen/src/Core/arch/SYCL/TypeCasting.h +0 -85
  183. xtgeo/include/eigen3/Eigen/src/Core/arch/ZVector/Complex.h +0 -426
  184. xtgeo/include/eigen3/Eigen/src/Core/arch/ZVector/MathFunctions.h +0 -233
  185. xtgeo/include/eigen3/Eigen/src/Core/arch/ZVector/PacketMath.h +0 -1060
  186. xtgeo/include/eigen3/Eigen/src/Core/functors/AssignmentFunctors.h +0 -177
  187. xtgeo/include/eigen3/Eigen/src/Core/functors/BinaryFunctors.h +0 -541
  188. xtgeo/include/eigen3/Eigen/src/Core/functors/NullaryFunctors.h +0 -189
  189. xtgeo/include/eigen3/Eigen/src/Core/functors/StlFunctors.h +0 -166
  190. xtgeo/include/eigen3/Eigen/src/Core/functors/TernaryFunctors.h +0 -25
  191. xtgeo/include/eigen3/Eigen/src/Core/functors/UnaryFunctors.h +0 -1131
  192. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h +0 -2645
  193. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h +0 -517
  194. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +0 -317
  195. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +0 -145
  196. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +0 -124
  197. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h +0 -518
  198. xtgeo/include/eigen3/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +0 -136
  199. xtgeo/include/eigen3/Eigen/src/Core/products/Parallelizer.h +0 -180
  200. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +0 -544
  201. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +0 -295
  202. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h +0 -262
  203. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +0 -118
  204. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointProduct.h +0 -133
  205. xtgeo/include/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h +0 -94
  206. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h +0 -472
  207. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +0 -317
  208. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h +0 -350
  209. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +0 -255
  210. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h +0 -337
  211. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +0 -167
  212. xtgeo/include/eigen3/Eigen/src/Core/products/TriangularSolverVector.h +0 -148
  213. xtgeo/include/eigen3/Eigen/src/Core/util/BlasUtil.h +0 -583
  214. xtgeo/include/eigen3/Eigen/src/Core/util/ConfigureVectorization.h +0 -512
  215. xtgeo/include/eigen3/Eigen/src/Core/util/Constants.h +0 -563
  216. xtgeo/include/eigen3/Eigen/src/Core/util/DisableStupidWarnings.h +0 -106
  217. xtgeo/include/eigen3/Eigen/src/Core/util/ForwardDeclarations.h +0 -322
  218. xtgeo/include/eigen3/Eigen/src/Core/util/IndexedViewHelper.h +0 -186
  219. xtgeo/include/eigen3/Eigen/src/Core/util/IntegralConstant.h +0 -272
  220. xtgeo/include/eigen3/Eigen/src/Core/util/MKL_support.h +0 -137
  221. xtgeo/include/eigen3/Eigen/src/Core/util/Macros.h +0 -1464
  222. xtgeo/include/eigen3/Eigen/src/Core/util/Memory.h +0 -1163
  223. xtgeo/include/eigen3/Eigen/src/Core/util/Meta.h +0 -812
  224. xtgeo/include/eigen3/Eigen/src/Core/util/NonMPL2.h +0 -3
  225. xtgeo/include/eigen3/Eigen/src/Core/util/ReenableStupidWarnings.h +0 -31
  226. xtgeo/include/eigen3/Eigen/src/Core/util/ReshapedHelper.h +0 -51
  227. xtgeo/include/eigen3/Eigen/src/Core/util/StaticAssert.h +0 -221
  228. xtgeo/include/eigen3/Eigen/src/Core/util/SymbolicIndex.h +0 -293
  229. xtgeo/include/eigen3/Eigen/src/Core/util/XprHelper.h +0 -856
  230. xtgeo/include/eigen3/Eigen/src/Eigenvalues/ComplexEigenSolver.h +0 -346
  231. xtgeo/include/eigen3/Eigen/src/Eigenvalues/ComplexSchur.h +0 -462
  232. xtgeo/include/eigen3/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +0 -91
  233. xtgeo/include/eigen3/Eigen/src/Eigenvalues/EigenSolver.h +0 -622
  234. xtgeo/include/eigen3/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +0 -418
  235. xtgeo/include/eigen3/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +0 -226
  236. xtgeo/include/eigen3/Eigen/src/Eigenvalues/HessenbergDecomposition.h +0 -374
  237. xtgeo/include/eigen3/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +0 -158
  238. xtgeo/include/eigen3/Eigen/src/Eigenvalues/RealQZ.h +0 -657
  239. xtgeo/include/eigen3/Eigen/src/Eigenvalues/RealSchur.h +0 -558
  240. xtgeo/include/eigen3/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +0 -77
  241. xtgeo/include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +0 -904
  242. xtgeo/include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +0 -87
  243. xtgeo/include/eigen3/Eigen/src/Eigenvalues/Tridiagonalization.h +0 -561
  244. xtgeo/include/eigen3/Eigen/src/Geometry/AlignedBox.h +0 -486
  245. xtgeo/include/eigen3/Eigen/src/Geometry/AngleAxis.h +0 -247
  246. xtgeo/include/eigen3/Eigen/src/Geometry/EulerAngles.h +0 -114
  247. xtgeo/include/eigen3/Eigen/src/Geometry/Homogeneous.h +0 -501
  248. xtgeo/include/eigen3/Eigen/src/Geometry/Hyperplane.h +0 -282
  249. xtgeo/include/eigen3/Eigen/src/Geometry/OrthoMethods.h +0 -235
  250. xtgeo/include/eigen3/Eigen/src/Geometry/ParametrizedLine.h +0 -232
  251. xtgeo/include/eigen3/Eigen/src/Geometry/Quaternion.h +0 -870
  252. xtgeo/include/eigen3/Eigen/src/Geometry/Rotation2D.h +0 -199
  253. xtgeo/include/eigen3/Eigen/src/Geometry/RotationBase.h +0 -206
  254. xtgeo/include/eigen3/Eigen/src/Geometry/Scaling.h +0 -188
  255. xtgeo/include/eigen3/Eigen/src/Geometry/Transform.h +0 -1563
  256. xtgeo/include/eigen3/Eigen/src/Geometry/Translation.h +0 -202
  257. xtgeo/include/eigen3/Eigen/src/Geometry/Umeyama.h +0 -166
  258. xtgeo/include/eigen3/Eigen/src/Geometry/arch/Geometry_SIMD.h +0 -168
  259. xtgeo/include/eigen3/Eigen/src/Householder/BlockHouseholder.h +0 -110
  260. xtgeo/include/eigen3/Eigen/src/Householder/Householder.h +0 -176
  261. xtgeo/include/eigen3/Eigen/src/Householder/HouseholderSequence.h +0 -545
  262. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +0 -226
  263. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +0 -212
  264. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +0 -229
  265. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +0 -394
  266. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +0 -453
  267. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +0 -444
  268. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +0 -198
  269. xtgeo/include/eigen3/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +0 -117
  270. xtgeo/include/eigen3/Eigen/src/Jacobi/Jacobi.h +0 -483
  271. xtgeo/include/eigen3/Eigen/src/KLUSupport/KLUSupport.h +0 -358
  272. xtgeo/include/eigen3/Eigen/src/LU/Determinant.h +0 -117
  273. xtgeo/include/eigen3/Eigen/src/LU/FullPivLU.h +0 -877
  274. xtgeo/include/eigen3/Eigen/src/LU/InverseImpl.h +0 -432
  275. xtgeo/include/eigen3/Eigen/src/LU/PartialPivLU.h +0 -624
  276. xtgeo/include/eigen3/Eigen/src/LU/PartialPivLU_LAPACKE.h +0 -83
  277. xtgeo/include/eigen3/Eigen/src/LU/arch/InverseSize4.h +0 -351
  278. xtgeo/include/eigen3/Eigen/src/MetisSupport/MetisSupport.h +0 -137
  279. xtgeo/include/eigen3/Eigen/src/OrderingMethods/Amd.h +0 -435
  280. xtgeo/include/eigen3/Eigen/src/OrderingMethods/Eigen_Colamd.h +0 -1863
  281. xtgeo/include/eigen3/Eigen/src/OrderingMethods/Ordering.h +0 -153
  282. xtgeo/include/eigen3/Eigen/src/PaStiXSupport/PaStiXSupport.h +0 -678
  283. xtgeo/include/eigen3/Eigen/src/PardisoSupport/PardisoSupport.h +0 -545
  284. xtgeo/include/eigen3/Eigen/src/QR/ColPivHouseholderQR.h +0 -674
  285. xtgeo/include/eigen3/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +0 -97
  286. xtgeo/include/eigen3/Eigen/src/QR/CompleteOrthogonalDecomposition.h +0 -635
  287. xtgeo/include/eigen3/Eigen/src/QR/FullPivHouseholderQR.h +0 -713
  288. xtgeo/include/eigen3/Eigen/src/QR/HouseholderQR.h +0 -434
  289. xtgeo/include/eigen3/Eigen/src/QR/HouseholderQR_LAPACKE.h +0 -68
  290. xtgeo/include/eigen3/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +0 -335
  291. xtgeo/include/eigen3/Eigen/src/SVD/BDCSVD.h +0 -1366
  292. xtgeo/include/eigen3/Eigen/src/SVD/JacobiSVD.h +0 -812
  293. xtgeo/include/eigen3/Eigen/src/SVD/JacobiSVD_LAPACKE.h +0 -91
  294. xtgeo/include/eigen3/Eigen/src/SVD/SVDBase.h +0 -376
  295. xtgeo/include/eigen3/Eigen/src/SVD/UpperBidiagonalization.h +0 -414
  296. xtgeo/include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky.h +0 -697
  297. xtgeo/include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +0 -174
  298. xtgeo/include/eigen3/Eigen/src/SparseCore/AmbiVector.h +0 -378
  299. xtgeo/include/eigen3/Eigen/src/SparseCore/CompressedStorage.h +0 -274
  300. xtgeo/include/eigen3/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +0 -352
  301. xtgeo/include/eigen3/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  302. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseAssign.h +0 -270
  303. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseBlock.h +0 -571
  304. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseColEtree.h +0 -206
  305. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseCompressedBase.h +0 -370
  306. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +0 -722
  307. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +0 -150
  308. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseDenseProduct.h +0 -342
  309. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseDiagonalProduct.h +0 -138
  310. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseDot.h +0 -98
  311. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseFuzzy.h +0 -29
  312. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseMap.h +0 -305
  313. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseMatrix.h +0 -1518
  314. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseMatrixBase.h +0 -398
  315. xtgeo/include/eigen3/Eigen/src/SparseCore/SparsePermutation.h +0 -178
  316. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseProduct.h +0 -181
  317. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseRedux.h +0 -49
  318. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseRef.h +0 -397
  319. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseSelfAdjointView.h +0 -659
  320. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseSolverBase.h +0 -124
  321. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +0 -198
  322. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseTranspose.h +0 -92
  323. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseTriangularView.h +0 -189
  324. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseUtil.h +0 -186
  325. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseVector.h +0 -478
  326. xtgeo/include/eigen3/Eigen/src/SparseCore/SparseView.h +0 -254
  327. xtgeo/include/eigen3/Eigen/src/SparseCore/TriangularSolver.h +0 -315
  328. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU.h +0 -923
  329. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLUImpl.h +0 -66
  330. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_Memory.h +0 -226
  331. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_Structs.h +0 -110
  332. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +0 -375
  333. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_Utils.h +0 -80
  334. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_column_bmod.h +0 -181
  335. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_column_dfs.h +0 -179
  336. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +0 -107
  337. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  338. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +0 -126
  339. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +0 -130
  340. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_panel_bmod.h +0 -223
  341. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_panel_dfs.h +0 -258
  342. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_pivotL.h +0 -137
  343. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_pruneL.h +0 -136
  344. xtgeo/include/eigen3/Eigen/src/SparseLU/SparseLU_relax_snode.h +0 -83
  345. xtgeo/include/eigen3/Eigen/src/SparseQR/SparseQR.h +0 -758
  346. xtgeo/include/eigen3/Eigen/src/StlSupport/StdDeque.h +0 -116
  347. xtgeo/include/eigen3/Eigen/src/StlSupport/StdList.h +0 -106
  348. xtgeo/include/eigen3/Eigen/src/StlSupport/StdVector.h +0 -131
  349. xtgeo/include/eigen3/Eigen/src/StlSupport/details.h +0 -84
  350. xtgeo/include/eigen3/Eigen/src/SuperLUSupport/SuperLUSupport.h +0 -1025
  351. xtgeo/include/eigen3/Eigen/src/UmfPackSupport/UmfPackSupport.h +0 -642
  352. xtgeo/include/eigen3/Eigen/src/misc/Image.h +0 -82
  353. xtgeo/include/eigen3/Eigen/src/misc/Kernel.h +0 -79
  354. xtgeo/include/eigen3/Eigen/src/misc/RealSvd2x2.h +0 -55
  355. xtgeo/include/eigen3/Eigen/src/misc/blas.h +0 -440
  356. xtgeo/include/eigen3/Eigen/src/misc/lapack.h +0 -152
  357. xtgeo/include/eigen3/Eigen/src/misc/lapacke.h +0 -16292
  358. xtgeo/include/eigen3/Eigen/src/misc/lapacke_mangling.h +0 -17
  359. xtgeo/include/eigen3/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
  360. xtgeo/include/eigen3/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
  361. xtgeo/include/eigen3/Eigen/src/plugins/BlockMethods.h +0 -1442
  362. xtgeo/include/eigen3/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  363. xtgeo/include/eigen3/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -177
  364. xtgeo/include/eigen3/Eigen/src/plugins/IndexedViewMethods.h +0 -262
  365. xtgeo/include/eigen3/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  366. xtgeo/include/eigen3/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
  367. xtgeo/include/eigen3/Eigen/src/plugins/ReshapedMethods.h +0 -149
  368. xtgeo/include/eigen3/signature_of_eigen3_matrix_library +0 -1
  369. xtgeo/include/eigen3/unsupported/Eigen/AdolcForward +0 -159
  370. xtgeo/include/eigen3/unsupported/Eigen/AlignedVector3 +0 -234
  371. xtgeo/include/eigen3/unsupported/Eigen/ArpackSupport +0 -30
  372. xtgeo/include/eigen3/unsupported/Eigen/AutoDiff +0 -46
  373. xtgeo/include/eigen3/unsupported/Eigen/BVH +0 -95
  374. xtgeo/include/eigen3/unsupported/Eigen/CXX11/Tensor +0 -137
  375. xtgeo/include/eigen3/unsupported/Eigen/CXX11/TensorSymmetry +0 -42
  376. xtgeo/include/eigen3/unsupported/Eigen/CXX11/ThreadPool +0 -74
  377. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +0 -554
  378. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +0 -329
  379. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +0 -247
  380. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +0 -1176
  381. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +0 -1559
  382. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +0 -1093
  383. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +0 -518
  384. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +0 -377
  385. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +0 -1023
  386. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +0 -73
  387. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +0 -6
  388. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +0 -1413
  389. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +0 -575
  390. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +0 -1650
  391. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +0 -1679
  392. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +0 -456
  393. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +0 -1132
  394. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +0 -544
  395. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +0 -214
  396. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +0 -347
  397. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +0 -137
  398. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +0 -6
  399. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +0 -104
  400. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +0 -389
  401. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +0 -1048
  402. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +0 -409
  403. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h +0 -236
  404. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +0 -490
  405. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +0 -236
  406. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +0 -983
  407. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +0 -703
  408. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +0 -388
  409. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +0 -669
  410. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +0 -379
  411. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +0 -237
  412. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +0 -191
  413. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +0 -488
  414. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +0 -302
  415. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h +0 -33
  416. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +0 -99
  417. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h +0 -44
  418. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +0 -79
  419. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +0 -603
  420. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +0 -738
  421. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +0 -247
  422. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +0 -82
  423. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +0 -263
  424. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +0 -216
  425. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +0 -98
  426. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +0 -327
  427. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +0 -311
  428. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +0 -1102
  429. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +0 -708
  430. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +0 -291
  431. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +0 -322
  432. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +0 -998
  433. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +0 -6
  434. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +0 -966
  435. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +0 -582
  436. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +0 -454
  437. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +0 -465
  438. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +0 -528
  439. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h +0 -513
  440. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +0 -471
  441. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +0 -161
  442. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +0 -346
  443. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +0 -303
  444. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +0 -264
  445. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +0 -249
  446. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +0 -629
  447. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +0 -293
  448. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +0 -236
  449. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h +0 -338
  450. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h +0 -669
  451. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h +0 -67
  452. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +0 -249
  453. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +0 -486
  454. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +0 -236
  455. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h +0 -23
  456. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h +0 -40
  457. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h +0 -301
  458. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h +0 -48
  459. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h +0 -20
  460. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +0 -537
  461. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h +0 -88
  462. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/util/EmulateArray.h +0 -261
  463. xtgeo/include/eigen3/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h +0 -158
  464. xtgeo/include/eigen3/unsupported/Eigen/EulerAngles +0 -43
  465. xtgeo/include/eigen3/unsupported/Eigen/FFT +0 -419
  466. xtgeo/include/eigen3/unsupported/Eigen/IterativeSolvers +0 -51
  467. xtgeo/include/eigen3/unsupported/Eigen/KroneckerProduct +0 -36
  468. xtgeo/include/eigen3/unsupported/Eigen/LevenbergMarquardt +0 -49
  469. xtgeo/include/eigen3/unsupported/Eigen/MPRealSupport +0 -213
  470. xtgeo/include/eigen3/unsupported/Eigen/MatrixFunctions +0 -504
  471. xtgeo/include/eigen3/unsupported/Eigen/MoreVectorization +0 -24
  472. xtgeo/include/eigen3/unsupported/Eigen/NonLinearOptimization +0 -140
  473. xtgeo/include/eigen3/unsupported/Eigen/NumericalDiff +0 -56
  474. xtgeo/include/eigen3/unsupported/Eigen/OpenGLSupport +0 -322
  475. xtgeo/include/eigen3/unsupported/Eigen/Polynomials +0 -137
  476. xtgeo/include/eigen3/unsupported/Eigen/Skyline +0 -39
  477. xtgeo/include/eigen3/unsupported/Eigen/SparseExtra +0 -54
  478. xtgeo/include/eigen3/unsupported/Eigen/SpecialFunctions +0 -103
  479. xtgeo/include/eigen3/unsupported/Eigen/Splines +0 -35
  480. xtgeo/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h +0 -108
  481. xtgeo/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +0 -730
  482. xtgeo/include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h +0 -220
  483. xtgeo/include/eigen3/unsupported/Eigen/src/BVH/BVAlgorithms.h +0 -293
  484. xtgeo/include/eigen3/unsupported/Eigen/src/BVH/KdBVH.h +0 -223
  485. xtgeo/include/eigen3/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h +0 -790
  486. xtgeo/include/eigen3/unsupported/Eigen/src/EulerAngles/EulerAngles.h +0 -355
  487. xtgeo/include/eigen3/unsupported/Eigen/src/EulerAngles/EulerSystem.h +0 -305
  488. xtgeo/include/eigen3/unsupported/Eigen/src/FFT/ei_fftw_impl.h +0 -261
  489. xtgeo/include/eigen3/unsupported/Eigen/src/FFT/ei_kissfft_impl.h +0 -449
  490. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h +0 -187
  491. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/DGMRES.h +0 -511
  492. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/GMRES.h +0 -335
  493. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IDRS.h +0 -436
  494. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h +0 -90
  495. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/IterationController.h +0 -154
  496. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/MINRES.h +0 -267
  497. xtgeo/include/eigen3/unsupported/Eigen/src/IterativeSolvers/Scaling.h +0 -193
  498. xtgeo/include/eigen3/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h +0 -305
  499. xtgeo/include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h +0 -84
  500. xtgeo/include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h +0 -202
  501. xtgeo/include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h +0 -160
  502. xtgeo/include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h +0 -188
  503. xtgeo/include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h +0 -396
  504. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +0 -441
  505. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +0 -569
  506. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +0 -373
  507. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +0 -705
  508. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +0 -368
  509. xtgeo/include/eigen3/unsupported/Eigen/src/MatrixFunctions/StemFunction.h +0 -117
  510. xtgeo/include/eigen3/unsupported/Eigen/src/MoreVectorization/MathFunctions.h +0 -95
  511. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h +0 -601
  512. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h +0 -657
  513. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/chkder.h +0 -66
  514. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/covar.h +0 -70
  515. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/dogleg.h +0 -107
  516. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h +0 -79
  517. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/lmpar.h +0 -298
  518. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h +0 -91
  519. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h +0 -30
  520. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1updt.h +0 -99
  521. xtgeo/include/eigen3/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h +0 -49
  522. xtgeo/include/eigen3/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h +0 -130
  523. xtgeo/include/eigen3/unsupported/Eigen/src/Polynomials/Companion.h +0 -280
  524. xtgeo/include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +0 -428
  525. xtgeo/include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialUtils.h +0 -143
  526. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h +0 -352
  527. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrix.h +0 -862
  528. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h +0 -212
  529. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineProduct.h +0 -295
  530. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineStorage.h +0 -259
  531. xtgeo/include/eigen3/unsupported/Eigen/src/Skyline/SkylineUtil.h +0 -89
  532. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h +0 -122
  533. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h +0 -1079
  534. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h +0 -404
  535. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/MarketIO.h +0 -282
  536. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h +0 -247
  537. xtgeo/include/eigen3/unsupported/Eigen/src/SparseExtra/RandomSetter.h +0 -349
  538. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h +0 -286
  539. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h +0 -68
  540. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h +0 -357
  541. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h +0 -66
  542. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h +0 -1959
  543. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h +0 -118
  544. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h +0 -67
  545. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h +0 -167
  546. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h +0 -58
  547. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h +0 -330
  548. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h +0 -58
  549. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +0 -2045
  550. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h +0 -79
  551. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h +0 -46
  552. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h +0 -16
  553. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h +0 -46
  554. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h +0 -16
  555. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h +0 -369
  556. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h +0 -54
  557. xtgeo/include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h +0 -34
  558. xtgeo/include/eigen3/unsupported/Eigen/src/Splines/Spline.h +0 -507
  559. xtgeo/include/eigen3/unsupported/Eigen/src/Splines/SplineFitting.h +0 -431
  560. xtgeo/include/eigen3/unsupported/Eigen/src/Splines/SplineFwd.h +0 -93
  561. xtgeo/share/eigen3/cmake/Eigen3Config.cmake +0 -37
  562. xtgeo/share/eigen3/cmake/Eigen3ConfigVersion.cmake +0 -65
  563. xtgeo/share/eigen3/cmake/UseEigen3.cmake +0 -6
  564. xtgeo/share/pkgconfig/eigen3.pc +0 -9
  565. xtgeo-4.10.0.dist-info/RECORD +0 -652
  566. {xtgeo-4.10.0.dist-info → xtgeo-4.11.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,1413 +0,0 @@
1
- // This file is part of Eigen, a lightweight C++ template library
2
- // for linear algebra.
3
- //
4
- // Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
- // Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
6
- // Copyright (C) 2014 Eric Martin <eric@ericmart.in>
7
- //
8
- // This Source Code Form is subject to the terms of the Mozilla
9
- // Public License v. 2.0. If a copy of the MPL was not distributed
10
- // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11
-
12
- #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
13
- #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
14
-
15
- #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
16
-
17
- namespace Eigen {
18
-
19
- template<typename Scalar, typename Index, typename LhsMapper,
20
- typename RhsMapper, typename OutputMapper, bool needs_edge_check>
21
- __device__ EIGEN_STRONG_INLINE void
22
- EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
23
- const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
24
- const Index m_size, const Index n_size, const Index k_size) {
25
-
26
- const Index m_block_idx = blockIdx.x;
27
- const Index n_block_idx = blockIdx.y;
28
-
29
- const Index base_m = 64 * m_block_idx;
30
- const Index base_n = 64 * n_block_idx;
31
-
32
- // declare and initialize 64 registers for output 8x8 block
33
-
34
- // prefetch registers
35
- Scalar lhs_pf0;
36
- Scalar lhs_pf1;
37
- Scalar lhs_pf2;
38
- Scalar lhs_pf3;
39
- Scalar lhs_pf4;
40
- Scalar lhs_pf5;
41
- Scalar lhs_pf6;
42
- Scalar lhs_pf7;
43
-
44
- Scalar rhs_pf0;
45
- Scalar rhs_pf1;
46
- Scalar rhs_pf2;
47
- Scalar rhs_pf3;
48
- Scalar rhs_pf4;
49
- Scalar rhs_pf5;
50
- Scalar rhs_pf6;
51
- Scalar rhs_pf7;
52
-
53
- // shared memory is formatted
54
- // (contract idx in block, nocontract idx in block, block idx)
55
- // where block idx is column major. This transposition limits the number of
56
- // bank conflicts when reading the LHS. The core idea is that since the contracting
57
- // index is shared by both sides, then the contracting index should be in threadIdx.x.
58
-
59
- // On the LHS, we pad each row inside of each block with an extra element. This makes
60
- // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
61
- // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
62
-
63
- // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
64
- // conflicts on writes and also none on reads.
65
-
66
- // storage indices
67
- const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
68
- const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
69
-
70
- const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
71
- const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
72
- const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
73
- const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
74
- const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
75
- const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
76
- const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
77
- const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
78
-
79
- const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
80
- const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
81
- const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
82
- const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
83
- const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
84
- const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
85
- const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
86
- const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
87
-
88
- // in the loading code, the following variables are important:
89
- // threadIdx.x: the vertical position in an 8x8 block
90
- // threadIdx.y: the vertical index of the 8x8 block in the grid
91
- // threadIdx.z: the horizontal position in an 8x8 block
92
- // k: the horizontal index of the 8x8 block in the grid
93
- //
94
- // The k parameter is implicit (it was the loop counter for a loop that went
95
- // from 0 to <8, but now that loop is unrolled in the below code.
96
-
97
- const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
98
- const Index lhs_vert = base_m + load_idx_vert;
99
-
100
- #define prefetchIntoRegisters(base_k) \
101
- { \
102
- lhs_pf0 = conv(0); \
103
- lhs_pf1 = conv(0); \
104
- lhs_pf2 = conv(0); \
105
- lhs_pf3 = conv(0); \
106
- lhs_pf4 = conv(0); \
107
- lhs_pf5 = conv(0); \
108
- lhs_pf6 = conv(0); \
109
- lhs_pf7 = conv(0); \
110
- \
111
- rhs_pf0 = conv(0); \
112
- rhs_pf1 = conv(0); \
113
- rhs_pf2 = conv(0); \
114
- rhs_pf3 = conv(0); \
115
- rhs_pf4 = conv(0); \
116
- rhs_pf5 = conv(0); \
117
- rhs_pf6 = conv(0); \
118
- rhs_pf7 = conv(0); \
119
- \
120
- if (!needs_edge_check || lhs_vert < m_size) { \
121
- const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \
122
- const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \
123
- const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \
124
- const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \
125
- const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \
126
- const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \
127
- const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \
128
- const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \
129
- \
130
- if (!needs_edge_check || lhs_horiz_7 < k_size) { \
131
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
132
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
133
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
134
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
135
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
136
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
137
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
138
- lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
139
- } else if (lhs_horiz_6 < k_size) { \
140
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
141
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
142
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
143
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
144
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
145
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
146
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
147
- } else if (lhs_horiz_5 < k_size) { \
148
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
149
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
150
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
151
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
152
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
153
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
154
- } else if (lhs_horiz_4 < k_size) { \
155
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
156
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
157
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
158
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
159
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
160
- } else if (lhs_horiz_3 < k_size) { \
161
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
162
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
163
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
164
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
165
- } else if (lhs_horiz_2 < k_size) { \
166
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
167
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
168
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
169
- } else if (lhs_horiz_1 < k_size) { \
170
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
171
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
172
- } else if (lhs_horiz_0 < k_size) { \
173
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
174
- } \
175
- } \
176
- \
177
- const Index rhs_vert = base_k + load_idx_vert; \
178
- if (!needs_edge_check || rhs_vert < k_size) { \
179
- const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \
180
- const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \
181
- const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \
182
- const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \
183
- const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \
184
- const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \
185
- const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \
186
- const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \
187
- \
188
- if (rhs_horiz_7 < n_size) { \
189
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
190
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
191
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
192
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
193
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
194
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
195
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
196
- rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
197
- } else if (rhs_horiz_6 < n_size) { \
198
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
199
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
200
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
201
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
202
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
203
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
204
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
205
- } else if (rhs_horiz_5 < n_size) { \
206
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
207
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
208
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
209
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
210
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
211
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
212
- } else if (rhs_horiz_4 < n_size) { \
213
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
214
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
215
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
216
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
217
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
218
- } else if (rhs_horiz_3 < n_size) { \
219
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
220
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
221
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
222
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
223
- } else if (rhs_horiz_2 < n_size) { \
224
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
225
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
226
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
227
- } else if (rhs_horiz_1 < n_size) { \
228
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
229
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
230
- } else if (rhs_horiz_0 < n_size) { \
231
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
232
- } \
233
- } \
234
- } \
235
-
236
- #define writeRegToShmem(_) \
237
- lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
238
- rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
239
- \
240
- lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
241
- rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
242
- \
243
- lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
244
- rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
245
- \
246
- lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
247
- rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
248
- \
249
- lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
250
- rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
251
- \
252
- lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
253
- rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
254
- \
255
- lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
256
- rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
257
- \
258
- lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
259
- rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
260
-
261
- // declare and initialize result array
262
- #define res(i, j) _res_##i##j
263
- #define initResultRow(i) \
264
- Scalar res(i, 0) = conv(0); \
265
- Scalar res(i, 1) = conv(0); \
266
- Scalar res(i, 2) = conv(0); \
267
- Scalar res(i, 3) = conv(0); \
268
- Scalar res(i, 4) = conv(0); \
269
- Scalar res(i, 5) = conv(0); \
270
- Scalar res(i, 6) = conv(0); \
271
- Scalar res(i, 7) = conv(0); \
272
-
273
- internal::scalar_cast_op<int, Scalar> conv;
274
- initResultRow(0);
275
- initResultRow(1);
276
- initResultRow(2);
277
- initResultRow(3);
278
- initResultRow(4);
279
- initResultRow(5);
280
- initResultRow(6);
281
- initResultRow(7);
282
- #undef initResultRow
283
-
284
- for (Index base_k = 0; base_k < k_size; base_k += 64) {
285
- // wait for previous iteration to finish with shmem. Despite common sense,
286
- // the code is a bit faster with this here then at bottom of loop
287
- __syncthreads();
288
-
289
- prefetchIntoRegisters(base_k);
290
- writeRegToShmem();
291
-
292
- #undef prefetchIntoRegisters
293
- #undef writeRegToShmem
294
-
295
- // wait for shared mem packing to be done before starting computation
296
- __syncthreads();
297
-
298
- // compute 8x8 matrix product by outer product. This involves packing one column
299
- // of LHS and one row of RHS into registers (takes 16 registers).
300
-
301
- #define lcol(i) _lcol##i
302
- Scalar lcol(0);
303
- Scalar lcol(1);
304
- Scalar lcol(2);
305
- Scalar lcol(3);
306
- Scalar lcol(4);
307
- Scalar lcol(5);
308
- Scalar lcol(6);
309
- Scalar lcol(7);
310
-
311
- #define rrow(j) _rrow##j
312
- Scalar rrow(0);
313
- Scalar rrow(1);
314
- Scalar rrow(2);
315
- Scalar rrow(3);
316
- Scalar rrow(4);
317
- Scalar rrow(5);
318
- Scalar rrow(6);
319
- Scalar rrow(7);
320
-
321
- // Now x corresponds to k, y to m, and z to n
322
- const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
323
- const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
324
-
325
- #define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
326
- #define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
327
-
328
- #define loadData(i, j) \
329
- lcol(0) = lhs_element(0, j); \
330
- rrow(0) = rhs_element(i, 0); \
331
- lcol(1) = lhs_element(1, j); \
332
- rrow(1) = rhs_element(i, 1); \
333
- lcol(2) = lhs_element(2, j); \
334
- rrow(2) = rhs_element(i, 2); \
335
- lcol(3) = lhs_element(3, j); \
336
- rrow(3) = rhs_element(i, 3); \
337
- lcol(4) = lhs_element(4, j); \
338
- rrow(4) = rhs_element(i, 4); \
339
- lcol(5) = lhs_element(5, j); \
340
- rrow(5) = rhs_element(i, 5); \
341
- lcol(6) = lhs_element(6, j); \
342
- rrow(6) = rhs_element(i, 6); \
343
- lcol(7) = lhs_element(7, j); \
344
- rrow(7) = rhs_element(i, 7); \
345
-
346
- #define computeCol(j) \
347
- res(0, j) += lcol(0) * rrow(j); \
348
- res(1, j) += lcol(1) * rrow(j); \
349
- res(2, j) += lcol(2) * rrow(j); \
350
- res(3, j) += lcol(3) * rrow(j); \
351
- res(4, j) += lcol(4) * rrow(j); \
352
- res(5, j) += lcol(5) * rrow(j); \
353
- res(6, j) += lcol(6) * rrow(j); \
354
- res(7, j) += lcol(7) * rrow(j); \
355
-
356
- #define computePass(i) \
357
- loadData(i, i); \
358
- \
359
- computeCol(0); \
360
- computeCol(1); \
361
- computeCol(2); \
362
- computeCol(3); \
363
- computeCol(4); \
364
- computeCol(5); \
365
- computeCol(6); \
366
- computeCol(7); \
367
-
368
- computePass(0);
369
- computePass(1);
370
- computePass(2);
371
- computePass(3);
372
- computePass(4);
373
- computePass(5);
374
- computePass(6);
375
- computePass(7);
376
-
377
- #undef lcol
378
- #undef rrow
379
- #undef lhs_element
380
- #undef rhs_element
381
- #undef loadData
382
- #undef computeCol
383
- #undef computePass
384
- } // end loop over k
385
-
386
- // we've now iterated over all of the large (ie width 64) k blocks and
387
- // accumulated results in registers. At this point thread (x, y, z) contains
388
- // the sum across all big k blocks of the product of little k block of index (x, y)
389
- // with block of index (y, z). To compute the final output, we need to reduce
390
- // the 8 threads over y by summation.
391
- #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
392
- #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
393
- #else
394
- #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
395
- #endif
396
-
397
- #define reduceRow(i, mask) \
398
- shuffleInc(i, 0, mask); \
399
- shuffleInc(i, 1, mask); \
400
- shuffleInc(i, 2, mask); \
401
- shuffleInc(i, 3, mask); \
402
- shuffleInc(i, 4, mask); \
403
- shuffleInc(i, 5, mask); \
404
- shuffleInc(i, 6, mask); \
405
- shuffleInc(i, 7, mask); \
406
-
407
- #define reduceMatrix(mask) \
408
- reduceRow(0, mask); \
409
- reduceRow(1, mask); \
410
- reduceRow(2, mask); \
411
- reduceRow(3, mask); \
412
- reduceRow(4, mask); \
413
- reduceRow(5, mask); \
414
- reduceRow(6, mask); \
415
- reduceRow(7, mask); \
416
-
417
- // actually perform the reduction, now each thread of index (_, y, z)
418
- // contains the correct values in its registers that belong in the output
419
- // block
420
- reduceMatrix(1);
421
- reduceMatrix(2);
422
- reduceMatrix(4);
423
-
424
- #undef shuffleInc
425
- #undef reduceRow
426
- #undef reduceMatrix
427
-
428
- // now we need to copy the 64 values into main memory. We can't split work
429
- // among threads because all variables are in registers. There's 2 ways
430
- // to do this:
431
- // (1) have 1 thread do 64 writes from registers into global memory
432
- // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
433
- // each do 8 writes into global memory. We can just overwrite the shared
434
- // memory from the problem we just solved.
435
- // (2) is slightly faster than (1) due to less branching and more ILP
436
-
437
- // TODO: won't yield much gain, but could just use currently unused shared mem
438
- // and then we won't have to sync
439
- // wait for shared mem to be out of use
440
- __syncthreads();
441
-
442
- #define writeResultShmem(i, j) \
443
- lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
444
-
445
- #define writeRow(i) \
446
- writeResultShmem(i, 0); \
447
- writeResultShmem(i, 1); \
448
- writeResultShmem(i, 2); \
449
- writeResultShmem(i, 3); \
450
- writeResultShmem(i, 4); \
451
- writeResultShmem(i, 5); \
452
- writeResultShmem(i, 6); \
453
- writeResultShmem(i, 7); \
454
-
455
- if (threadIdx.x == 0) {
456
- writeRow(0);
457
- writeRow(1);
458
- writeRow(2);
459
- writeRow(3);
460
- writeRow(4);
461
- writeRow(5);
462
- writeRow(6);
463
- writeRow(7);
464
- }
465
- #undef writeResultShmem
466
- #undef writeRow
467
-
468
- const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
469
- const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
470
-
471
- if (threadIdx.x < max_i_write) {
472
- if (max_j_write == 8) {
473
- // TODO: can i trade bank conflicts for coalesced writes?
474
- Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
475
- Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
476
- Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
477
- Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
478
- Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
479
- Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
480
- Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
481
- Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
482
-
483
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
484
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
485
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
486
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
487
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
488
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
489
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
490
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
491
- } else {
492
- #pragma unroll 7
493
- for (int j = 0; j < max_j_write; j++) {
494
- Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
495
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
496
- }
497
- }
498
- }
499
- #undef res
500
- }
501
-
502
-
503
- template<typename Scalar, typename Index, typename LhsMapper,
504
- typename RhsMapper, typename OutputMapper>
505
- __global__ void
506
- #if defined(EIGEN_HIPCC)
507
- __launch_bounds__(512, 1)
508
- #else
509
- __launch_bounds__(512)
510
- #endif
511
- EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
512
- const OutputMapper output,
513
- const Index m_size, const Index n_size, const Index k_size) {
514
- __shared__ Scalar lhs_shmem[72 * 64];
515
- __shared__ Scalar rhs_shmem[72 * 64];
516
-
517
- const Index m_block_idx = blockIdx.x;
518
- const Index n_block_idx = blockIdx.y;
519
-
520
- const Index base_m = 64 * m_block_idx;
521
- const Index base_n = 64 * n_block_idx;
522
-
523
- if (base_m + 63 < m_size && base_n + 63 < n_size) {
524
- EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
525
- } else {
526
- EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
527
- }
528
- }
529
-
530
-
531
- template<typename Index, typename LhsMapper,
532
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
533
- bool CHECK_RHS_BOUNDARY>
534
- __device__ __forceinline__ void
535
- EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
536
- const OutputMapper output, float2 lhs_shmem2[][16],
537
- float2 rhs_shmem2[][8], const Index m_size,
538
- const Index n_size, const Index k_size,
539
- const Index base_m, const Index base_n) {
540
-
541
- // prefetch registers
542
- float4 lhs_pf0, rhs_pf0;
543
-
544
- float4 results[4];
545
- for (int i=0; i < 4; i++) {
546
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
547
- }
548
-
549
- #define prefetch_lhs(reg, row, col) \
550
- if (!CHECK_LHS_BOUNDARY) { \
551
- if (col < k_size) { \
552
- reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
553
- } \
554
- } else { \
555
- if (col < k_size) { \
556
- if (row + 3 < m_size) { \
557
- reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
558
- } else if (row + 2 < m_size) { \
559
- reg.x =lhs(row + 0, col); \
560
- reg.y =lhs(row + 1, col); \
561
- reg.z =lhs(row + 2, col); \
562
- } else if (row + 1 < m_size) { \
563
- reg.x =lhs(row + 0, col); \
564
- reg.y =lhs(row + 1, col); \
565
- } else if (row < m_size) { \
566
- reg.x =lhs(row + 0, col); \
567
- } \
568
- } \
569
- } \
570
-
571
- Index lhs_vert = base_m+threadIdx.x*4;
572
-
573
- for (Index k = 0; k < k_size; k += 16) {
574
-
575
- lhs_pf0 = internal::pset1<float4>(0);
576
- rhs_pf0 = internal::pset1<float4>(0);
577
-
578
- Index lhs_horiz = threadIdx.y+k;
579
- prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
580
-
581
- Index rhs_vert = k+(threadIdx.x%4)*4;
582
- Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
583
-
584
- if (!CHECK_RHS_BOUNDARY) {
585
- if ((rhs_vert + 3) < k_size) {
586
- // just CHECK_RHS_BOUNDARY
587
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
588
- } else if (rhs_vert + 2 < k_size) {
589
- // just CHECK_RHS_BOUNDARY
590
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
591
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
592
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
593
- } else if (rhs_vert + 1 < k_size) {
594
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
595
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
596
- } else if (rhs_vert < k_size) {
597
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
598
- }
599
- } else {
600
- if (rhs_horiz0 < n_size) {
601
- if ((rhs_vert + 3) < k_size) {
602
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
603
- } else if ((rhs_vert + 2) < k_size) {
604
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
605
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
606
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
607
- } else if ((rhs_vert + 1) < k_size) {
608
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
609
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
610
- } else if (rhs_vert < k_size) {
611
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
612
- }
613
- }
614
- }
615
- float x1, x2 ;
616
- // the following can be a bitwise operation..... some day.
617
- if((threadIdx.x%8) < 4) {
618
- x1 = rhs_pf0.y;
619
- x2 = rhs_pf0.w;
620
- } else {
621
- x1 = rhs_pf0.x;
622
- x2 = rhs_pf0.z;
623
- }
624
- #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
625
- x1 = __shfl_xor(x1, 4);
626
- x2 = __shfl_xor(x2, 4);
627
- #else
628
- x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
629
- x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
630
- #endif
631
- if((threadIdx.x%8) < 4) {
632
- rhs_pf0.y = x1;
633
- rhs_pf0.w = x2;
634
- } else {
635
- rhs_pf0.x = x1;
636
- rhs_pf0.z = x2;
637
- }
638
-
639
- // We have 64 features.
640
- // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
641
- // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
642
- // ...
643
- // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
644
- // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
645
- // ...
646
- rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
647
- rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
648
-
649
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
650
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
651
- // ...
652
- // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
653
- // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
654
- // ...
655
-
656
- lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
657
- lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
658
-
659
-
660
- #define add_vals(fl1, fl2, fr1, fr2)\
661
- results[0].x += fl1.x * fr1.x;\
662
- results[0].y += fl1.y * fr1.x;\
663
- results[0].z += fl2.x * fr1.x;\
664
- results[0].w += fl2.y * fr1.x;\
665
- \
666
- results[1].x += fl1.x * fr1.y;\
667
- results[1].y += fl1.y * fr1.y;\
668
- results[1].z += fl2.x * fr1.y;\
669
- results[1].w += fl2.y * fr1.y;\
670
- \
671
- results[2].x += fl1.x * fr2.x;\
672
- results[2].y += fl1.y * fr2.x;\
673
- results[2].z += fl2.x * fr2.x;\
674
- results[2].w += fl2.y * fr2.x;\
675
- \
676
- results[3].x += fl1.x * fr2.y;\
677
- results[3].y += fl1.y * fr2.y;\
678
- results[3].z += fl2.x * fr2.y;\
679
- results[3].w += fl2.y * fr2.y;\
680
-
681
- __syncthreads();
682
-
683
- // Do the multiplies.
684
- #pragma unroll
685
- for (int koff = 0; koff < 16; koff ++) {
686
- // 32 x threads.
687
- float2 fl1 = lhs_shmem2[koff][threadIdx.x];
688
- float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
689
-
690
- int start_feature = threadIdx.y * 4;
691
- float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
692
- float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
693
-
694
- add_vals(fl1, fl2, fr1, fr2)
695
- }
696
- __syncthreads();
697
- }
698
-
699
- #undef prefetch_lhs
700
- #undef add_vals
701
-
702
- Index horiz_base = threadIdx.y*4+base_n;
703
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
704
- for (int i = 0; i < 4; i++) {
705
- output(lhs_vert, horiz_base + i) = results[i].x;
706
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
707
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
708
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
709
- }
710
- } else if (!CHECK_RHS_BOUNDARY) {
711
- // CHECK LHS
712
- if (lhs_vert + 3 < m_size) {
713
- for (int i = 0; i < 4; i++) {
714
- output(lhs_vert, horiz_base + i) = results[i].x;
715
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
716
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
717
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
718
- }
719
- } else if (lhs_vert + 2 < m_size) {
720
- for (int i = 0; i < 4; i++) {
721
- output(lhs_vert, horiz_base + i) = results[i].x;
722
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
723
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
724
- }
725
- } else if (lhs_vert + 1 < m_size) {
726
- for (int i = 0; i < 4; i++) {
727
- output(lhs_vert, horiz_base + i) = results[i].x;
728
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
729
- }
730
- } else if (lhs_vert < m_size) {
731
- for (int i = 0; i < 4; i++) {
732
- output(lhs_vert, horiz_base + i) = results[i].x;
733
- }
734
- }
735
- } else if (!CHECK_LHS_BOUNDARY) {
736
- // CHECK RHS
737
- /*
738
- int ncols_rem = fminf(n_size- horiz_base, 4);
739
- for (int i = 0; i < ncols_rem; i++) {
740
- output(lhs_vert, horiz_base + i) = results[i].x;
741
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
742
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
743
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
744
- }*/
745
- for (int i = 0; i < 4; i++) {
746
- if (horiz_base+i < n_size) {
747
- output(lhs_vert, horiz_base + i) = results[i].x;
748
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
749
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
750
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
751
- }
752
- }
753
- } else {
754
- // CHECK both boundaries.
755
- for (int i = 0; i < 4; i++) {
756
- if (horiz_base+i < n_size) {
757
- if (lhs_vert < m_size)
758
- output(lhs_vert, horiz_base + i) = results[i].x;
759
- if (lhs_vert + 1 < m_size)
760
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
761
- if (lhs_vert + 2 < m_size)
762
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
763
- if (lhs_vert + 3 < m_size)
764
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
765
- }
766
- }
767
- }
768
- }
769
-
770
-
771
- template<typename Index, typename LhsMapper,
772
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
773
- bool CHECK_RHS_BOUNDARY>
774
- __device__ __forceinline__ void
775
- EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
776
- const OutputMapper output, float2 lhs_shmem2[][32],
777
- float2 rhs_shmem2[][8], const Index m_size,
778
- const Index n_size, const Index k_size,
779
- const Index base_m, const Index base_n) {
780
-
781
- // prefetch registers
782
- float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
783
- float4 rhs_pf0, rhs_pf1;
784
-
785
- float4 results[8];
786
- for (int i=0; i < 8; i++) {
787
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
788
- }
789
-
790
- Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
791
- for (Index k = 0; k < k_size; k += 32) {
792
- lhs_pf0 = internal::pset1<float4>(0);
793
- lhs_pf1 = internal::pset1<float4>(0);
794
- lhs_pf2 = internal::pset1<float4>(0);
795
- lhs_pf3 = internal::pset1<float4>(0);
796
-
797
- rhs_pf0 = internal::pset1<float4>(0);
798
- rhs_pf1 = internal::pset1<float4>(0);
799
-
800
- if (!CHECK_LHS_BOUNDARY) {
801
- if ((threadIdx.y/4+k+24) < k_size) {
802
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
803
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
804
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
805
- lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
806
- } else if ((threadIdx.y/4+k+16) < k_size) {
807
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
808
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
809
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
810
- } else if ((threadIdx.y/4+k+8) < k_size) {
811
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
812
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
813
- } else if ((threadIdx.y/4+k) < k_size) {
814
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
815
- }
816
- } else {
817
- // just CHECK_LHS_BOUNDARY
818
- if (lhs_vert + 3 < m_size) {
819
- if ((threadIdx.y/4+k+24) < k_size) {
820
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
821
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
822
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
823
- lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
824
- } else if ((threadIdx.y/4+k+16) < k_size) {
825
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
826
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
827
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
828
- } else if ((threadIdx.y/4+k+8) < k_size) {
829
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
830
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
831
- } else if ((threadIdx.y/4+k) < k_size) {
832
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
833
- }
834
- } else if (lhs_vert + 2 < m_size) {
835
- if ((threadIdx.y/4+k+24) < k_size) {
836
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
837
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
838
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
839
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
840
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
841
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
842
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
843
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
844
- lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
845
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
846
- lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
847
- lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
848
- } else if ((threadIdx.y/4+k+16) < k_size) {
849
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
850
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
851
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
852
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
853
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
854
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
855
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
856
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
857
- lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
858
- } else if ((threadIdx.y/4+k+8) < k_size) {
859
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
860
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
861
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
862
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
863
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
864
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
865
- } else if ((threadIdx.y/4+k) < k_size) {
866
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
867
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
868
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
869
- }
870
- } else if (lhs_vert + 1 < m_size) {
871
- if ((threadIdx.y/4+k+24) < k_size) {
872
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
873
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
874
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
875
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
876
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
877
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
878
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
879
- lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
880
- } else if ((threadIdx.y/4+k+16) < k_size) {
881
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
882
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
883
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
884
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
885
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
886
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
887
- } else if ((threadIdx.y/4+k+8) < k_size) {
888
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
889
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
890
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
891
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
892
- } else if ((threadIdx.y/4+k) < k_size) {
893
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
894
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
895
- }
896
- } else if (lhs_vert < m_size) {
897
- if ((threadIdx.y/4+k+24) < k_size) {
898
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
899
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
900
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
901
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
902
- } else if ((threadIdx.y/4+k+16) < k_size) {
903
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
904
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
905
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
906
- } else if ((threadIdx.y/4+k+8) < k_size) {
907
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
908
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
909
- } else if ((threadIdx.y/4+k) < k_size) {
910
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
911
- }
912
- }
913
- }
914
- __syncthreads();
915
- Index rhs_vert = k+threadIdx.x*4;
916
- Index rhs_horiz0 = threadIdx.y*2+base_n;
917
- Index rhs_horiz1 = threadIdx.y*2+1+base_n;
918
- if (!CHECK_RHS_BOUNDARY) {
919
- if ((rhs_vert + 3) < k_size) {
920
- // just CHECK_RHS_BOUNDARY
921
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
922
- rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
923
- } else if (rhs_vert + 2 < k_size) {
924
- // just CHECK_RHS_BOUNDARY
925
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
926
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
927
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
928
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
929
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
930
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
931
- } else if (rhs_vert + 1 < k_size) {
932
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
933
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
934
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
935
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
936
- } else if (rhs_vert < k_size) {
937
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
938
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
939
- }
940
- } else {
941
- if (rhs_horiz1 < n_size) {
942
- if ((rhs_vert + 3) < k_size) {
943
- // just CHECK_RHS_BOUNDARY
944
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
945
- rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
946
- } else if (rhs_vert + 2 < k_size) {
947
- // just CHECK_RHS_BOUNDARY
948
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
949
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
950
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
951
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
952
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
953
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
954
- } else if (k+threadIdx.x*4 + 1 < k_size) {
955
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
956
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
957
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
958
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
959
- } else if (k+threadIdx.x*4 < k_size) {
960
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
961
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
962
- }
963
- } else if (rhs_horiz0 < n_size) {
964
- if ((rhs_vert + 3) < k_size) {
965
- // just CHECK_RHS_BOUNDARY
966
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
967
- } else if ((rhs_vert + 2) < k_size) {
968
- // just CHECK_RHS_BOUNDARY
969
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
970
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
971
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
972
- } else if ((rhs_vert + 1) < k_size) {
973
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
974
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
975
- } else if (rhs_vert < k_size) {
976
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
977
- }
978
- }
979
- }
980
- __syncthreads();
981
- // Loaded. Do computation
982
- // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
983
- // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
984
- // ..
985
- // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
986
- rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
987
- // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
988
- // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
989
- // ..
990
- rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
991
- // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
992
- // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
993
- rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
994
- // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
995
- // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
996
- rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
997
-
998
- // LHS.
999
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
1000
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
1001
- // ...
1002
- // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
1003
- // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
1004
-
1005
-
1006
- #define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
1007
- results[0].x += a_feat1.x * f1.x;\
1008
- results[1].x += a_feat1.x * f1.y;\
1009
- results[2].x += a_feat1.x * f2.x;\
1010
- results[3].x += a_feat1.x * f2.y;\
1011
- results[4].x += a_feat1.x * f3.x;\
1012
- results[5].x += a_feat1.x * f3.y;\
1013
- results[6].x += a_feat1.x * f4.x;\
1014
- results[7].x += a_feat1.x * f4.y;\
1015
- \
1016
- results[0].y += a_feat1.y * f1.x;\
1017
- results[1].y += a_feat1.y * f1.y;\
1018
- results[2].y += a_feat1.y * f2.x;\
1019
- results[3].y += a_feat1.y * f2.y;\
1020
- results[4].y += a_feat1.y * f3.x;\
1021
- results[5].y += a_feat1.y * f3.y;\
1022
- results[6].y += a_feat1.y * f4.x;\
1023
- results[7].y += a_feat1.y * f4.y;\
1024
- \
1025
- results[0].z += a_feat2.x * f1.x;\
1026
- results[1].z += a_feat2.x * f1.y;\
1027
- results[2].z += a_feat2.x * f2.x;\
1028
- results[3].z += a_feat2.x * f2.y;\
1029
- results[4].z += a_feat2.x * f3.x;\
1030
- results[5].z += a_feat2.x * f3.y;\
1031
- results[6].z += a_feat2.x * f4.x;\
1032
- results[7].z += a_feat2.x * f4.y;\
1033
- \
1034
- results[0].w += a_feat2.y * f1.x;\
1035
- results[1].w += a_feat2.y * f1.y;\
1036
- results[2].w += a_feat2.y * f2.x;\
1037
- results[3].w += a_feat2.y * f2.y;\
1038
- results[4].w += a_feat2.y * f3.x;\
1039
- results[5].w += a_feat2.y * f3.y;\
1040
- results[6].w += a_feat2.y * f4.x;\
1041
- results[7].w += a_feat2.y * f4.y;\
1042
-
1043
- lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
1044
- lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
1045
- lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
1046
- lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
1047
-
1048
- lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
1049
- lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
1050
- lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
1051
- lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
1052
-
1053
- __syncthreads();
1054
-
1055
- // Do the multiplies.
1056
- #pragma unroll
1057
- for (int koff = 0; koff < 32; koff ++) {
1058
- float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
1059
- float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
1060
-
1061
- // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
1062
- int start_feature = (threadIdx.y / 4) * 8;
1063
-
1064
- float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
1065
- float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
1066
- float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
1067
- float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
1068
-
1069
- add_vals(a3, a4, br1, br2, br3, br4)
1070
- }
1071
- __syncthreads();
1072
- } // end loop over k
1073
-
1074
- __syncthreads();
1075
- Index horiz_base = (threadIdx.y/4)*8+base_n;
1076
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
1077
- for (int i = 0; i < 8; i++) {
1078
- output(lhs_vert, horiz_base + i) = results[i].x;
1079
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1080
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1081
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1082
- }
1083
- } else if (!CHECK_RHS_BOUNDARY) {
1084
- if (lhs_vert + 3 < m_size) {
1085
- for (int i = 0; i < 8; i++) {
1086
- output(lhs_vert, horiz_base + i) = results[i].x;
1087
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1088
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1089
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1090
- }
1091
- } else if (lhs_vert + 2 < m_size) {
1092
- for (int i = 0; i < 8; i++) {
1093
- output(lhs_vert, horiz_base + i) = results[i].x;
1094
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1095
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1096
- }
1097
- } else if (lhs_vert + 1 < m_size) {
1098
- for (int i = 0; i < 8; i++) {
1099
- output(lhs_vert, horiz_base + i) = results[i].x;
1100
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1101
- }
1102
- } else if (lhs_vert < m_size) {
1103
- for (int i = 0; i < 8; i++) {
1104
- output(lhs_vert, horiz_base + i) = results[i].x;
1105
- }
1106
- }
1107
- } else if (!CHECK_LHS_BOUNDARY) {
1108
- // CHECK BOUNDARY_B
1109
- for (int i = 0; i < 8; i++) {
1110
- if (horiz_base + i < n_size) {
1111
- output(lhs_vert, horiz_base + i) = results[i].x;
1112
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1113
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1114
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1115
- }
1116
- }
1117
- } else {
1118
- // CHECK both boundaries.
1119
- for (int i = 0; i < 8; i++) {
1120
- if (horiz_base + i < n_size) {
1121
- if (lhs_vert < m_size)
1122
- output(lhs_vert, horiz_base + i) = results[i].x;
1123
- if (lhs_vert + 1 < m_size)
1124
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1125
- if (lhs_vert + 2 < m_size)
1126
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1127
- if (lhs_vert + 3 < m_size)
1128
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1129
- }
1130
- }
1131
- }
1132
- }
1133
-
1134
-
1135
- template<typename Index, typename LhsMapper,
1136
- typename RhsMapper, typename OutputMapper>
1137
- __global__ void
1138
- #if defined(EIGEN_HIPCC)
1139
- __launch_bounds__(256, 1)
1140
- #else
1141
- __launch_bounds__(256)
1142
- #endif
1143
- EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
1144
- const OutputMapper output,
1145
- const Index m_size, const Index n_size, const Index k_size) {
1146
- __shared__ float2 lhs_shmem[64*32];
1147
- __shared__ float2 rhs_shmem[128*8];
1148
-
1149
- typedef float2 LHS_MEM[64][32];
1150
- typedef float2 RHS_MEM[128][8];
1151
-
1152
- const Index m_block_idx = blockIdx.x;
1153
- const Index n_block_idx = blockIdx.y;
1154
-
1155
- const Index base_m = 128 * m_block_idx;
1156
- const Index base_n = 64 * n_block_idx;
1157
-
1158
- bool check_rhs = (base_n + 63) >= n_size;
1159
- bool check_lhs128 = (base_m + 127) >= m_size;
1160
-
1161
- if (!check_rhs) {
1162
- if (!check_lhs128) {
1163
- // >= 128 rows left
1164
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
1165
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1166
- } else {
1167
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
1168
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1169
- }
1170
- } else {
1171
- if (!check_lhs128) {
1172
- // >= 128 rows left
1173
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
1174
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1175
- } else {
1176
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
1177
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1178
- }
1179
- }
1180
- }
1181
-
1182
- template<typename Index, typename LhsMapper,
1183
- typename RhsMapper, typename OutputMapper>
1184
- __global__ void
1185
- #if defined(EIGEN_HIPCC)
1186
- __launch_bounds__(256, 1)
1187
- #else
1188
- __launch_bounds__(256)
1189
- #endif
1190
- EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
1191
- const OutputMapper output,
1192
- const Index m_size, const Index n_size, const Index k_size) {
1193
- __shared__ float2 lhs_shmem[32][16];
1194
- __shared__ float2 rhs_shmem[64][8];
1195
-
1196
- const Index m_block_idx = blockIdx.x;
1197
- const Index n_block_idx = blockIdx.y;
1198
-
1199
- const Index base_m = 64 * m_block_idx;
1200
- const Index base_n = 64 * n_block_idx;
1201
-
1202
- if (base_m + 63 < m_size) {
1203
- if (base_n + 63 < n_size) {
1204
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1205
- } else {
1206
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1207
- }
1208
- } else {
1209
- if (base_n + 63 < n_size) {
1210
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1211
- } else {
1212
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1213
- }
1214
- }
1215
- }
1216
-
1217
-
1218
- template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
1219
- struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
1220
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
1221
-
1222
- typedef GpuDevice Device;
1223
-
1224
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
1225
- typedef TensorContractionEvaluatorBase<Self> Base;
1226
-
1227
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
1228
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
1229
- typedef typename XprType::Index Index;
1230
- typedef typename XprType::CoeffReturnType CoeffReturnType;
1231
- typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
1232
-
1233
- enum {
1234
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
1235
- };
1236
-
1237
- // Most of the code is assuming that both input tensors are ColMajor. If the
1238
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
1239
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
1240
- // will pretend B is LHS and A is RHS.
1241
- typedef typename internal::conditional<
1242
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
1243
- typedef typename internal::conditional<
1244
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
1245
-
1246
- static const int LDims =
1247
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
1248
- static const int RDims =
1249
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
1250
- static const int ContractDims = internal::array_size<Indices>::value;
1251
-
1252
- typedef array<Index, LDims> left_dim_mapper_t;
1253
- typedef array<Index, RDims> right_dim_mapper_t;
1254
-
1255
- typedef array<Index, ContractDims> contract_t;
1256
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
1257
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
1258
-
1259
- static const int NumDims = LDims + RDims - 2 * ContractDims;
1260
-
1261
- typedef DSizes<Index, NumDims> Dimensions;
1262
-
1263
- // typedefs needed in evalTo
1264
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
1265
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
1266
-
1267
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
1268
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
1269
-
1270
- typedef typename LeftEvaluator::Dimensions LeftDimensions;
1271
- typedef typename RightEvaluator::Dimensions RightDimensions;
1272
-
1273
- TensorEvaluator(const XprType& op, const Device& device) :
1274
- Base(op, device)
1275
- {
1276
- EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
1277
- GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
1278
- }
1279
-
1280
- // We need to redefine this method to make nvcc happy
1281
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
1282
- this->m_leftImpl.evalSubExprsIfNeeded(NULL);
1283
- this->m_rightImpl.evalSubExprsIfNeeded(NULL);
1284
- if (data) {
1285
- evalTo(data);
1286
- return false;
1287
- } else {
1288
- this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
1289
- evalTo(this->m_result);
1290
- return true;
1291
- }
1292
- }
1293
-
1294
- void evalTo(Scalar* buffer) const {
1295
- if (this->m_lhs_inner_dim_contiguous) {
1296
- if (this->m_rhs_inner_dim_contiguous) {
1297
- if (this->m_rhs_inner_dim_reordered) {
1298
- evalTyped<true, true, true, Unaligned>(buffer);
1299
- }
1300
- else {
1301
- evalTyped<true, true, false, Unaligned>(buffer);
1302
- }
1303
- }
1304
- else {
1305
- if (this->m_rhs_inner_dim_reordered) {
1306
- evalTyped<true, false, true, Unaligned>(buffer);
1307
- }
1308
- else {
1309
- evalTyped<true, false, false, Unaligned>(buffer);
1310
- }
1311
- }
1312
- }
1313
- else {
1314
- if (this->m_rhs_inner_dim_contiguous) {
1315
- if (this->m_rhs_inner_dim_reordered) {
1316
- evalTyped<false, true, true, Unaligned>(buffer);
1317
- }
1318
- else {
1319
- evalTyped<false, true, false, Unaligned>(buffer);
1320
- }
1321
- }
1322
- else {
1323
- if (this->m_rhs_inner_dim_reordered) {
1324
- evalTyped<false, false, true, Unaligned>(buffer);
1325
- }
1326
- else {
1327
- evalTyped<false, false, false, Unaligned>(buffer);
1328
- }
1329
- }
1330
- }
1331
- }
1332
-
1333
- template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
1334
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
1335
- const Index m_blocks = (m + 63) / 64;
1336
- const Index n_blocks = (n + 63) / 64;
1337
- const dim3 num_blocks(m_blocks, n_blocks, 1);
1338
- const dim3 block_size(8, 8, 8);
1339
- LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
1340
- }
1341
- };
1342
-
1343
- template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
1344
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
1345
- if (m < 768 || n < 768) {
1346
- const Index m_blocks = (m + 63) / 64;
1347
- const Index n_blocks = (n + 63) / 64;
1348
- const dim3 num_blocks(m_blocks, n_blocks, 1);
1349
- const dim3 block_size(16, 16, 1);
1350
- LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
1351
- } else {
1352
- const Index m_blocks = (m + 127) / 128;
1353
- const Index n_blocks = (n + 63) / 64;
1354
- const dim3 num_blocks(m_blocks, n_blocks, 1);
1355
- const dim3 block_size(8, 32, 1);
1356
- LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
1357
- }
1358
- }
1359
- };
1360
-
1361
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
1362
- void evalTyped(Scalar* buffer) const {
1363
- // columns in left side, rows in right side
1364
- const Index k = this->m_k_size;
1365
- EIGEN_UNUSED_VARIABLE(k)
1366
-
1367
- // rows in left side
1368
- const Index m = this->m_i_size;
1369
-
1370
- // columns in right side
1371
- const Index n = this->m_j_size;
1372
-
1373
- // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
1374
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
1375
-
1376
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
1377
- LeftEvaluator, left_nocontract_t,
1378
- contract_t, 4,
1379
- lhs_inner_dim_contiguous,
1380
- false, Unaligned> LhsMapper;
1381
-
1382
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
1383
- RightEvaluator, right_nocontract_t,
1384
- contract_t, 4,
1385
- rhs_inner_dim_contiguous,
1386
- rhs_inner_dim_reordered, Unaligned> RhsMapper;
1387
-
1388
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
1389
-
1390
-
1391
- // initialize data mappers
1392
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
1393
- this->m_left_contracting_strides, this->m_k_strides);
1394
-
1395
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
1396
- this->m_right_contracting_strides, this->m_k_strides);
1397
-
1398
- OutputMapper output(buffer, m);
1399
-
1400
- #if defined(EIGEN_USE_HIP)
1401
- setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
1402
- #else
1403
- setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
1404
- #endif
1405
-
1406
- LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
1407
- }
1408
- };
1409
-
1410
- } // end namespace Eigen
1411
-
1412
- #endif // EIGEN_USE_GPU and EIGEN_GPUCC
1413
- #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H