sequenzo 0.1.17__cp39-cp39-win_amd64.whl → 0.1.19__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (475)
  1. sequenzo/__init__.py +64 -8
  2. sequenzo/big_data/clara/clara.py +1 -1
  3. sequenzo/big_data/clara/utils/get_weighted_diss.c +156 -156
  4. sequenzo/big_data/clara/utils/get_weighted_diss.cp39-win_amd64.pyd +0 -0
  5. sequenzo/clustering/KMedoids.py +39 -0
  6. sequenzo/clustering/clustering_c_code.cp39-win_amd64.pyd +0 -0
  7. sequenzo/clustering/hierarchical_clustering.py +304 -8
  8. sequenzo/define_sequence_data.py +44 -3
  9. sequenzo/dissimilarity_measures/c_code.cp39-win_amd64.pyd +0 -0
  10. sequenzo/dissimilarity_measures/get_distance_matrix.py +1 -2
  11. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
  12. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
  13. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
  14. sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
  15. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
  16. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  17. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
  18. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
  19. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
  20. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
  21. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
  22. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
  23. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  24. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
  25. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
  26. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
  27. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
  28. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
  29. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
  30. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
  31. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
  32. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
  33. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
  34. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
  35. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
  36. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
  37. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
  38. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
  39. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
  40. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
  41. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
  42. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
  43. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
  44. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
  45. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
  46. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
  47. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
  48. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  49. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
  50. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
  51. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
  52. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
  53. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
  54. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
  55. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
  56. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
  57. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
  58. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
  59. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +6 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +54 -2
  63. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +8 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +11 -4
  65. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +18 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +8 -14
  67. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +216 -173
  68. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +1 -1
  70. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +7 -4
  71. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +6 -2
  72. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +32 -18
  73. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +21 -24
  74. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +69 -9
  75. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +156 -156
  76. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cp39-win_amd64.pyd +0 -0
  77. sequenzo/dissimilarity_measures/utils/seqconc.c +156 -156
  78. sequenzo/dissimilarity_measures/utils/seqconc.cp39-win_amd64.pyd +0 -0
  79. sequenzo/dissimilarity_measures/utils/seqdss.c +156 -156
  80. sequenzo/dissimilarity_measures/utils/seqdss.cp39-win_amd64.pyd +0 -0
  81. sequenzo/dissimilarity_measures/utils/seqdur.c +156 -156
  82. sequenzo/dissimilarity_measures/utils/seqdur.cp39-win_amd64.pyd +0 -0
  83. sequenzo/dissimilarity_measures/utils/seqlength.c +156 -156
  84. sequenzo/dissimilarity_measures/utils/seqlength.cp39-win_amd64.pyd +0 -0
  85. sequenzo/multidomain/cat.py +0 -53
  86. sequenzo/multidomain/idcd.py +0 -1
  87. sequenzo/openmp_setup.py +233 -0
  88. sequenzo/sequence_characteristics/__init__.py +4 -0
  89. sequenzo/sequence_characteristics/complexity_index.py +17 -57
  90. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
  91. sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
  92. sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
  93. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
  94. sequenzo/sequence_characteristics/turbulence.py +47 -67
  95. sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
  96. sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
  97. sequenzo/visualization/plot_sequence_index.py +58 -35
  98. sequenzo/visualization/plot_state_distribution.py +57 -36
  99. sequenzo/visualization/plot_transition_matrix.py +21 -22
  100. sequenzo/with_event_history_analysis/__init__.py +35 -0
  101. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  102. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  103. {sequenzo-0.1.17.dist-info → sequenzo-0.1.19.dist-info}/METADATA +48 -14
  104. sequenzo-0.1.19.dist-info/RECORD +272 -0
  105. sequenzo/dissimilarity_measures/setup.py +0 -35
  106. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Cholesky/LDLT.h +0 -688
  107. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Cholesky/LLT.h +0 -558
  108. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +0 -99
  109. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +0 -682
  110. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +0 -346
  111. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +0 -462
  112. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +0 -91
  113. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/EigenSolver.h +0 -622
  114. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +0 -418
  115. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +0 -226
  116. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +0 -374
  117. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +0 -158
  118. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/RealQZ.h +0 -657
  119. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/RealSchur.h +0 -558
  120. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +0 -77
  121. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +0 -904
  122. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +0 -87
  123. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +0 -561
  124. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/AlignedBox.h +0 -486
  125. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/AngleAxis.h +0 -247
  126. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/EulerAngles.h +0 -114
  127. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/Homogeneous.h +0 -501
  128. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/Hyperplane.h +0 -282
  129. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/OrthoMethods.h +0 -235
  130. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/ParametrizedLine.h +0 -232
  131. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/Quaternion.h +0 -870
  132. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/Rotation2D.h +0 -199
  133. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/RotationBase.h +0 -206
  134. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/Scaling.h +0 -188
  135. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/Transform.h +0 -1563
  136. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/Translation.h +0 -202
  137. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/Umeyama.h +0 -166
  138. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +0 -168
  139. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Householder/BlockHouseholder.h +0 -110
  140. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Householder/Householder.h +0 -176
  141. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Householder/HouseholderSequence.h +0 -545
  142. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +0 -226
  143. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +0 -212
  144. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +0 -229
  145. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +0 -394
  146. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +0 -453
  147. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +0 -444
  148. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +0 -198
  149. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +0 -117
  150. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/Jacobi/Jacobi.h +0 -483
  151. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/KLUSupport/KLUSupport.h +0 -358
  152. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/LU/Determinant.h +0 -117
  153. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/LU/FullPivLU.h +0 -877
  154. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/LU/InverseImpl.h +0 -432
  155. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/LU/PartialPivLU.h +0 -624
  156. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +0 -83
  157. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/LU/arch/InverseSize4.h +0 -351
  158. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/MetisSupport/MetisSupport.h +0 -137
  159. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/OrderingMethods/Amd.h +0 -435
  160. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +0 -1863
  161. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/OrderingMethods/Ordering.h +0 -153
  162. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +0 -678
  163. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +0 -545
  164. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/QR/ColPivHouseholderQR.h +0 -674
  165. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +0 -97
  166. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +0 -635
  167. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/QR/FullPivHouseholderQR.h +0 -713
  168. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/QR/HouseholderQR.h +0 -434
  169. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +0 -68
  170. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +0 -335
  171. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SVD/BDCSVD.h +0 -1366
  172. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SVD/JacobiSVD.h +0 -812
  173. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +0 -91
  174. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SVD/SVDBase.h +0 -376
  175. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SVD/UpperBidiagonalization.h +0 -414
  176. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +0 -697
  177. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +0 -174
  178. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/AmbiVector.h +0 -378
  179. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/CompressedStorage.h +0 -274
  180. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +0 -352
  181. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  182. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseAssign.h +0 -270
  183. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseBlock.h +0 -571
  184. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseColEtree.h +0 -206
  185. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +0 -370
  186. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +0 -722
  187. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +0 -150
  188. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +0 -342
  189. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +0 -138
  190. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseDot.h +0 -98
  191. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseFuzzy.h +0 -29
  192. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseMap.h +0 -305
  193. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseMatrix.h +0 -1518
  194. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +0 -398
  195. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparsePermutation.h +0 -178
  196. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseProduct.h +0 -181
  197. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseRedux.h +0 -49
  198. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseRef.h +0 -397
  199. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +0 -659
  200. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseSolverBase.h +0 -124
  201. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +0 -198
  202. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseTranspose.h +0 -92
  203. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseTriangularView.h +0 -189
  204. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseUtil.h +0 -186
  205. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseVector.h +0 -478
  206. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/SparseView.h +0 -254
  207. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseCore/TriangularSolver.h +0 -315
  208. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU.h +0 -923
  209. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLUImpl.h +0 -66
  210. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +0 -226
  211. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +0 -110
  212. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +0 -375
  213. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +0 -80
  214. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +0 -181
  215. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +0 -179
  216. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +0 -107
  217. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  218. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +0 -126
  219. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +0 -130
  220. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +0 -223
  221. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +0 -258
  222. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +0 -137
  223. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +0 -136
  224. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +0 -83
  225. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SparseQR/SparseQR.h +0 -758
  226. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/StlSupport/StdDeque.h +0 -116
  227. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/StlSupport/StdList.h +0 -106
  228. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/StlSupport/StdVector.h +0 -131
  229. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/StlSupport/details.h +0 -84
  230. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +0 -1025
  231. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +0 -642
  232. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/misc/Image.h +0 -82
  233. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/misc/Kernel.h +0 -79
  234. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/misc/RealSvd2x2.h +0 -55
  235. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/misc/blas.h +0 -440
  236. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/misc/lapack.h +0 -152
  237. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/misc/lapacke.h +0 -16292
  238. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/misc/lapacke_mangling.h +0 -17
  239. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
  240. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
  241. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/plugins/BlockMethods.h +0 -1442
  242. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  243. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -177
  244. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
  245. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  246. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
  247. sequenzo/dissimilarity_measures/src/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
  248. sequenzo/dissimilarity_measures/src/eigen/bench/BenchSparseUtil.h +0 -149
  249. sequenzo/dissimilarity_measures/src/eigen/bench/BenchTimer.h +0 -199
  250. sequenzo/dissimilarity_measures/src/eigen/bench/BenchUtil.h +0 -92
  251. sequenzo/dissimilarity_measures/src/eigen/bench/basicbenchmark.h +0 -63
  252. sequenzo/dissimilarity_measures/src/eigen/bench/btl/generic_bench/utils/utilities.h +0 -90
  253. sequenzo/dissimilarity_measures/src/eigen/bench/btl/libs/BLAS/blas.h +0 -675
  254. sequenzo/dissimilarity_measures/src/eigen/bench/btl/libs/BLAS/c_interface_base.h +0 -73
  255. sequenzo/dissimilarity_measures/src/eigen/bench/perf_monitoring/gemm_common.h +0 -67
  256. sequenzo/dissimilarity_measures/src/eigen/bench/perf_monitoring/gemv_common.h +0 -69
  257. sequenzo/dissimilarity_measures/src/eigen/bench/spbench/spbenchsolver.h +0 -573
  258. sequenzo/dissimilarity_measures/src/eigen/bench/spbench/spbenchstyle.h +0 -95
  259. sequenzo/dissimilarity_measures/src/eigen/bench/tensors/benchmark.h +0 -49
  260. sequenzo/dissimilarity_measures/src/eigen/bench/tensors/tensor_benchmarks.h +0 -597
  261. sequenzo/dissimilarity_measures/src/eigen/blas/BandTriangularSolver.h +0 -97
  262. sequenzo/dissimilarity_measures/src/eigen/blas/GeneralRank1Update.h +0 -44
  263. sequenzo/dissimilarity_measures/src/eigen/blas/PackedSelfadjointProduct.h +0 -53
  264. sequenzo/dissimilarity_measures/src/eigen/blas/PackedTriangularMatrixVector.h +0 -79
  265. sequenzo/dissimilarity_measures/src/eigen/blas/PackedTriangularSolverVector.h +0 -88
  266. sequenzo/dissimilarity_measures/src/eigen/blas/Rank2Update.h +0 -57
  267. sequenzo/dissimilarity_measures/src/eigen/blas/common.h +0 -175
  268. sequenzo/dissimilarity_measures/src/eigen/blas/f2c/datatypes.h +0 -24
  269. sequenzo/dissimilarity_measures/src/eigen/blas/level1_cplx_impl.h +0 -155
  270. sequenzo/dissimilarity_measures/src/eigen/blas/level1_impl.h +0 -144
  271. sequenzo/dissimilarity_measures/src/eigen/blas/level1_real_impl.h +0 -122
  272. sequenzo/dissimilarity_measures/src/eigen/blas/level2_cplx_impl.h +0 -360
  273. sequenzo/dissimilarity_measures/src/eigen/blas/level2_impl.h +0 -553
  274. sequenzo/dissimilarity_measures/src/eigen/blas/level2_real_impl.h +0 -306
  275. sequenzo/dissimilarity_measures/src/eigen/blas/level3_impl.h +0 -702
  276. sequenzo/dissimilarity_measures/src/eigen/debug/gdb/__init__.py +0 -1
  277. sequenzo/dissimilarity_measures/src/eigen/debug/gdb/printers.py +0 -314
  278. sequenzo/dissimilarity_measures/src/eigen/demos/mandelbrot/mandelbrot.h +0 -71
  279. sequenzo/dissimilarity_measures/src/eigen/demos/mix_eigen_and_c/binary_library.h +0 -71
  280. sequenzo/dissimilarity_measures/src/eigen/demos/opengl/camera.h +0 -118
  281. sequenzo/dissimilarity_measures/src/eigen/demos/opengl/gpuhelper.h +0 -207
  282. sequenzo/dissimilarity_measures/src/eigen/demos/opengl/icosphere.h +0 -30
  283. sequenzo/dissimilarity_measures/src/eigen/demos/opengl/quaternion_demo.h +0 -114
  284. sequenzo/dissimilarity_measures/src/eigen/demos/opengl/trackball.h +0 -42
  285. sequenzo/dissimilarity_measures/src/eigen/lapack/lapack_common.h +0 -29
  286. sequenzo/dissimilarity_measures/src/eigen/scripts/relicense.py +0 -69
  287. sequenzo/dissimilarity_measures/src/eigen/test/AnnoyingScalar.h +0 -165
  288. sequenzo/dissimilarity_measures/src/eigen/test/MovableScalar.h +0 -35
  289. sequenzo/dissimilarity_measures/src/eigen/test/SafeScalar.h +0 -30
  290. sequenzo/dissimilarity_measures/src/eigen/test/bug1213.h +0 -8
  291. sequenzo/dissimilarity_measures/src/eigen/test/evaluator_common.h +0 -0
  292. sequenzo/dissimilarity_measures/src/eigen/test/gpu_common.h +0 -176
  293. sequenzo/dissimilarity_measures/src/eigen/test/main.h +0 -857
  294. sequenzo/dissimilarity_measures/src/eigen/test/packetmath_test_shared.h +0 -275
  295. sequenzo/dissimilarity_measures/src/eigen/test/product.h +0 -259
  296. sequenzo/dissimilarity_measures/src/eigen/test/random_without_cast_overflow.h +0 -152
  297. sequenzo/dissimilarity_measures/src/eigen/test/solverbase.h +0 -36
  298. sequenzo/dissimilarity_measures/src/eigen/test/sparse.h +0 -204
  299. sequenzo/dissimilarity_measures/src/eigen/test/sparse_solver.h +0 -699
  300. sequenzo/dissimilarity_measures/src/eigen/test/split_test_helper.h +0 -5994
  301. sequenzo/dissimilarity_measures/src/eigen/test/svd_common.h +0 -521
  302. sequenzo/dissimilarity_measures/src/eigen/test/svd_fill.h +0 -118
  303. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +0 -554
  304. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +0 -329
  305. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +0 -247
  306. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +0 -1176
  307. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +0 -1559
  308. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +0 -1093
  309. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +0 -518
  310. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +0 -377
  311. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +0 -1023
  312. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +0 -73
  313. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +0 -6
  314. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +0 -1413
  315. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +0 -575
  316. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +0 -1650
  317. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +0 -1679
  318. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +0 -456
  319. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +0 -1132
  320. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +0 -544
  321. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +0 -214
  322. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +0 -347
  323. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +0 -137
  324. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +0 -6
  325. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +0 -104
  326. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +0 -389
  327. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +0 -1048
  328. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +0 -409
  329. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h +0 -236
  330. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +0 -490
  331. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +0 -236
  332. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +0 -983
  333. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +0 -703
  334. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +0 -388
  335. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +0 -669
  336. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +0 -379
  337. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +0 -237
  338. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +0 -191
  339. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +0 -488
  340. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +0 -302
  341. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h +0 -33
  342. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +0 -99
  343. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h +0 -44
  344. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +0 -79
  345. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +0 -603
  346. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +0 -738
  347. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +0 -247
  348. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +0 -82
  349. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +0 -263
  350. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +0 -216
  351. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +0 -98
  352. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +0 -327
  353. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +0 -311
  354. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +0 -1102
  355. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +0 -708
  356. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +0 -291
  357. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +0 -322
  358. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +0 -998
  359. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +0 -6
  360. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +0 -966
  361. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +0 -582
  362. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +0 -454
  363. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +0 -465
  364. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +0 -528
  365. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h +0 -513
  366. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +0 -471
  367. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +0 -161
  368. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +0 -346
  369. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +0 -303
  370. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +0 -264
  371. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +0 -249
  372. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +0 -629
  373. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +0 -293
  374. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +0 -236
  375. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h +0 -338
  376. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h +0 -669
  377. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h +0 -67
  378. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +0 -249
  379. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +0 -486
  380. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +0 -236
  381. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h +0 -23
  382. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h +0 -40
  383. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h +0 -301
  384. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h +0 -48
  385. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h +0 -20
  386. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +0 -537
  387. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h +0 -88
  388. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h +0 -261
  389. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h +0 -158
  390. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h +0 -108
  391. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +0 -730
  392. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h +0 -220
  393. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/BVH/BVAlgorithms.h +0 -293
  394. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/BVH/KdBVH.h +0 -223
  395. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h +0 -790
  396. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h +0 -355
  397. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h +0 -305
  398. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/FFT/ei_fftw_impl.h +0 -261
  399. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/FFT/ei_kissfft_impl.h +0 -449
  400. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h +0 -187
  401. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/IterativeSolvers/DGMRES.h +0 -511
  402. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/IterativeSolvers/GMRES.h +0 -335
  403. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/IterativeSolvers/IDRS.h +0 -436
  404. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h +0 -90
  405. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/IterativeSolvers/IterationController.h +0 -154
  406. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/IterativeSolvers/MINRES.h +0 -267
  407. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/IterativeSolvers/Scaling.h +0 -193
  408. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h +0 -305
  409. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h +0 -84
  410. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h +0 -202
  411. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h +0 -160
  412. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h +0 -188
  413. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h +0 -396
  414. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +0 -441
  415. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +0 -569
  416. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +0 -373
  417. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +0 -705
  418. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +0 -368
  419. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/MatrixFunctions/StemFunction.h +0 -117
  420. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/MoreVectorization/MathFunctions.h +0 -95
  421. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h +0 -601
  422. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h +0 -657
  423. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NonLinearOptimization/chkder.h +0 -66
  424. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NonLinearOptimization/covar.h +0 -70
  425. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NonLinearOptimization/dogleg.h +0 -107
  426. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h +0 -79
  427. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NonLinearOptimization/lmpar.h +0 -298
  428. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h +0 -91
  429. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h +0 -30
  430. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NonLinearOptimization/r1updt.h +0 -99
  431. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h +0 -49
  432. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h +0 -130
  433. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Polynomials/Companion.h +0 -280
  434. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +0 -428
  435. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Polynomials/PolynomialUtils.h +0 -143
  436. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h +0 -352
  437. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Skyline/SkylineMatrix.h +0 -862
  438. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h +0 -212
  439. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Skyline/SkylineProduct.h +0 -295
  440. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Skyline/SkylineStorage.h +0 -259
  441. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Skyline/SkylineUtil.h +0 -89
  442. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h +0 -122
  443. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h +0 -1079
  444. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h +0 -404
  445. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h +0 -282
  446. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h +0 -247
  447. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SparseExtra/RandomSetter.h +0 -349
  448. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h +0 -286
  449. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h +0 -68
  450. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h +0 -357
  451. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h +0 -66
  452. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h +0 -1959
  453. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h +0 -118
  454. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h +0 -67
  455. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h +0 -167
  456. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h +0 -58
  457. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h +0 -330
  458. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h +0 -58
  459. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +0 -2045
  460. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h +0 -79
  461. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h +0 -46
  462. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h +0 -16
  463. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h +0 -46
  464. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h +0 -16
  465. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h +0 -369
  466. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h +0 -54
  467. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h +0 -34
  468. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Splines/Spline.h +0 -507
  469. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Splines/SplineFitting.h +0 -431
  470. sequenzo/dissimilarity_measures/src/eigen/unsupported/Eigen/src/Splines/SplineFwd.h +0 -93
  471. sequenzo/dissimilarity_measures/src/eigen/unsupported/test/matrix_functions.h +0 -67
  472. sequenzo-0.1.17.dist-info/RECORD +0 -631
  473. {sequenzo-0.1.17.dist-info → sequenzo-0.1.19.dist-info}/WHEEL +0 -0
  474. {sequenzo-0.1.17.dist-info → sequenzo-0.1.19.dist-info}/licenses/LICENSE +0 -0
  475. {sequenzo-0.1.17.dist-info → sequenzo-0.1.19.dist-info}/top_level.txt +0 -0
@@ -1,1413 +0,0 @@
1
- // This file is part of Eigen, a lightweight C++ template library
2
- // for linear algebra.
3
- //
4
- // Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
- // Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
6
- // Copyright (C) 2014 Eric Martin <eric@ericmart.in>
7
- //
8
- // This Source Code Form is subject to the terms of the Mozilla
9
- // Public License v. 2.0. If a copy of the MPL was not distributed
10
- // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11
-
12
- #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
13
- #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
14
-
15
- #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
16
-
17
- namespace Eigen {
18
-
19
- template<typename Scalar, typename Index, typename LhsMapper,
20
- typename RhsMapper, typename OutputMapper, bool needs_edge_check>
21
- __device__ EIGEN_STRONG_INLINE void
22
- EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
23
- const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
24
- const Index m_size, const Index n_size, const Index k_size) {
25
-
26
- const Index m_block_idx = blockIdx.x;
27
- const Index n_block_idx = blockIdx.y;
28
-
29
- const Index base_m = 64 * m_block_idx;
30
- const Index base_n = 64 * n_block_idx;
31
-
32
- // declare and initialize 64 registers for output 8x8 block
33
-
34
- // prefetch registers
35
- Scalar lhs_pf0;
36
- Scalar lhs_pf1;
37
- Scalar lhs_pf2;
38
- Scalar lhs_pf3;
39
- Scalar lhs_pf4;
40
- Scalar lhs_pf5;
41
- Scalar lhs_pf6;
42
- Scalar lhs_pf7;
43
-
44
- Scalar rhs_pf0;
45
- Scalar rhs_pf1;
46
- Scalar rhs_pf2;
47
- Scalar rhs_pf3;
48
- Scalar rhs_pf4;
49
- Scalar rhs_pf5;
50
- Scalar rhs_pf6;
51
- Scalar rhs_pf7;
52
-
53
- // shared memory is formatted
54
- // (contract idx in block, nocontract idx in block, block idx)
55
- // where block idx is column major. This transposition limits the number of
56
- // bank conflicts when reading the LHS. The core idea is that since the contracting
57
- // index is shared by both sides, then the contracting index should be in threadIdx.x.
58
-
59
- // On the LHS, we pad each row inside of each block with an extra element. This makes
60
- // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
61
- // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
62
-
63
- // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
64
- // conflicts on writes and also none on reads.
65
-
66
- // storage indices
67
- const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
68
- const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
69
-
70
- const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
71
- const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
72
- const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
73
- const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
74
- const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
75
- const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
76
- const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
77
- const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
78
-
79
- const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
80
- const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
81
- const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
82
- const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
83
- const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
84
- const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
85
- const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
86
- const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
87
-
88
- // in the loading code, the following variables are important:
89
- // threadIdx.x: the vertical position in an 8x8 block
90
- // threadIdx.y: the vertical index of the 8x8 block in the grid
91
- // threadIdx.z: the horizontal position in an 8x8 block
92
- // k: the horizontal index of the 8x8 block in the grid
93
- //
94
- // The k parameter is implicit (it was the loop counter for a loop that went
95
- // from 0 to <8, but now that loop is unrolled in the code below).
96
-
97
- const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
98
- const Index lhs_vert = base_m + load_idx_vert;
99
-
100
- #define prefetchIntoRegisters(base_k) \
101
- { \
102
- lhs_pf0 = conv(0); \
103
- lhs_pf1 = conv(0); \
104
- lhs_pf2 = conv(0); \
105
- lhs_pf3 = conv(0); \
106
- lhs_pf4 = conv(0); \
107
- lhs_pf5 = conv(0); \
108
- lhs_pf6 = conv(0); \
109
- lhs_pf7 = conv(0); \
110
- \
111
- rhs_pf0 = conv(0); \
112
- rhs_pf1 = conv(0); \
113
- rhs_pf2 = conv(0); \
114
- rhs_pf3 = conv(0); \
115
- rhs_pf4 = conv(0); \
116
- rhs_pf5 = conv(0); \
117
- rhs_pf6 = conv(0); \
118
- rhs_pf7 = conv(0); \
119
- \
120
- if (!needs_edge_check || lhs_vert < m_size) { \
121
- const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \
122
- const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \
123
- const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \
124
- const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \
125
- const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \
126
- const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \
127
- const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \
128
- const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \
129
- \
130
- if (!needs_edge_check || lhs_horiz_7 < k_size) { \
131
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
132
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
133
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
134
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
135
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
136
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
137
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
138
- lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
139
- } else if (lhs_horiz_6 < k_size) { \
140
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
141
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
142
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
143
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
144
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
145
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
146
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
147
- } else if (lhs_horiz_5 < k_size) { \
148
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
149
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
150
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
151
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
152
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
153
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
154
- } else if (lhs_horiz_4 < k_size) { \
155
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
156
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
157
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
158
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
159
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
160
- } else if (lhs_horiz_3 < k_size) { \
161
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
162
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
163
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
164
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
165
- } else if (lhs_horiz_2 < k_size) { \
166
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
167
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
168
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
169
- } else if (lhs_horiz_1 < k_size) { \
170
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
171
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
172
- } else if (lhs_horiz_0 < k_size) { \
173
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
174
- } \
175
- } \
176
- \
177
- const Index rhs_vert = base_k + load_idx_vert; \
178
- if (!needs_edge_check || rhs_vert < k_size) { \
179
- const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \
180
- const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \
181
- const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \
182
- const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \
183
- const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \
184
- const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \
185
- const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \
186
- const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \
187
- \
188
- if (rhs_horiz_7 < n_size) { \
189
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
190
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
191
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
192
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
193
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
194
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
195
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
196
- rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
197
- } else if (rhs_horiz_6 < n_size) { \
198
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
199
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
200
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
201
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
202
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
203
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
204
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
205
- } else if (rhs_horiz_5 < n_size) { \
206
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
207
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
208
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
209
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
210
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
211
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
212
- } else if (rhs_horiz_4 < n_size) { \
213
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
214
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
215
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
216
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
217
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
218
- } else if (rhs_horiz_3 < n_size) { \
219
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
220
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
221
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
222
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
223
- } else if (rhs_horiz_2 < n_size) { \
224
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
225
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
226
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
227
- } else if (rhs_horiz_1 < n_size) { \
228
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
229
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
230
- } else if (rhs_horiz_0 < n_size) { \
231
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
232
- } \
233
- } \
234
- } \
235
-
236
- #define writeRegToShmem(_) \
237
- lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
238
- rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
239
- \
240
- lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
241
- rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
242
- \
243
- lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
244
- rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
245
- \
246
- lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
247
- rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
248
- \
249
- lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
250
- rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
251
- \
252
- lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
253
- rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
254
- \
255
- lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
256
- rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
257
- \
258
- lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
259
- rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
260
-
261
- // declare and initialize result array
262
- #define res(i, j) _res_##i##j
263
- #define initResultRow(i) \
264
- Scalar res(i, 0) = conv(0); \
265
- Scalar res(i, 1) = conv(0); \
266
- Scalar res(i, 2) = conv(0); \
267
- Scalar res(i, 3) = conv(0); \
268
- Scalar res(i, 4) = conv(0); \
269
- Scalar res(i, 5) = conv(0); \
270
- Scalar res(i, 6) = conv(0); \
271
- Scalar res(i, 7) = conv(0); \
272
-
273
- internal::scalar_cast_op<int, Scalar> conv;
274
- initResultRow(0);
275
- initResultRow(1);
276
- initResultRow(2);
277
- initResultRow(3);
278
- initResultRow(4);
279
- initResultRow(5);
280
- initResultRow(6);
281
- initResultRow(7);
282
- #undef initResultRow
283
-
284
- for (Index base_k = 0; base_k < k_size; base_k += 64) {
285
- // wait for previous iteration to finish with shmem. Despite common sense,
286
- // the code is a bit faster with this here than at the bottom of the loop
287
- __syncthreads();
288
-
289
- prefetchIntoRegisters(base_k);
290
- writeRegToShmem();
291
-
292
- #undef prefetchIntoRegisters
293
- #undef writeRegToShmem
294
-
295
- // wait for shared mem packing to be done before starting computation
296
- __syncthreads();
297
-
298
- // compute 8x8 matrix product by outer product. This involves packing one column
299
- // of LHS and one row of RHS into registers (takes 16 registers).
300
-
301
- #define lcol(i) _lcol##i
302
- Scalar lcol(0);
303
- Scalar lcol(1);
304
- Scalar lcol(2);
305
- Scalar lcol(3);
306
- Scalar lcol(4);
307
- Scalar lcol(5);
308
- Scalar lcol(6);
309
- Scalar lcol(7);
310
-
311
- #define rrow(j) _rrow##j
312
- Scalar rrow(0);
313
- Scalar rrow(1);
314
- Scalar rrow(2);
315
- Scalar rrow(3);
316
- Scalar rrow(4);
317
- Scalar rrow(5);
318
- Scalar rrow(6);
319
- Scalar rrow(7);
320
-
321
- // Now x corresponds to k, y to m, and z to n
322
- const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
323
- const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
324
-
325
- #define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
326
- #define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
327
-
328
- #define loadData(i, j) \
329
- lcol(0) = lhs_element(0, j); \
330
- rrow(0) = rhs_element(i, 0); \
331
- lcol(1) = lhs_element(1, j); \
332
- rrow(1) = rhs_element(i, 1); \
333
- lcol(2) = lhs_element(2, j); \
334
- rrow(2) = rhs_element(i, 2); \
335
- lcol(3) = lhs_element(3, j); \
336
- rrow(3) = rhs_element(i, 3); \
337
- lcol(4) = lhs_element(4, j); \
338
- rrow(4) = rhs_element(i, 4); \
339
- lcol(5) = lhs_element(5, j); \
340
- rrow(5) = rhs_element(i, 5); \
341
- lcol(6) = lhs_element(6, j); \
342
- rrow(6) = rhs_element(i, 6); \
343
- lcol(7) = lhs_element(7, j); \
344
- rrow(7) = rhs_element(i, 7); \
345
-
346
- #define computeCol(j) \
347
- res(0, j) += lcol(0) * rrow(j); \
348
- res(1, j) += lcol(1) * rrow(j); \
349
- res(2, j) += lcol(2) * rrow(j); \
350
- res(3, j) += lcol(3) * rrow(j); \
351
- res(4, j) += lcol(4) * rrow(j); \
352
- res(5, j) += lcol(5) * rrow(j); \
353
- res(6, j) += lcol(6) * rrow(j); \
354
- res(7, j) += lcol(7) * rrow(j); \
355
-
356
- #define computePass(i) \
357
- loadData(i, i); \
358
- \
359
- computeCol(0); \
360
- computeCol(1); \
361
- computeCol(2); \
362
- computeCol(3); \
363
- computeCol(4); \
364
- computeCol(5); \
365
- computeCol(6); \
366
- computeCol(7); \
367
-
368
- computePass(0);
369
- computePass(1);
370
- computePass(2);
371
- computePass(3);
372
- computePass(4);
373
- computePass(5);
374
- computePass(6);
375
- computePass(7);
376
-
377
- #undef lcol
378
- #undef rrow
379
- #undef lhs_element
380
- #undef rhs_element
381
- #undef loadData
382
- #undef computeCol
383
- #undef computePass
384
- } // end loop over k
385
-
386
- // we've now iterated over all of the large (ie width 64) k blocks and
387
- // accumulated results in registers. At this point thread (x, y, z) contains
388
- // the sum across all big k blocks of the product of little k block of index (x, y)
389
- // with block of index (y, z). To compute the final output, we need to reduce
390
- // the 8 threads over y by summation.
391
- #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
392
- #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
393
- #else
394
- #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
395
- #endif
396
-
397
- #define reduceRow(i, mask) \
398
- shuffleInc(i, 0, mask); \
399
- shuffleInc(i, 1, mask); \
400
- shuffleInc(i, 2, mask); \
401
- shuffleInc(i, 3, mask); \
402
- shuffleInc(i, 4, mask); \
403
- shuffleInc(i, 5, mask); \
404
- shuffleInc(i, 6, mask); \
405
- shuffleInc(i, 7, mask); \
406
-
407
- #define reduceMatrix(mask) \
408
- reduceRow(0, mask); \
409
- reduceRow(1, mask); \
410
- reduceRow(2, mask); \
411
- reduceRow(3, mask); \
412
- reduceRow(4, mask); \
413
- reduceRow(5, mask); \
414
- reduceRow(6, mask); \
415
- reduceRow(7, mask); \
416
-
417
- // actually perform the reduction, now each thread of index (_, y, z)
418
- // contains the correct values in its registers that belong in the output
419
- // block
420
- reduceMatrix(1);
421
- reduceMatrix(2);
422
- reduceMatrix(4);
423
-
424
- #undef shuffleInc
425
- #undef reduceRow
426
- #undef reduceMatrix
427
-
428
- // now we need to copy the 64 values into main memory. We can't split work
429
- // among threads because all variables are in registers. There are 2 ways
430
- // to do this:
431
- // (1) have 1 thread do 64 writes from registers into global memory
432
- // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
433
- // each do 8 writes into global memory. We can just overwrite the shared
434
- // memory from the problem we just solved.
435
- // (2) is slightly faster than (1) due to less branching and more ILP
436
-
437
- // TODO: won't yield much gain, but could just use currently unused shared mem
438
- // and then we won't have to sync
439
- // wait for shared mem to be out of use
440
- __syncthreads();
441
-
442
- #define writeResultShmem(i, j) \
443
- lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
444
-
445
- #define writeRow(i) \
446
- writeResultShmem(i, 0); \
447
- writeResultShmem(i, 1); \
448
- writeResultShmem(i, 2); \
449
- writeResultShmem(i, 3); \
450
- writeResultShmem(i, 4); \
451
- writeResultShmem(i, 5); \
452
- writeResultShmem(i, 6); \
453
- writeResultShmem(i, 7); \
454
-
455
- if (threadIdx.x == 0) {
456
- writeRow(0);
457
- writeRow(1);
458
- writeRow(2);
459
- writeRow(3);
460
- writeRow(4);
461
- writeRow(5);
462
- writeRow(6);
463
- writeRow(7);
464
- }
465
- #undef writeResultShmem
466
- #undef writeRow
467
-
468
- const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
469
- const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
470
-
471
- if (threadIdx.x < max_i_write) {
472
- if (max_j_write == 8) {
473
- // TODO: can I trade bank conflicts for coalesced writes?
474
- Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
475
- Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
476
- Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
477
- Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
478
- Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
479
- Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
480
- Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
481
- Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
482
-
483
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
484
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
485
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
486
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
487
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
488
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
489
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
490
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
491
- } else {
492
- #pragma unroll 7
493
- for (int j = 0; j < max_j_write; j++) {
494
- Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
495
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
496
- }
497
- }
498
- }
499
- #undef res
500
- }
501
-
502
-
503
// Generic (any Scalar) contraction kernel entry point.
//
// blockIdx.x / blockIdx.y select a 64x64 tile of the output along m / n.
// A tile whose last row (origin + 63) and last column are both inside
// m_size x n_size is forwarded to EigenContractionKernelInternal with the
// trailing template flag set to false; every other tile is forwarded with the
// flag set to true.
// NOTE(review): the flag presumably enables edge checks inside the internal
// kernel - confirm against its definition.
template <typename Scalar, typename Index, typename LhsMapper,
          typename RhsMapper, typename OutputMapper>
__global__ void
#if defined(EIGEN_HIPCC)
__launch_bounds__(512, 1)
#else
__launch_bounds__(512)
#endif
EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
                       const OutputMapper output,
                       const Index m_size, const Index n_size, const Index k_size) {
  // Staging buffers handed to the internal kernel.
  __shared__ Scalar lhs_shmem[72 * 64];
  __shared__ Scalar rhs_shmem[72 * 64];

  // Origin of this block's tile in the output.
  const Index tile_row0 = 64 * static_cast<Index>(blockIdx.x);
  const Index tile_col0 = 64 * static_cast<Index>(blockIdx.y);

  const bool tile_is_interior =
      (tile_row0 + 63 < m_size) && (tile_col0 + 63 < n_size);

  if (!tile_is_interior) {
    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(
        lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
  } else {
    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(
        lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
  }
}
529
-
530
-
531
// Device-side body of the float contraction kernel launched through
// EigenFloatContractionKernel16x16.
//
// Each thread accumulates a 4x4 patch of the output in results[0..3]:
// rows lhs_vert .. lhs_vert+3 (the x/y/z/w components) and columns
// horiz_base .. horiz_base+3 (one float4 per column). The contraction
// dimension is walked in steps of 16.
//
// Template flags:
//   CHECK_LHS_BOUNDARY - guard row accesses against m_size.
//   CHECK_RHS_BOUNDARY - guard column accesses against n_size.
// The k range is guarded against k_size in every variant.
//
// lhs_shmem2 / rhs_shmem2 are staging buffers supplied by the caller
// (EigenFloatContractionKernel16x16 passes its __shared__ arrays).
- template<typename Index, typename LhsMapper,
532
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
533
- bool CHECK_RHS_BOUNDARY>
534
- __device__ __forceinline__ void
535
- EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
536
- const OutputMapper output, float2 lhs_shmem2[][16],
537
- float2 rhs_shmem2[][8], const Index m_size,
538
- const Index n_size, const Index k_size,
539
- const Index base_m, const Index base_n) {
540
-
541
- // prefetch registers
542
- float4 lhs_pf0, rhs_pf0;
543
-
// Accumulators for the 4x4 output patch, cleared before the k loop.
544
- float4 results[4];
545
- for (int i=0; i < 4; i++) {
546
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
547
- }
548
-
// prefetch_lhs(reg, row, col): load LHS rows row..row+3 of column col into
// reg when col < k_size. With CHECK_LHS_BOUNDARY only the rows below m_size
// are read; the remaining components of reg keep their previous value
// (zero, because lhs_pf0 is reset at the top of every k iteration).
549
- #define prefetch_lhs(reg, row, col) \
550
- if (!CHECK_LHS_BOUNDARY) { \
551
- if (col < k_size) { \
552
- reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
553
- } \
554
- } else { \
555
- if (col < k_size) { \
556
- if (row + 3 < m_size) { \
557
- reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
558
- } else if (row + 2 < m_size) { \
559
- reg.x =lhs(row + 0, col); \
560
- reg.y =lhs(row + 1, col); \
561
- reg.z =lhs(row + 2, col); \
562
- } else if (row + 1 < m_size) { \
563
- reg.x =lhs(row + 0, col); \
564
- reg.y =lhs(row + 1, col); \
565
- } else if (row < m_size) { \
566
- reg.x =lhs(row + 0, col); \
567
- } \
568
- } \
569
- } \
570
-
// First of the four output rows owned by this thread.
571
- Index lhs_vert = base_m+threadIdx.x*4;
572
-
573
- for (Index k = 0; k < k_size; k += 16) {
574
-
// Reset the prefetch registers so that skipped (out-of-range) loads
// contribute zero to the products below.
575
- lhs_pf0 = internal::pset1<float4>(0);
576
- rhs_pf0 = internal::pset1<float4>(0);
577
-
578
- Index lhs_horiz = threadIdx.y+k;
579
- prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
580
-
// RHS element range fetched by this thread: rows rhs_vert..rhs_vert+3 of
// column rhs_horiz0.
581
- Index rhs_vert = k+(threadIdx.x%4)*4;
582
- Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
583
-
584
- if (!CHECK_RHS_BOUNDARY) {
585
- if ((rhs_vert + 3) < k_size) {
586
- // no n_size guard in this variant; only the k range is checked
587
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
588
- } else if (rhs_vert + 2 < k_size) {
589
- // no n_size guard in this variant; only the k range is checked
590
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
591
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
592
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
593
- } else if (rhs_vert + 1 < k_size) {
594
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
595
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
596
- } else if (rhs_vert < k_size) {
597
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
598
- }
599
- } else {
// Same k-range ladder as above, additionally skipped entirely when the
// column lies at or beyond n_size.
600
- if (rhs_horiz0 < n_size) {
601
- if ((rhs_vert + 3) < k_size) {
602
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
603
- } else if ((rhs_vert + 2) < k_size) {
604
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
605
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
606
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
607
- } else if ((rhs_vert + 1) < k_size) {
608
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
609
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
610
- } else if (rhs_vert < k_size) {
611
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
612
- }
613
- }
614
- }
// Swap half of the prefetched RHS values with the thread selected by the
// xor-4 shuffle: threads with threadIdx.x%8 < 4 send y/w and store what they
// receive back into y/w; the others do the same with x/z.
615
- float x1, x2 ;
616
- // the following can be a bitwise operation..... some day.
617
- if((threadIdx.x%8) < 4) {
618
- x1 = rhs_pf0.y;
619
- x2 = rhs_pf0.w;
620
- } else {
621
- x1 = rhs_pf0.x;
622
- x2 = rhs_pf0.z;
623
- }
// HIP and CUDA SDKs older than 9.0 take the legacy __shfl_xor; otherwise the
// _sync variant is used with an all-ones participation mask.
624
- #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
625
- x1 = __shfl_xor(x1, 4);
626
- x2 = __shfl_xor(x2, 4);
627
- #else
628
- x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
629
- x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
630
- #endif
631
- if((threadIdx.x%8) < 4) {
632
- rhs_pf0.y = x1;
633
- rhs_pf0.w = x2;
634
- } else {
635
- rhs_pf0.x = x1;
636
- rhs_pf0.z = x2;
637
- }
638
-
639
- // We have 64 features.
640
- // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
641
- // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
642
- // ...
643
- // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
644
- // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
645
- // ...
646
- rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
647
- rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
648
-
649
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
650
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
651
- // ...
652
- // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
653
- // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
654
- // ...
655
-
656
- lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
657
- lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
658
-
659
-
// add_vals: rank-1 update of the 4x4 patch. fl1/fl2 hold the four LHS row
// values (x, y, z, w of each results[i]); fr1/fr2 hold the four RHS column
// values (results[0..3]).
660
- #define add_vals(fl1, fl2, fr1, fr2)\
661
- results[0].x += fl1.x * fr1.x;\
662
- results[0].y += fl1.y * fr1.x;\
663
- results[0].z += fl2.x * fr1.x;\
664
- results[0].w += fl2.y * fr1.x;\
665
- \
666
- results[1].x += fl1.x * fr1.y;\
667
- results[1].y += fl1.y * fr1.y;\
668
- results[1].z += fl2.x * fr1.y;\
669
- results[1].w += fl2.y * fr1.y;\
670
- \
671
- results[2].x += fl1.x * fr2.x;\
672
- results[2].y += fl1.y * fr2.x;\
673
- results[2].z += fl2.x * fr2.x;\
674
- results[2].w += fl2.y * fr2.x;\
675
- \
676
- results[3].x += fl1.x * fr2.y;\
677
- results[3].y += fl1.y * fr2.y;\
678
- results[3].z += fl2.x * fr2.y;\
679
- results[3].w += fl2.y * fr2.y;\
680
-
// Barrier between the staging-buffer writes above and the reads below.
681
- __syncthreads();
682
-
683
- // Do the multiplies.
684
- #pragma unroll
685
- for (int koff = 0; koff < 16; koff ++) {
686
- // 32 x threads.
687
- float2 fl1 = lhs_shmem2[koff][threadIdx.x];
688
- float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
689
-
// The RHS row/column indices below undo the layout written into rhs_shmem2
// above (see the "Row N -> times (...)" table).
690
- int start_feature = threadIdx.y * 4;
691
- float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
692
- float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
693
-
694
- add_vals(fl1, fl2, fr1, fr2)
695
- }
// Barrier so that no thread overwrites the staging buffers in the next
// iteration while another thread is still reading them.
696
- __syncthreads();
697
- }
698
-
699
- #undef prefetch_lhs
700
- #undef add_vals
701
-
// Write-back of the 4x4 patch. Which branch is taken depends only on the two
// template flags; the guarded branches skip rows >= m_size and/or columns
// >= n_size.
702
- Index horiz_base = threadIdx.y*4+base_n;
703
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
704
- for (int i = 0; i < 4; i++) {
705
- output(lhs_vert, horiz_base + i) = results[i].x;
706
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
707
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
708
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
709
- }
710
- } else if (!CHECK_RHS_BOUNDARY) {
711
- // CHECK LHS
712
- if (lhs_vert + 3 < m_size) {
713
- for (int i = 0; i < 4; i++) {
714
- output(lhs_vert, horiz_base + i) = results[i].x;
715
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
716
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
717
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
718
- }
719
- } else if (lhs_vert + 2 < m_size) {
720
- for (int i = 0; i < 4; i++) {
721
- output(lhs_vert, horiz_base + i) = results[i].x;
722
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
723
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
724
- }
725
- } else if (lhs_vert + 1 < m_size) {
726
- for (int i = 0; i < 4; i++) {
727
- output(lhs_vert, horiz_base + i) = results[i].x;
728
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
729
- }
730
- } else if (lhs_vert < m_size) {
731
- for (int i = 0; i < 4; i++) {
732
- output(lhs_vert, horiz_base + i) = results[i].x;
733
- }
734
- }
735
- } else if (!CHECK_LHS_BOUNDARY) {
736
- // CHECK RHS
737
- /*
738
- int ncols_rem = fminf(n_size- horiz_base, 4);
739
- for (int i = 0; i < ncols_rem; i++) {
740
- output(lhs_vert, horiz_base + i) = results[i].x;
741
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
742
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
743
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
744
- }*/
745
- for (int i = 0; i < 4; i++) {
746
- if (horiz_base+i < n_size) {
747
- output(lhs_vert, horiz_base + i) = results[i].x;
748
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
749
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
750
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
751
- }
752
- }
753
- } else {
754
- // CHECK both boundaries.
755
- for (int i = 0; i < 4; i++) {
756
- if (horiz_base+i < n_size) {
757
- if (lhs_vert < m_size)
758
- output(lhs_vert, horiz_base + i) = results[i].x;
759
- if (lhs_vert + 1 < m_size)
760
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
761
- if (lhs_vert + 2 < m_size)
762
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
763
- if (lhs_vert + 3 < m_size)
764
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
765
- }
766
- }
767
- }
768
- }
769
-
770
-
771
// Device-side body of the float contraction kernel launched through
// EigenFloatContractionKernel.
//
// Each thread accumulates a 4-row x 8-column patch of the output in
// results[0..7]: rows lhs_vert .. lhs_vert+3 (the x/y/z/w components) and
// columns horiz_base .. horiz_base+7 (one float4 per column). The contraction
// dimension is walked in steps of 32.
//
// Per k iteration a thread prefetches up to four LHS float4s (columns
// threadIdx.y/4 + k + {0, 8, 16, 24}) and up to two RHS float4s (columns
// rhs_horiz0 and rhs_horiz1), stages them in lhs_shmem2 / rhs_shmem2, and
// then multiplies out of the staging buffers.
//
// Template flags:
//   CHECK_LHS_BOUNDARY - guard row accesses against m_size.
//   CHECK_RHS_BOUNDARY - guard column accesses against n_size.
// The k range is guarded against k_size in every variant.
- template<typename Index, typename LhsMapper,
772
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
773
- bool CHECK_RHS_BOUNDARY>
774
- __device__ __forceinline__ void
775
- EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
776
- const OutputMapper output, float2 lhs_shmem2[][32],
777
- float2 rhs_shmem2[][8], const Index m_size,
778
- const Index n_size, const Index k_size,
779
- const Index base_m, const Index base_n) {
780
-
781
- // prefetch registers
782
- float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
783
- float4 rhs_pf0, rhs_pf1;
784
-
// Accumulators for the 4x8 output patch, cleared before the k loop.
785
- float4 results[8];
786
- for (int i=0; i < 8; i++) {
787
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
788
- }
789
-
// First of the four output rows owned by this thread.
790
- Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
791
- for (Index k = 0; k < k_size; k += 32) {
// Reset the prefetch registers so that skipped (out-of-range) loads
// contribute zero to the products below.
792
- lhs_pf0 = internal::pset1<float4>(0);
793
- lhs_pf1 = internal::pset1<float4>(0);
794
- lhs_pf2 = internal::pset1<float4>(0);
795
- lhs_pf3 = internal::pset1<float4>(0);
796
-
797
- rhs_pf0 = internal::pset1<float4>(0);
798
- rhs_pf1 = internal::pset1<float4>(0);
799
-
// LHS prefetch. The ladder picks how many of the four k columns
// (offsets 0, 8, 16, 24) are still below k_size.
800
- if (!CHECK_LHS_BOUNDARY) {
801
- if ((threadIdx.y/4+k+24) < k_size) {
802
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
803
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
804
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
805
- lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
806
- } else if ((threadIdx.y/4+k+16) < k_size) {
807
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
808
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
809
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
810
- } else if ((threadIdx.y/4+k+8) < k_size) {
811
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
812
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
813
- } else if ((threadIdx.y/4+k) < k_size) {
814
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
815
- }
816
- } else {
// Outer ladder: how many of the rows lhs_vert..lhs_vert+3 are below m_size
// (4 -> packet loads, 3/2/1 -> scalar loads). Inner ladder: same k-column
// selection as in the unchecked branch.
817
- // just CHECK_LHS_BOUNDARY
818
- if (lhs_vert + 3 < m_size) {
819
- if ((threadIdx.y/4+k+24) < k_size) {
820
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
821
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
822
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
823
- lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
824
- } else if ((threadIdx.y/4+k+16) < k_size) {
825
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
826
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
827
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
828
- } else if ((threadIdx.y/4+k+8) < k_size) {
829
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
830
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
831
- } else if ((threadIdx.y/4+k) < k_size) {
832
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
833
- }
834
- } else if (lhs_vert + 2 < m_size) {
835
- if ((threadIdx.y/4+k+24) < k_size) {
836
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
837
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
838
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
839
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
840
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
841
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
842
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
843
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
844
- lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
845
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
846
- lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
847
- lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
848
- } else if ((threadIdx.y/4+k+16) < k_size) {
849
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
850
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
851
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
852
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
853
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
854
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
855
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
856
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
857
- lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
858
- } else if ((threadIdx.y/4+k+8) < k_size) {
859
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
860
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
861
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
862
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
863
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
864
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
865
- } else if ((threadIdx.y/4+k) < k_size) {
866
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
867
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
868
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
869
- }
870
- } else if (lhs_vert + 1 < m_size) {
871
- if ((threadIdx.y/4+k+24) < k_size) {
872
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
873
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
874
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
875
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
876
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
877
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
878
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
879
- lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
880
- } else if ((threadIdx.y/4+k+16) < k_size) {
881
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
882
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
883
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
884
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
885
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
886
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
887
- } else if ((threadIdx.y/4+k+8) < k_size) {
888
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
889
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
890
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
891
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
892
- } else if ((threadIdx.y/4+k) < k_size) {
893
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
894
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
895
- }
896
- } else if (lhs_vert < m_size) {
897
- if ((threadIdx.y/4+k+24) < k_size) {
898
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
899
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
900
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
901
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
902
- } else if ((threadIdx.y/4+k+16) < k_size) {
903
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
904
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
905
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
906
- } else if ((threadIdx.y/4+k+8) < k_size) {
907
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
908
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
909
- } else if ((threadIdx.y/4+k) < k_size) {
910
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
911
- }
912
- }
913
- }
// NOTE(review): no staging-buffer access happens between the top of the
// iteration and this barrier; its purpose is not evident from this
// function alone - confirm before changing it.
914
- __syncthreads();
// RHS element range fetched by this thread: rows rhs_vert..rhs_vert+3 of the
// two adjacent columns rhs_horiz0 and rhs_horiz1.
915
- Index rhs_vert = k+threadIdx.x*4;
916
- Index rhs_horiz0 = threadIdx.y*2+base_n;
917
- Index rhs_horiz1 = threadIdx.y*2+1+base_n;
918
- if (!CHECK_RHS_BOUNDARY) {
919
- if ((rhs_vert + 3) < k_size) {
920
- // no n_size guard in this variant; only the k range is checked
921
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
922
- rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
923
- } else if (rhs_vert + 2 < k_size) {
924
- // no n_size guard in this variant; only the k range is checked
925
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
926
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
927
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
928
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
929
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
930
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
931
- } else if (rhs_vert + 1 < k_size) {
932
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
933
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
934
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
935
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
936
- } else if (rhs_vert < k_size) {
937
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
938
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
939
- }
940
- } else {
// Both columns in range -> load both; only rhs_horiz0 in range -> load
// rhs_pf0 alone; neither -> both registers stay zero.
941
- if (rhs_horiz1 < n_size) {
942
- if ((rhs_vert + 3) < k_size) {
943
- // just CHECK_RHS_BOUNDARY
944
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
945
- rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
946
- } else if (rhs_vert + 2 < k_size) {
947
- // just CHECK_RHS_BOUNDARY
948
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
949
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
950
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
951
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
952
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
953
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
// k+threadIdx.x*4 in the next two conditions is rhs_vert spelled out.
954
- } else if (k+threadIdx.x*4 + 1 < k_size) {
955
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
956
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
957
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
958
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
959
- } else if (k+threadIdx.x*4 < k_size) {
960
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
961
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
962
- }
963
- } else if (rhs_horiz0 < n_size) {
964
- if ((rhs_vert + 3) < k_size) {
965
- // just CHECK_RHS_BOUNDARY
966
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
967
- } else if ((rhs_vert + 2) < k_size) {
968
- // just CHECK_RHS_BOUNDARY
969
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
970
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
971
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
972
- } else if ((rhs_vert + 1) < k_size) {
973
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
974
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
975
- } else if (rhs_vert < k_size) {
976
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
977
- }
978
- }
979
- }
980
- __syncthreads();
981
- // Loaded. Do computation
982
- // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
983
- // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
984
- // ..
985
- // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
986
- rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
987
- // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
988
- // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
989
- // ..
990
- rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
991
- // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
992
- // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
993
- rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
994
- // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
995
- // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
996
- rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
997
-
998
- // LHS.
999
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
1000
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
1001
- // ...
1002
- // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
1003
- // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
1004
-
1005
-
// add_vals: rank-1 update of the 4x8 patch. a_feat1/a_feat2 hold the four LHS
// row values (x, y, z, w of each results[i]); f1..f4 hold the eight RHS
// column values (results[0..7]).
1006
- #define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
1007
- results[0].x += a_feat1.x * f1.x;\
1008
- results[1].x += a_feat1.x * f1.y;\
1009
- results[2].x += a_feat1.x * f2.x;\
1010
- results[3].x += a_feat1.x * f2.y;\
1011
- results[4].x += a_feat1.x * f3.x;\
1012
- results[5].x += a_feat1.x * f3.y;\
1013
- results[6].x += a_feat1.x * f4.x;\
1014
- results[7].x += a_feat1.x * f4.y;\
1015
- \
1016
- results[0].y += a_feat1.y * f1.x;\
1017
- results[1].y += a_feat1.y * f1.y;\
1018
- results[2].y += a_feat1.y * f2.x;\
1019
- results[3].y += a_feat1.y * f2.y;\
1020
- results[4].y += a_feat1.y * f3.x;\
1021
- results[5].y += a_feat1.y * f3.y;\
1022
- results[6].y += a_feat1.y * f4.x;\
1023
- results[7].y += a_feat1.y * f4.y;\
1024
- \
1025
- results[0].z += a_feat2.x * f1.x;\
1026
- results[1].z += a_feat2.x * f1.y;\
1027
- results[2].z += a_feat2.x * f2.x;\
1028
- results[3].z += a_feat2.x * f2.y;\
1029
- results[4].z += a_feat2.x * f3.x;\
1030
- results[5].z += a_feat2.x * f3.y;\
1031
- results[6].z += a_feat2.x * f4.x;\
1032
- results[7].z += a_feat2.x * f4.y;\
1033
- \
1034
- results[0].w += a_feat2.y * f1.x;\
1035
- results[1].w += a_feat2.y * f1.y;\
1036
- results[2].w += a_feat2.y * f2.x;\
1037
- results[3].w += a_feat2.y * f2.y;\
1038
- results[4].w += a_feat2.y * f3.x;\
1039
- results[5].w += a_feat2.y * f3.y;\
1040
- results[6].w += a_feat2.y * f4.x;\
1041
- results[7].w += a_feat2.y * f4.y;\
1042
-
// Stage the LHS prefetch registers: rows 0..31 of lhs_shmem2 take the x/y
// halves, rows 32..63 the z/w halves, matching the koff / koff + 32 reads in
// the multiply loop.
1043
- lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
1044
- lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
1045
- lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
1046
- lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
1047
-
1048
- lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
1049
- lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
1050
- lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
1051
- lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
1052
-
// Barrier between the staging-buffer writes above and the reads below.
1053
- __syncthreads();
1054
-
1055
- // Do the multiplies.
1056
- #pragma unroll
1057
- for (int koff = 0; koff < 32; koff ++) {
1058
- float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
1059
- float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
1060
-
1061
- // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
1062
- int start_feature = (threadIdx.y / 4) * 8;
1063
-
1064
- float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
1065
- float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
1066
- float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
1067
- float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
1068
-
1069
- add_vals(a3, a4, br1, br2, br3, br4)
1070
- }
// Barrier so that no thread overwrites the staging buffers in the next
// iteration while another thread is still reading them.
1071
- __syncthreads();
1072
- } // end loop over k
1073
-
// NOTE(review): the loop body already ends with a barrier, so this one looks
// redundant whenever the loop ran at least once - confirm before removing.
1074
- __syncthreads();
// Write-back of the 4x8 patch. Which branch is taken depends only on the two
// template flags; the guarded branches skip rows >= m_size and/or columns
// >= n_size.
1075
- Index horiz_base = (threadIdx.y/4)*8+base_n;
1076
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
1077
- for (int i = 0; i < 8; i++) {
1078
- output(lhs_vert, horiz_base + i) = results[i].x;
1079
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1080
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1081
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1082
- }
1083
- } else if (!CHECK_RHS_BOUNDARY) {
1084
- if (lhs_vert + 3 < m_size) {
1085
- for (int i = 0; i < 8; i++) {
1086
- output(lhs_vert, horiz_base + i) = results[i].x;
1087
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1088
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1089
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1090
- }
1091
- } else if (lhs_vert + 2 < m_size) {
1092
- for (int i = 0; i < 8; i++) {
1093
- output(lhs_vert, horiz_base + i) = results[i].x;
1094
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1095
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1096
- }
1097
- } else if (lhs_vert + 1 < m_size) {
1098
- for (int i = 0; i < 8; i++) {
1099
- output(lhs_vert, horiz_base + i) = results[i].x;
1100
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1101
- }
1102
- } else if (lhs_vert < m_size) {
1103
- for (int i = 0; i < 8; i++) {
1104
- output(lhs_vert, horiz_base + i) = results[i].x;
1105
- }
1106
- }
1107
- } else if (!CHECK_LHS_BOUNDARY) {
1108
- // CHECK BOUNDARY_B
1109
- for (int i = 0; i < 8; i++) {
1110
- if (horiz_base + i < n_size) {
1111
- output(lhs_vert, horiz_base + i) = results[i].x;
1112
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1113
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1114
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1115
- }
1116
- }
1117
- } else {
1118
- // CHECK both boundaries.
1119
- for (int i = 0; i < 8; i++) {
1120
- if (horiz_base + i < n_size) {
1121
- if (lhs_vert < m_size)
1122
- output(lhs_vert, horiz_base + i) = results[i].x;
1123
- if (lhs_vert + 1 < m_size)
1124
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
1125
- if (lhs_vert + 2 < m_size)
1126
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
1127
- if (lhs_vert + 3 < m_size)
1128
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
1129
- }
1130
- }
1131
- }
1132
- }
1133
-
1134
-
1135
// Float contraction kernel entry point.
//
// blockIdx.x selects a 128-row band of the output and blockIdx.y a 64-column
// band. The block's two flat __shared__ buffers are viewed as 2-D arrays and
// handed to EigenFloatContractionKernelInternal, instantiated with
// <CHECK_LHS_BOUNDARY, CHECK_RHS_BOUNDARY> chosen from whether the band
// reaches past m_size / n_size.
template <typename Index, typename LhsMapper,
          typename RhsMapper, typename OutputMapper>
__global__ void
#if defined(EIGEN_HIPCC)
__launch_bounds__(256, 1)
#else
__launch_bounds__(256)
#endif
EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
                            const OutputMapper output,
                            const Index m_size, const Index n_size, const Index k_size) {
  __shared__ float2 lhs_shmem[64*32];
  __shared__ float2 rhs_shmem[128*8];

  // 2-D views over the flat staging buffers, in the shape the internal
  // kernel's array parameters expect.
  typedef float2 LHS_MEM[64][32];
  typedef float2 RHS_MEM[128][8];
  LHS_MEM& lhs_tile = *reinterpret_cast<LHS_MEM*>(lhs_shmem);
  RHS_MEM& rhs_tile = *reinterpret_cast<RHS_MEM*>(rhs_shmem);

  // Origin of this block's band in the output.
  const Index base_m = 128 * static_cast<Index>(blockIdx.x);
  const Index base_n = 64 * static_cast<Index>(blockIdx.y);

  // A band is "on the edge" when its last row/column is not inside the output.
  const bool lhs_on_edge = !(base_m + 127 < m_size);
  const bool rhs_on_edge = !(base_n + 63 < n_size);

  if (lhs_on_edge && rhs_on_edge) {
    EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
        lhs, rhs, output, lhs_tile, rhs_tile, m_size, n_size, k_size, base_m, base_n);
  } else if (lhs_on_edge) {
    EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
        lhs, rhs, output, lhs_tile, rhs_tile, m_size, n_size, k_size, base_m, base_n);
  } else if (rhs_on_edge) {
    // >= 128 rows left
    EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
        lhs, rhs, output, lhs_tile, rhs_tile, m_size, n_size, k_size, base_m, base_n);
  } else {
    // >= 128 rows left and >= 64 columns left
    EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
        lhs, rhs, output, lhs_tile, rhs_tile, m_size, n_size, k_size, base_m, base_n);
  }
}
1181
-
1182
// Float contraction kernel entry point for the 16x16 variant.
//
// blockIdx.x / blockIdx.y select a 64x64 tile of the output. The block's
// __shared__ staging arrays are handed to
// EigenFloatContractionKernelInternal16x16, instantiated with
// <CHECK_LHS_BOUNDARY, CHECK_RHS_BOUNDARY> chosen from whether the tile
// reaches past m_size / n_size.
template <typename Index, typename LhsMapper,
          typename RhsMapper, typename OutputMapper>
__global__ void
#if defined(EIGEN_HIPCC)
__launch_bounds__(256, 1)
#else
__launch_bounds__(256)
#endif
EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
                                 const OutputMapper output,
                                 const Index m_size, const Index n_size, const Index k_size) {
  __shared__ float2 lhs_shmem[32][16];
  __shared__ float2 rhs_shmem[64][8];

  // Origin of this block's tile in the output.
  const Index base_m = 64 * static_cast<Index>(blockIdx.x);
  const Index base_n = 64 * static_cast<Index>(blockIdx.y);

  // A full tile has its last row/column (origin + 63) inside the output.
  const bool rows_full = base_m + 63 < m_size;
  const bool cols_full = base_n + 63 < n_size;

  if (rows_full && cols_full) {
    EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
        lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
  } else if (rows_full) {
    EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
        lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
  } else if (cols_full) {
    EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
        lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
  } else {
    EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
        lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
  }
}
1216
-
1217
-
1218
- template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
1219
- struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
1220
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
1221
-
1222
- typedef GpuDevice Device;
1223
-
1224
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
1225
- typedef TensorContractionEvaluatorBase<Self> Base;
1226
-
1227
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
1228
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
1229
- typedef typename XprType::Index Index;
1230
- typedef typename XprType::CoeffReturnType CoeffReturnType;
1231
- typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
1232
-
1233
- enum {
1234
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
1235
- };
1236
-
1237
- // Most of the code is assuming that both input tensors are ColMajor. If the
1238
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
1239
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
1240
- // will pretend B is LHS and A is RHS.
1241
- typedef typename internal::conditional<
1242
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
1243
- typedef typename internal::conditional<
1244
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
1245
-
1246
- static const int LDims =
1247
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
1248
- static const int RDims =
1249
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
1250
- static const int ContractDims = internal::array_size<Indices>::value;
1251
-
1252
- typedef array<Index, LDims> left_dim_mapper_t;
1253
- typedef array<Index, RDims> right_dim_mapper_t;
1254
-
1255
- typedef array<Index, ContractDims> contract_t;
1256
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
1257
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
1258
-
1259
- static const int NumDims = LDims + RDims - 2 * ContractDims;
1260
-
1261
- typedef DSizes<Index, NumDims> Dimensions;
1262
-
1263
- // typedefs needed in evalTo
1264
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
1265
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
1266
-
1267
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
1268
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
1269
-
1270
- typedef typename LeftEvaluator::Dimensions LeftDimensions;
1271
- typedef typename RightEvaluator::Dimensions RightDimensions;
1272
-
1273
- TensorEvaluator(const XprType& op, const Device& device) :
1274
- Base(op, device)
1275
- {
1276
- EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
1277
- GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
1278
- }
1279
-
1280
- // We need to redefine this method to make nvcc happy
1281
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
1282
- this->m_leftImpl.evalSubExprsIfNeeded(NULL);
1283
- this->m_rightImpl.evalSubExprsIfNeeded(NULL);
1284
- if (data) {
1285
- evalTo(data);
1286
- return false;
1287
- } else {
1288
- this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
1289
- evalTo(this->m_result);
1290
- return true;
1291
- }
1292
- }
1293
-
1294
- void evalTo(Scalar* buffer) const {
1295
- if (this->m_lhs_inner_dim_contiguous) {
1296
- if (this->m_rhs_inner_dim_contiguous) {
1297
- if (this->m_rhs_inner_dim_reordered) {
1298
- evalTyped<true, true, true, Unaligned>(buffer);
1299
- }
1300
- else {
1301
- evalTyped<true, true, false, Unaligned>(buffer);
1302
- }
1303
- }
1304
- else {
1305
- if (this->m_rhs_inner_dim_reordered) {
1306
- evalTyped<true, false, true, Unaligned>(buffer);
1307
- }
1308
- else {
1309
- evalTyped<true, false, false, Unaligned>(buffer);
1310
- }
1311
- }
1312
- }
1313
- else {
1314
- if (this->m_rhs_inner_dim_contiguous) {
1315
- if (this->m_rhs_inner_dim_reordered) {
1316
- evalTyped<false, true, true, Unaligned>(buffer);
1317
- }
1318
- else {
1319
- evalTyped<false, true, false, Unaligned>(buffer);
1320
- }
1321
- }
1322
- else {
1323
- if (this->m_rhs_inner_dim_reordered) {
1324
- evalTyped<false, false, true, Unaligned>(buffer);
1325
- }
1326
- else {
1327
- evalTyped<false, false, false, Unaligned>(buffer);
1328
- }
1329
- }
1330
- }
1331
- }
1332
-
1333
- template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
1334
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
1335
- const Index m_blocks = (m + 63) / 64;
1336
- const Index n_blocks = (n + 63) / 64;
1337
- const dim3 num_blocks(m_blocks, n_blocks, 1);
1338
- const dim3 block_size(8, 8, 8);
1339
- LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
1340
- }
1341
- };
1342
-
1343
- template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
1344
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
1345
- if (m < 768 || n < 768) {
1346
- const Index m_blocks = (m + 63) / 64;
1347
- const Index n_blocks = (n + 63) / 64;
1348
- const dim3 num_blocks(m_blocks, n_blocks, 1);
1349
- const dim3 block_size(16, 16, 1);
1350
- LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
1351
- } else {
1352
- const Index m_blocks = (m + 127) / 128;
1353
- const Index n_blocks = (n + 63) / 64;
1354
- const dim3 num_blocks(m_blocks, n_blocks, 1);
1355
- const dim3 block_size(8, 32, 1);
1356
- LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
1357
- }
1358
- }
1359
- };
1360
-
1361
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
1362
- void evalTyped(Scalar* buffer) const {
1363
- // columns in left side, rows in right side
1364
- const Index k = this->m_k_size;
1365
- EIGEN_UNUSED_VARIABLE(k)
1366
-
1367
- // rows in left side
1368
- const Index m = this->m_i_size;
1369
-
1370
- // columns in right side
1371
- const Index n = this->m_j_size;
1372
-
1373
- // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
1374
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
1375
-
1376
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
1377
- LeftEvaluator, left_nocontract_t,
1378
- contract_t, 4,
1379
- lhs_inner_dim_contiguous,
1380
- false, Unaligned> LhsMapper;
1381
-
1382
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
1383
- RightEvaluator, right_nocontract_t,
1384
- contract_t, 4,
1385
- rhs_inner_dim_contiguous,
1386
- rhs_inner_dim_reordered, Unaligned> RhsMapper;
1387
-
1388
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
1389
-
1390
-
1391
- // initialize data mappers
1392
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
1393
- this->m_left_contracting_strides, this->m_k_strides);
1394
-
1395
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
1396
- this->m_right_contracting_strides, this->m_k_strides);
1397
-
1398
- OutputMapper output(buffer, m);
1399
-
1400
- #if defined(EIGEN_USE_HIP)
1401
- setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
1402
- #else
1403
- setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
1404
- #endif
1405
-
1406
- LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
1407
- }
1408
- };
1409
-
1410
- } // end namespace Eigen
1411
-
1412
- #endif // EIGEN_USE_GPU and EIGEN_GPUCC
1413
- #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H