pyvale 2026.1.1__cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (860)
  1. include/eigen3/Eigen/Cholesky +45 -0
  2. include/eigen3/Eigen/CholmodSupport +48 -0
  3. include/eigen3/Eigen/Core +384 -0
  4. include/eigen3/Eigen/Dense +7 -0
  5. include/eigen3/Eigen/Eigen +2 -0
  6. include/eigen3/Eigen/Eigenvalues +60 -0
  7. include/eigen3/Eigen/Geometry +59 -0
  8. include/eigen3/Eigen/Householder +29 -0
  9. include/eigen3/Eigen/IterativeLinearSolvers +48 -0
  10. include/eigen3/Eigen/Jacobi +32 -0
  11. include/eigen3/Eigen/KLUSupport +41 -0
  12. include/eigen3/Eigen/LU +47 -0
  13. include/eigen3/Eigen/MetisSupport +35 -0
  14. include/eigen3/Eigen/OrderingMethods +70 -0
  15. include/eigen3/Eigen/PaStiXSupport +49 -0
  16. include/eigen3/Eigen/PardisoSupport +35 -0
  17. include/eigen3/Eigen/QR +50 -0
  18. include/eigen3/Eigen/QtAlignedMalloc +39 -0
  19. include/eigen3/Eigen/SPQRSupport +34 -0
  20. include/eigen3/Eigen/SVD +50 -0
  21. include/eigen3/Eigen/Sparse +34 -0
  22. include/eigen3/Eigen/SparseCholesky +37 -0
  23. include/eigen3/Eigen/SparseCore +69 -0
  24. include/eigen3/Eigen/SparseLU +50 -0
  25. include/eigen3/Eigen/SparseQR +36 -0
  26. include/eigen3/Eigen/StdDeque +27 -0
  27. include/eigen3/Eigen/StdList +26 -0
  28. include/eigen3/Eigen/StdVector +27 -0
  29. include/eigen3/Eigen/SuperLUSupport +64 -0
  30. include/eigen3/Eigen/UmfPackSupport +40 -0
  31. include/eigen3/Eigen/src/Cholesky/LDLT.h +688 -0
  32. include/eigen3/Eigen/src/Cholesky/LLT.h +558 -0
  33. include/eigen3/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
  34. include/eigen3/Eigen/src/CholmodSupport/CholmodSupport.h +682 -0
  35. include/eigen3/Eigen/src/Core/ArithmeticSequence.h +413 -0
  36. include/eigen3/Eigen/src/Core/Array.h +417 -0
  37. include/eigen3/Eigen/src/Core/ArrayBase.h +226 -0
  38. include/eigen3/Eigen/src/Core/ArrayWrapper.h +209 -0
  39. include/eigen3/Eigen/src/Core/Assign.h +90 -0
  40. include/eigen3/Eigen/src/Core/AssignEvaluator.h +1010 -0
  41. include/eigen3/Eigen/src/Core/Assign_MKL.h +178 -0
  42. include/eigen3/Eigen/src/Core/BandMatrix.h +353 -0
  43. include/eigen3/Eigen/src/Core/Block.h +448 -0
  44. include/eigen3/Eigen/src/Core/BooleanRedux.h +162 -0
  45. include/eigen3/Eigen/src/Core/CommaInitializer.h +164 -0
  46. include/eigen3/Eigen/src/Core/ConditionEstimator.h +175 -0
  47. include/eigen3/Eigen/src/Core/CoreEvaluators.h +1741 -0
  48. include/eigen3/Eigen/src/Core/CoreIterators.h +132 -0
  49. include/eigen3/Eigen/src/Core/CwiseBinaryOp.h +183 -0
  50. include/eigen3/Eigen/src/Core/CwiseNullaryOp.h +1001 -0
  51. include/eigen3/Eigen/src/Core/CwiseTernaryOp.h +197 -0
  52. include/eigen3/Eigen/src/Core/CwiseUnaryOp.h +103 -0
  53. include/eigen3/Eigen/src/Core/CwiseUnaryView.h +132 -0
  54. include/eigen3/Eigen/src/Core/DenseBase.h +701 -0
  55. include/eigen3/Eigen/src/Core/DenseCoeffsBase.h +685 -0
  56. include/eigen3/Eigen/src/Core/DenseStorage.h +652 -0
  57. include/eigen3/Eigen/src/Core/Diagonal.h +258 -0
  58. include/eigen3/Eigen/src/Core/DiagonalMatrix.h +391 -0
  59. include/eigen3/Eigen/src/Core/DiagonalProduct.h +28 -0
  60. include/eigen3/Eigen/src/Core/Dot.h +318 -0
  61. include/eigen3/Eigen/src/Core/EigenBase.h +160 -0
  62. include/eigen3/Eigen/src/Core/ForceAlignedAccess.h +150 -0
  63. include/eigen3/Eigen/src/Core/Fuzzy.h +155 -0
  64. include/eigen3/Eigen/src/Core/GeneralProduct.h +465 -0
  65. include/eigen3/Eigen/src/Core/GenericPacketMath.h +1040 -0
  66. include/eigen3/Eigen/src/Core/GlobalFunctions.h +194 -0
  67. include/eigen3/Eigen/src/Core/IO.h +258 -0
  68. include/eigen3/Eigen/src/Core/IndexedView.h +237 -0
  69. include/eigen3/Eigen/src/Core/Inverse.h +117 -0
  70. include/eigen3/Eigen/src/Core/Map.h +171 -0
  71. include/eigen3/Eigen/src/Core/MapBase.h +310 -0
  72. include/eigen3/Eigen/src/Core/MathFunctions.h +2057 -0
  73. include/eigen3/Eigen/src/Core/MathFunctionsImpl.h +200 -0
  74. include/eigen3/Eigen/src/Core/Matrix.h +565 -0
  75. include/eigen3/Eigen/src/Core/MatrixBase.h +547 -0
  76. include/eigen3/Eigen/src/Core/NestByValue.h +85 -0
  77. include/eigen3/Eigen/src/Core/NoAlias.h +109 -0
  78. include/eigen3/Eigen/src/Core/NumTraits.h +335 -0
  79. include/eigen3/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  80. include/eigen3/Eigen/src/Core/PermutationMatrix.h +605 -0
  81. include/eigen3/Eigen/src/Core/PlainObjectBase.h +1128 -0
  82. include/eigen3/Eigen/src/Core/Product.h +191 -0
  83. include/eigen3/Eigen/src/Core/ProductEvaluators.h +1179 -0
  84. include/eigen3/Eigen/src/Core/Random.h +218 -0
  85. include/eigen3/Eigen/src/Core/Redux.h +515 -0
  86. include/eigen3/Eigen/src/Core/Ref.h +381 -0
  87. include/eigen3/Eigen/src/Core/Replicate.h +142 -0
  88. include/eigen3/Eigen/src/Core/Reshaped.h +454 -0
  89. include/eigen3/Eigen/src/Core/ReturnByValue.h +119 -0
  90. include/eigen3/Eigen/src/Core/Reverse.h +217 -0
  91. include/eigen3/Eigen/src/Core/Select.h +164 -0
  92. include/eigen3/Eigen/src/Core/SelfAdjointView.h +365 -0
  93. include/eigen3/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
  94. include/eigen3/Eigen/src/Core/Solve.h +188 -0
  95. include/eigen3/Eigen/src/Core/SolveTriangular.h +235 -0
  96. include/eigen3/Eigen/src/Core/SolverBase.h +168 -0
  97. include/eigen3/Eigen/src/Core/StableNorm.h +251 -0
  98. include/eigen3/Eigen/src/Core/StlIterators.h +463 -0
  99. include/eigen3/Eigen/src/Core/Stride.h +116 -0
  100. include/eigen3/Eigen/src/Core/Swap.h +68 -0
  101. include/eigen3/Eigen/src/Core/Transpose.h +464 -0
  102. include/eigen3/Eigen/src/Core/Transpositions.h +386 -0
  103. include/eigen3/Eigen/src/Core/TriangularMatrix.h +1001 -0
  104. include/eigen3/Eigen/src/Core/VectorBlock.h +96 -0
  105. include/eigen3/Eigen/src/Core/VectorwiseOp.h +784 -0
  106. include/eigen3/Eigen/src/Core/Visitor.h +381 -0
  107. include/eigen3/Eigen/src/Core/arch/AVX/Complex.h +372 -0
  108. include/eigen3/Eigen/src/Core/arch/AVX/MathFunctions.h +228 -0
  109. include/eigen3/Eigen/src/Core/arch/AVX/PacketMath.h +1574 -0
  110. include/eigen3/Eigen/src/Core/arch/AVX/TypeCasting.h +115 -0
  111. include/eigen3/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  112. include/eigen3/Eigen/src/Core/arch/AVX512/MathFunctions.h +362 -0
  113. include/eigen3/Eigen/src/Core/arch/AVX512/PacketMath.h +2303 -0
  114. include/eigen3/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  115. include/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h +417 -0
  116. include/eigen3/Eigen/src/Core/arch/AltiVec/MathFunctions.h +90 -0
  117. include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  118. include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  119. include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  120. include/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h +2711 -0
  121. include/eigen3/Eigen/src/Core/arch/CUDA/Complex.h +258 -0
  122. include/eigen3/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  123. include/eigen3/Eigen/src/Core/arch/Default/ConjHelper.h +117 -0
  124. include/eigen3/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  125. include/eigen3/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  126. include/eigen3/Eigen/src/Core/arch/Default/Half.h +942 -0
  127. include/eigen3/Eigen/src/Core/arch/Default/Settings.h +49 -0
  128. include/eigen3/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  129. include/eigen3/Eigen/src/Core/arch/GPU/MathFunctions.h +103 -0
  130. include/eigen3/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  131. include/eigen3/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  132. include/eigen3/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  133. include/eigen3/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  134. include/eigen3/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  135. include/eigen3/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  136. include/eigen3/Eigen/src/Core/arch/NEON/Complex.h +584 -0
  137. include/eigen3/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  138. include/eigen3/Eigen/src/Core/arch/NEON/MathFunctions.h +75 -0
  139. include/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h +4587 -0
  140. include/eigen3/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  141. include/eigen3/Eigen/src/Core/arch/SSE/Complex.h +351 -0
  142. include/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h +199 -0
  143. include/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h +1505 -0
  144. include/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h +142 -0
  145. include/eigen3/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  146. include/eigen3/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  147. include/eigen3/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  148. include/eigen3/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  149. include/eigen3/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  150. include/eigen3/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  151. include/eigen3/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  152. include/eigen3/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  153. include/eigen3/Eigen/src/Core/arch/ZVector/Complex.h +426 -0
  154. include/eigen3/Eigen/src/Core/arch/ZVector/MathFunctions.h +233 -0
  155. include/eigen3/Eigen/src/Core/arch/ZVector/PacketMath.h +1060 -0
  156. include/eigen3/Eigen/src/Core/functors/AssignmentFunctors.h +177 -0
  157. include/eigen3/Eigen/src/Core/functors/BinaryFunctors.h +541 -0
  158. include/eigen3/Eigen/src/Core/functors/NullaryFunctors.h +189 -0
  159. include/eigen3/Eigen/src/Core/functors/StlFunctors.h +166 -0
  160. include/eigen3/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
  161. include/eigen3/Eigen/src/Core/functors/UnaryFunctors.h +1131 -0
  162. include/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2645 -0
  163. include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h +517 -0
  164. include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +317 -0
  165. include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
  166. include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +124 -0
  167. include/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h +518 -0
  168. include/eigen3/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
  169. include/eigen3/Eigen/src/Core/products/Parallelizer.h +180 -0
  170. include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +544 -0
  171. include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +295 -0
  172. include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h +262 -0
  173. include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
  174. include/eigen3/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
  175. include/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h +94 -0
  176. include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h +472 -0
  177. include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +317 -0
  178. include/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
  179. include/eigen3/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
  180. include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h +337 -0
  181. include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +167 -0
  182. include/eigen3/Eigen/src/Core/products/TriangularSolverVector.h +148 -0
  183. include/eigen3/Eigen/src/Core/util/BlasUtil.h +583 -0
  184. include/eigen3/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  185. include/eigen3/Eigen/src/Core/util/Constants.h +563 -0
  186. include/eigen3/Eigen/src/Core/util/DisableStupidWarnings.h +106 -0
  187. include/eigen3/Eigen/src/Core/util/ForwardDeclarations.h +322 -0
  188. include/eigen3/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  189. include/eigen3/Eigen/src/Core/util/IntegralConstant.h +272 -0
  190. include/eigen3/Eigen/src/Core/util/MKL_support.h +137 -0
  191. include/eigen3/Eigen/src/Core/util/Macros.h +1464 -0
  192. include/eigen3/Eigen/src/Core/util/Memory.h +1163 -0
  193. include/eigen3/Eigen/src/Core/util/Meta.h +812 -0
  194. include/eigen3/Eigen/src/Core/util/NonMPL2.h +3 -0
  195. include/eigen3/Eigen/src/Core/util/ReenableStupidWarnings.h +31 -0
  196. include/eigen3/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  197. include/eigen3/Eigen/src/Core/util/StaticAssert.h +221 -0
  198. include/eigen3/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  199. include/eigen3/Eigen/src/Core/util/XprHelper.h +856 -0
  200. include/eigen3/Eigen/src/Eigenvalues/ComplexEigenSolver.h +346 -0
  201. include/eigen3/Eigen/src/Eigenvalues/ComplexSchur.h +462 -0
  202. include/eigen3/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
  203. include/eigen3/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
  204. include/eigen3/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +418 -0
  205. include/eigen3/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
  206. include/eigen3/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
  207. include/eigen3/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
  208. include/eigen3/Eigen/src/Eigenvalues/RealQZ.h +657 -0
  209. include/eigen3/Eigen/src/Eigenvalues/RealSchur.h +558 -0
  210. include/eigen3/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
  211. include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +904 -0
  212. include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
  213. include/eigen3/Eigen/src/Eigenvalues/Tridiagonalization.h +561 -0
  214. include/eigen3/Eigen/src/Geometry/AlignedBox.h +486 -0
  215. include/eigen3/Eigen/src/Geometry/AngleAxis.h +247 -0
  216. include/eigen3/Eigen/src/Geometry/EulerAngles.h +114 -0
  217. include/eigen3/Eigen/src/Geometry/Homogeneous.h +501 -0
  218. include/eigen3/Eigen/src/Geometry/Hyperplane.h +282 -0
  219. include/eigen3/Eigen/src/Geometry/OrthoMethods.h +235 -0
  220. include/eigen3/Eigen/src/Geometry/ParametrizedLine.h +232 -0
  221. include/eigen3/Eigen/src/Geometry/Quaternion.h +870 -0
  222. include/eigen3/Eigen/src/Geometry/Rotation2D.h +199 -0
  223. include/eigen3/Eigen/src/Geometry/RotationBase.h +206 -0
  224. include/eigen3/Eigen/src/Geometry/Scaling.h +188 -0
  225. include/eigen3/Eigen/src/Geometry/Transform.h +1563 -0
  226. include/eigen3/Eigen/src/Geometry/Translation.h +202 -0
  227. include/eigen3/Eigen/src/Geometry/Umeyama.h +166 -0
  228. include/eigen3/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  229. include/eigen3/Eigen/src/Householder/BlockHouseholder.h +110 -0
  230. include/eigen3/Eigen/src/Householder/Householder.h +176 -0
  231. include/eigen3/Eigen/src/Householder/HouseholderSequence.h +545 -0
  232. include/eigen3/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
  233. include/eigen3/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +212 -0
  234. include/eigen3/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +229 -0
  235. include/eigen3/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +394 -0
  236. include/eigen3/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +453 -0
  237. include/eigen3/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +444 -0
  238. include/eigen3/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +198 -0
  239. include/eigen3/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +117 -0
  240. include/eigen3/Eigen/src/Jacobi/Jacobi.h +483 -0
  241. include/eigen3/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  242. include/eigen3/Eigen/src/LU/Determinant.h +117 -0
  243. include/eigen3/Eigen/src/LU/FullPivLU.h +877 -0
  244. include/eigen3/Eigen/src/LU/InverseImpl.h +432 -0
  245. include/eigen3/Eigen/src/LU/PartialPivLU.h +624 -0
  246. include/eigen3/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
  247. include/eigen3/Eigen/src/LU/arch/InverseSize4.h +351 -0
  248. include/eigen3/Eigen/src/MetisSupport/MetisSupport.h +137 -0
  249. include/eigen3/Eigen/src/OrderingMethods/Amd.h +435 -0
  250. include/eigen3/Eigen/src/OrderingMethods/Eigen_Colamd.h +1863 -0
  251. include/eigen3/Eigen/src/OrderingMethods/Ordering.h +153 -0
  252. include/eigen3/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
  253. include/eigen3/Eigen/src/PardisoSupport/PardisoSupport.h +545 -0
  254. include/eigen3/Eigen/src/QR/ColPivHouseholderQR.h +674 -0
  255. include/eigen3/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
  256. include/eigen3/Eigen/src/QR/CompleteOrthogonalDecomposition.h +635 -0
  257. include/eigen3/Eigen/src/QR/FullPivHouseholderQR.h +713 -0
  258. include/eigen3/Eigen/src/QR/HouseholderQR.h +434 -0
  259. include/eigen3/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
  260. include/eigen3/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +335 -0
  261. include/eigen3/Eigen/src/SVD/BDCSVD.h +1366 -0
  262. include/eigen3/Eigen/src/SVD/JacobiSVD.h +812 -0
  263. include/eigen3/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
  264. include/eigen3/Eigen/src/SVD/SVDBase.h +376 -0
  265. include/eigen3/Eigen/src/SVD/UpperBidiagonalization.h +414 -0
  266. include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky.h +697 -0
  267. include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +174 -0
  268. include/eigen3/Eigen/src/SparseCore/AmbiVector.h +378 -0
  269. include/eigen3/Eigen/src/SparseCore/CompressedStorage.h +274 -0
  270. include/eigen3/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
  271. include/eigen3/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
  272. include/eigen3/Eigen/src/SparseCore/SparseAssign.h +270 -0
  273. include/eigen3/Eigen/src/SparseCore/SparseBlock.h +571 -0
  274. include/eigen3/Eigen/src/SparseCore/SparseColEtree.h +206 -0
  275. include/eigen3/Eigen/src/SparseCore/SparseCompressedBase.h +370 -0
  276. include/eigen3/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +722 -0
  277. include/eigen3/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +150 -0
  278. include/eigen3/Eigen/src/SparseCore/SparseDenseProduct.h +342 -0
  279. include/eigen3/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
  280. include/eigen3/Eigen/src/SparseCore/SparseDot.h +98 -0
  281. include/eigen3/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
  282. include/eigen3/Eigen/src/SparseCore/SparseMap.h +305 -0
  283. include/eigen3/Eigen/src/SparseCore/SparseMatrix.h +1518 -0
  284. include/eigen3/Eigen/src/SparseCore/SparseMatrixBase.h +398 -0
  285. include/eigen3/Eigen/src/SparseCore/SparsePermutation.h +178 -0
  286. include/eigen3/Eigen/src/SparseCore/SparseProduct.h +181 -0
  287. include/eigen3/Eigen/src/SparseCore/SparseRedux.h +49 -0
  288. include/eigen3/Eigen/src/SparseCore/SparseRef.h +397 -0
  289. include/eigen3/Eigen/src/SparseCore/SparseSelfAdjointView.h +659 -0
  290. include/eigen3/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
  291. include/eigen3/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
  292. include/eigen3/Eigen/src/SparseCore/SparseTranspose.h +92 -0
  293. include/eigen3/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
  294. include/eigen3/Eigen/src/SparseCore/SparseUtil.h +186 -0
  295. include/eigen3/Eigen/src/SparseCore/SparseVector.h +478 -0
  296. include/eigen3/Eigen/src/SparseCore/SparseView.h +254 -0
  297. include/eigen3/Eigen/src/SparseCore/TriangularSolver.h +315 -0
  298. include/eigen3/Eigen/src/SparseLU/SparseLU.h +923 -0
  299. include/eigen3/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
  300. include/eigen3/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
  301. include/eigen3/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
  302. include/eigen3/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +375 -0
  303. include/eigen3/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
  304. include/eigen3/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
  305. include/eigen3/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
  306. include/eigen3/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
  307. include/eigen3/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +280 -0
  308. include/eigen3/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +126 -0
  309. include/eigen3/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +130 -0
  310. include/eigen3/Eigen/src/SparseLU/SparseLU_panel_bmod.h +223 -0
  311. include/eigen3/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
  312. include/eigen3/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
  313. include/eigen3/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
  314. include/eigen3/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
  315. include/eigen3/Eigen/src/SparseQR/SparseQR.h +758 -0
  316. include/eigen3/Eigen/src/StlSupport/StdDeque.h +116 -0
  317. include/eigen3/Eigen/src/StlSupport/StdList.h +106 -0
  318. include/eigen3/Eigen/src/StlSupport/StdVector.h +131 -0
  319. include/eigen3/Eigen/src/StlSupport/details.h +84 -0
  320. include/eigen3/Eigen/src/SuperLUSupport/SuperLUSupport.h +1025 -0
  321. include/eigen3/Eigen/src/UmfPackSupport/UmfPackSupport.h +642 -0
  322. include/eigen3/Eigen/src/misc/Image.h +82 -0
  323. include/eigen3/Eigen/src/misc/Kernel.h +79 -0
  324. include/eigen3/Eigen/src/misc/RealSvd2x2.h +55 -0
  325. include/eigen3/Eigen/src/misc/blas.h +440 -0
  326. include/eigen3/Eigen/src/misc/lapack.h +152 -0
  327. include/eigen3/Eigen/src/misc/lapacke.h +16292 -0
  328. include/eigen3/Eigen/src/misc/lapacke_mangling.h +17 -0
  329. include/eigen3/Eigen/src/plugins/ArrayCwiseBinaryOps.h +358 -0
  330. include/eigen3/Eigen/src/plugins/ArrayCwiseUnaryOps.h +696 -0
  331. include/eigen3/Eigen/src/plugins/BlockMethods.h +1442 -0
  332. include/eigen3/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
  333. include/eigen3/Eigen/src/plugins/CommonCwiseUnaryOps.h +177 -0
  334. include/eigen3/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  335. include/eigen3/Eigen/src/plugins/MatrixCwiseBinaryOps.h +152 -0
  336. include/eigen3/Eigen/src/plugins/MatrixCwiseUnaryOps.h +95 -0
  337. include/eigen3/Eigen/src/plugins/ReshapedMethods.h +149 -0
  338. include/eigen3/signature_of_eigen3_matrix_library +1 -0
  339. include/eigen3/unsupported/Eigen/AdolcForward +159 -0
  340. include/eigen3/unsupported/Eigen/AlignedVector3 +234 -0
  341. include/eigen3/unsupported/Eigen/ArpackSupport +30 -0
  342. include/eigen3/unsupported/Eigen/AutoDiff +46 -0
  343. include/eigen3/unsupported/Eigen/BVH +95 -0
  344. include/eigen3/unsupported/Eigen/CXX11/Tensor +137 -0
  345. include/eigen3/unsupported/Eigen/CXX11/TensorSymmetry +42 -0
  346. include/eigen3/unsupported/Eigen/CXX11/ThreadPool +74 -0
  347. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +554 -0
  348. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +329 -0
  349. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +247 -0
  350. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +1176 -0
  351. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +1559 -0
  352. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +1093 -0
  353. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +518 -0
  354. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +377 -0
  355. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +1023 -0
  356. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +73 -0
  357. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +6 -0
  358. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +1413 -0
  359. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +575 -0
  360. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +1650 -0
  361. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +1679 -0
  362. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +456 -0
  363. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +1132 -0
  364. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +544 -0
  365. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +214 -0
  366. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +347 -0
  367. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +137 -0
  368. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +6 -0
  369. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +104 -0
  370. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +389 -0
  371. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +1048 -0
  372. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +409 -0
  373. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h +236 -0
  374. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +490 -0
  375. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +236 -0
  376. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +983 -0
  377. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +703 -0
  378. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +388 -0
  379. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +669 -0
  380. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +379 -0
  381. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +237 -0
  382. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +191 -0
  383. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +488 -0
  384. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +302 -0
  385. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h +33 -0
  386. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +99 -0
  387. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h +44 -0
  388. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +79 -0
  389. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +603 -0
  390. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +738 -0
  391. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +247 -0
  392. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +82 -0
  393. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +263 -0
  394. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +216 -0
  395. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +98 -0
  396. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +327 -0
  397. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +311 -0
  398. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +1102 -0
  399. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +708 -0
  400. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +291 -0
  401. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +322 -0
  402. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +998 -0
  403. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +6 -0
  404. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +966 -0
  405. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +582 -0
  406. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +454 -0
  407. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +465 -0
  408. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +528 -0
  409. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h +513 -0
  410. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +471 -0
  411. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +161 -0
  412. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +346 -0
  413. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +303 -0
  414. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +264 -0
  415. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +249 -0
  416. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +629 -0
  417. include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +293 -0
  418. include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +236 -0
  419. include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h +338 -0
  420. include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h +669 -0
  421. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h +67 -0
  422. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +249 -0
  423. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +486 -0
  424. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +236 -0
  425. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h +23 -0
  426. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h +40 -0
  427. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h +301 -0
  428. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h +48 -0
  429. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h +20 -0
  430. include/eigen3/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +537 -0
  431. include/eigen3/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h +88 -0
  432. include/eigen3/unsupported/Eigen/CXX11/src/util/EmulateArray.h +261 -0
  433. include/eigen3/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h +158 -0
  434. include/eigen3/unsupported/Eigen/EulerAngles +43 -0
  435. include/eigen3/unsupported/Eigen/FFT +419 -0
  436. include/eigen3/unsupported/Eigen/IterativeSolvers +51 -0
  437. include/eigen3/unsupported/Eigen/KroneckerProduct +36 -0
  438. include/eigen3/unsupported/Eigen/LevenbergMarquardt +49 -0
  439. include/eigen3/unsupported/Eigen/MPRealSupport +213 -0
  440. include/eigen3/unsupported/Eigen/MatrixFunctions +504 -0
  441. include/eigen3/unsupported/Eigen/MoreVectorization +24 -0
  442. include/eigen3/unsupported/Eigen/NonLinearOptimization +140 -0
  443. include/eigen3/unsupported/Eigen/NumericalDiff +56 -0
  444. include/eigen3/unsupported/Eigen/OpenGLSupport +322 -0
  445. include/eigen3/unsupported/Eigen/Polynomials +137 -0
  446. include/eigen3/unsupported/Eigen/Skyline +39 -0
  447. include/eigen3/unsupported/Eigen/SparseExtra +54 -0
  448. include/eigen3/unsupported/Eigen/SpecialFunctions +103 -0
  449. include/eigen3/unsupported/Eigen/Splines +35 -0
  450. include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h +108 -0
  451. include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +730 -0
  452. include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h +220 -0
  453. include/eigen3/unsupported/Eigen/src/BVH/BVAlgorithms.h +293 -0
  454. include/eigen3/unsupported/Eigen/src/BVH/KdBVH.h +223 -0
  455. include/eigen3/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h +790 -0
  456. include/eigen3/unsupported/Eigen/src/EulerAngles/EulerAngles.h +355 -0
  457. include/eigen3/unsupported/Eigen/src/EulerAngles/EulerSystem.h +305 -0
  458. include/eigen3/unsupported/Eigen/src/FFT/ei_fftw_impl.h +261 -0
  459. include/eigen3/unsupported/Eigen/src/FFT/ei_kissfft_impl.h +449 -0
  460. include/eigen3/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h +187 -0
  461. include/eigen3/unsupported/Eigen/src/IterativeSolvers/DGMRES.h +511 -0
  462. include/eigen3/unsupported/Eigen/src/IterativeSolvers/GMRES.h +335 -0
  463. include/eigen3/unsupported/Eigen/src/IterativeSolvers/IDRS.h +436 -0
  464. include/eigen3/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h +90 -0
  465. include/eigen3/unsupported/Eigen/src/IterativeSolvers/IterationController.h +154 -0
  466. include/eigen3/unsupported/Eigen/src/IterativeSolvers/MINRES.h +267 -0
  467. include/eigen3/unsupported/Eigen/src/IterativeSolvers/Scaling.h +193 -0
  468. include/eigen3/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h +305 -0
  469. include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h +84 -0
  470. include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h +202 -0
  471. include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h +160 -0
  472. include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h +188 -0
  473. include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h +396 -0
  474. include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +441 -0
  475. include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +569 -0
  476. include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +373 -0
  477. include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +705 -0
  478. include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +368 -0
  479. include/eigen3/unsupported/Eigen/src/MatrixFunctions/StemFunction.h +117 -0
  480. include/eigen3/unsupported/Eigen/src/MoreVectorization/MathFunctions.h +95 -0
  481. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h +601 -0
  482. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h +657 -0
  483. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/chkder.h +66 -0
  484. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/covar.h +70 -0
  485. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/dogleg.h +107 -0
  486. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h +79 -0
  487. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/lmpar.h +298 -0
  488. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h +91 -0
  489. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h +30 -0
  490. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1updt.h +99 -0
  491. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h +49 -0
  492. include/eigen3/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h +130 -0
  493. include/eigen3/unsupported/Eigen/src/Polynomials/Companion.h +280 -0
  494. include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +428 -0
  495. include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialUtils.h +143 -0
  496. include/eigen3/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h +352 -0
  497. include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrix.h +862 -0
  498. include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h +212 -0
  499. include/eigen3/unsupported/Eigen/src/Skyline/SkylineProduct.h +295 -0
  500. include/eigen3/unsupported/Eigen/src/Skyline/SkylineStorage.h +259 -0
  501. include/eigen3/unsupported/Eigen/src/Skyline/SkylineUtil.h +89 -0
  502. include/eigen3/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h +122 -0
  503. include/eigen3/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h +1079 -0
  504. include/eigen3/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h +404 -0
  505. include/eigen3/unsupported/Eigen/src/SparseExtra/MarketIO.h +282 -0
  506. include/eigen3/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h +247 -0
  507. include/eigen3/unsupported/Eigen/src/SparseExtra/RandomSetter.h +349 -0
  508. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h +286 -0
  509. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h +68 -0
  510. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h +357 -0
  511. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h +66 -0
  512. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h +1959 -0
  513. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h +118 -0
  514. include/eigen3/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h +67 -0
  515. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h +167 -0
  516. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h +58 -0
  517. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h +330 -0
  518. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h +58 -0
  519. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +2045 -0
  520. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h +79 -0
  521. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h +46 -0
  522. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h +16 -0
  523. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h +46 -0
  524. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h +16 -0
  525. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h +369 -0
  526. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h +54 -0
  527. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h +34 -0
  528. include/eigen3/unsupported/Eigen/src/Splines/Spline.h +507 -0
  529. include/eigen3/unsupported/Eigen/src/Splines/SplineFitting.h +431 -0
  530. include/eigen3/unsupported/Eigen/src/Splines/SplineFwd.h +93 -0
  531. pyvale/__init__.py +23 -0
  532. pyvale/blender/__init__.py +23 -0
  533. pyvale/blender/blendercalibrationdata.py +17 -0
  534. pyvale/blender/blenderexceptions.py +8 -0
  535. pyvale/blender/blenderlightdata.py +26 -0
  536. pyvale/blender/blendermaterialdata.py +15 -0
  537. pyvale/blender/blenderrenderdata.py +35 -0
  538. pyvale/blender/blenderscene.py +493 -0
  539. pyvale/blender/blendertools.py +449 -0
  540. pyvale/calib/__init__.py +11 -0
  541. pyvale/calib/calibcpp.cpython-311-i386-linux-gnu.so +0 -0
  542. pyvale/calib/calibdotdetect.py +510 -0
  543. pyvale/calib/calibparams.py +47 -0
  544. pyvale/calib/calibstereo.py +441 -0
  545. pyvale/calib/cpp/bindings.cpp +22 -0
  546. pyvale/calib/cpp/calibdotdetect.cpp +16 -0
  547. pyvale/calib/cpp/calibdotdetect.hpp +20 -0
  548. pyvale/calib/cpp/calibopt.cpp +347 -0
  549. pyvale/calib/cpp/calibopt.hpp +84 -0
  550. pyvale/calib/cpp/calibstereo.cpp +95 -0
  551. pyvale/calib/cpp/calibstereo.hpp +27 -0
  552. pyvale/common_cpp/__init__.py +5 -0
  553. pyvale/common_cpp/bindings.cpp +33 -0
  554. pyvale/common_cpp/common_cpp.cpython-311-i386-linux-gnu.so +0 -0
  555. pyvale/common_cpp/defines.hpp +39 -0
  556. pyvale/common_cpp/dicsignalhandler.cpp +16 -0
  557. pyvale/common_cpp/dicsignalhandler.hpp +11 -0
  558. pyvale/common_cpp/pocketfft_hdronly.h +3744 -0
  559. pyvale/common_cpp/progressbar.hpp +107 -0
  560. pyvale/common_cpp/util.cpp +19 -0
  561. pyvale/common_cpp/util.hpp +72 -0
  562. pyvale/common_py/util.py +63 -0
  563. pyvale/data/DIC_Challenge_Star_Noise_Def.tiff +0 -0
  564. pyvale/data/DIC_Challenge_Star_Noise_Ref.tiff +0 -0
  565. pyvale/data/__init__.py +5 -0
  566. pyvale/data/cal_target.tiff +0 -0
  567. pyvale/data/calib.caldat +26 -0
  568. pyvale/data/case00_HEX20_out.e +0 -0
  569. pyvale/data/case00_HEX27_out.e +0 -0
  570. pyvale/data/case00_HEX8_out.e +0 -0
  571. pyvale/data/case00_TET10_out.e +0 -0
  572. pyvale/data/case00_TET14_out.e +0 -0
  573. pyvale/data/case00_TET4_out.e +0 -0
  574. pyvale/data/case16_d_out.e +0 -0
  575. pyvale/data/case16_out.e +0 -0
  576. pyvale/data/case17_out.e +0 -0
  577. pyvale/data/case18_d_out.e +0 -0
  578. pyvale/data/case18_out.e +0 -0
  579. pyvale/data/case26_out.e +0 -0
  580. pyvale/data/optspeckle_2464x2056px_spec5px_8bit_gblur1px.tiff +0 -0
  581. pyvale/data/plate_hole_def0000.tiff +0 -0
  582. pyvale/data/plate_hole_def0001.tiff +0 -0
  583. pyvale/data/plate_hole_ref0000.tiff +0 -0
  584. pyvale/data/plate_rigid_def0000.tiff +0 -0
  585. pyvale/data/plate_rigid_def0001.tiff +0 -0
  586. pyvale/data/plate_rigid_def_25px.tiff +0 -0
  587. pyvale/data/plate_rigid_def_50px.tiff +0 -0
  588. pyvale/data/plate_rigid_ref0000.tiff +0 -0
  589. pyvale/dataset/__init__.py +7 -0
  590. pyvale/dataset/dataset.py +483 -0
  591. pyvale/dic/__init__.py +15 -0
  592. pyvale/dic/cpp/bindings.cpp +52 -0
  593. pyvale/dic/cpp/dicfourier.cpp +705 -0
  594. pyvale/dic/cpp/dicfourier.hpp +410 -0
  595. pyvale/dic/cpp/dicinterpolator.cpp +633 -0
  596. pyvale/dic/cpp/dicinterpolator.hpp +162 -0
  597. pyvale/dic/cpp/dicmain.cpp +214 -0
  598. pyvale/dic/cpp/dicmain.hpp +61 -0
  599. pyvale/dic/cpp/dicoptimizer.cpp +564 -0
  600. pyvale/dic/cpp/dicoptimizer.hpp +279 -0
  601. pyvale/dic/cpp/dicresults.cpp +239 -0
  602. pyvale/dic/cpp/dicresults.hpp +64 -0
  603. pyvale/dic/cpp/dicrg.cpp +55 -0
  604. pyvale/dic/cpp/dicrg.hpp +52 -0
  605. pyvale/dic/cpp/dicscanmethod.cpp +819 -0
  606. pyvale/dic/cpp/dicscanmethod.hpp +119 -0
  607. pyvale/dic/cpp/dicshapefunc.cpp +117 -0
  608. pyvale/dic/cpp/dicshapefunc.hpp +40 -0
  609. pyvale/dic/cpp/dicsubset.cpp +325 -0
  610. pyvale/dic/cpp/dicsubset.hpp +122 -0
  611. pyvale/dic/cpp/dicutil.cpp +108 -0
  612. pyvale/dic/cpp/dicutil.hpp +96 -0
  613. pyvale/dic/cuda/malloc.cu +99 -0
  614. pyvale/dic/cuda/malloc.hpp +17 -0
  615. pyvale/dic/dic2d.py +190 -0
  616. pyvale/dic/dic2dconv.py +6 -0
  617. pyvale/dic/dic2dcpp.cpython-311-i386-linux-gnu.so +0 -0
  618. pyvale/dic/dicchecks.py +455 -0
  619. pyvale/dic/dicdataimport.py +402 -0
  620. pyvale/dic/dicregionofinterest.py +1163 -0
  621. pyvale/dic/dicresults.py +58 -0
  622. pyvale/examples/__init__.py +5 -0
  623. pyvale/examples/basicsensorsim/README.md +2 -0
  624. pyvale/examples/basicsensorsim/ex0_quickstart.py +139 -0
  625. pyvale/examples/basicsensorsim/ex1_scalar_sensors.py +240 -0
  626. pyvale/examples/basicsensorsim/ex2_vector_tensor_sensors.py +280 -0
  627. pyvale/examples/basicsensorsim/ex3_experiment_simulator.py +397 -0
  628. pyvale/examples/blenderimagedef/README.md +2 -0
  629. pyvale/examples/blenderimagedef/ex1_blender_scene2d.py +176 -0
  630. pyvale/examples/blenderimagedef/ex2_blender_imagedef2d.py +177 -0
  631. pyvale/examples/blenderimagedef/ex3_blender_scenestereo.py +205 -0
  632. pyvale/examples/blenderimagedef/ex4_blender_imagedefstereo.py +213 -0
  633. pyvale/examples/blenderimagedef/ex5_blender_calibstereo.py +190 -0
  634. pyvale/examples/dic/README.md +2 -0
  635. pyvale/examples/dic/ex1_region_of_interest.py +101 -0
  636. pyvale/examples/dic/ex2_plate_with_hole.py +155 -0
  637. pyvale/examples/dic/ex3_plate_with_hole_strain.py +99 -0
  638. pyvale/examples/dic/ex4_dic_blender.py +97 -0
  639. pyvale/examples/dic/ex5_dic_challenge.py +107 -0
  640. pyvale/examples/extsensorsim/README.md +2 -0
  641. pyvale/examples/extsensorsim/ex1_byosimdata.py +211 -0
  642. pyvale/examples/extsensorsim/ex2_meshfreesensors.py +174 -0
  643. pyvale/examples/extsensorsim/ex3a_scal2d.py +151 -0
  644. pyvale/examples/extsensorsim/ex3b_scal3d.py +150 -0
  645. pyvale/examples/extsensorsim/ex3c_vec2d.py +163 -0
  646. pyvale/examples/extsensorsim/ex3d_vec3d.py +169 -0
  647. pyvale/examples/extsensorsim/ex3e_tens2d.py +170 -0
  648. pyvale/examples/extsensorsim/ex3f_tens3d.py +198 -0
  649. pyvale/examples/extsensorsim/ex4a_basicerrs_scal2d.py +201 -0
  650. pyvale/examples/extsensorsim/ex4b_fielderrs_scal3d.py +197 -0
  651. pyvale/examples/extsensorsim/ex4c_angleerrs_vec2d.py +215 -0
  652. pyvale/examples/extsensorsim/ex4d_fieldlockerrs_vec3d.py +184 -0
  653. pyvale/examples/extsensorsim/ex4e_chainfielderrs_vec2d.py +233 -0
  654. pyvale/examples/extsensorsim/ex4f_caliberrs_scal2d.py +167 -0
  655. pyvale/examples/extsensorsim/ex4g_spatavgerrs_scal2d.py +146 -0
  656. pyvale/examples/extsensorsim/ex5a_expsim_thermmech2d.py +350 -0
  657. pyvale/examples/extsensorsim/ex5b_expsim_thermmech3d.py +358 -0
  658. pyvale/examples/genanalyticdata/ex1_1_scalarvisualisation.py +41 -0
  659. pyvale/examples/genanalyticdata/ex1_2_scalarcasebuild.py +43 -0
  660. pyvale/examples/genanalyticdata/ex2_1_analyticsensors.py +86 -0
  661. pyvale/examples/genanalyticdata/ex2_2_analyticsensors_nomesh.py +89 -0
  662. pyvale/examples/imagedef2d/ex_imagedef2d_todisk.py +84 -0
  663. pyvale/examples/mooseherder/README.md +2 -0
  664. pyvale/examples/mooseherder/ex0_create_moose_config.py +65 -0
  665. pyvale/examples/mooseherder/ex1a_modify_moose_input.py +71 -0
  666. pyvale/examples/mooseherder/ex1b_modify_gmsh_input.py +69 -0
  667. pyvale/examples/mooseherder/ex2a_run_moose_once.py +80 -0
  668. pyvale/examples/mooseherder/ex2b_run_gmsh_once.py +64 -0
  669. pyvale/examples/mooseherder/ex2c_run_both_once.py +114 -0
  670. pyvale/examples/mooseherder/ex3_run_moose_seq_para.py +157 -0
  671. pyvale/examples/mooseherder/ex4_run_gmsh-moose_seq_para.py +176 -0
  672. pyvale/examples/mooseherder/ex5_run_moose_paramulti.py +136 -0
  673. pyvale/examples/mooseherder/ex6_read_moose_exodus.py +163 -0
  674. pyvale/examples/mooseherder/ex7a_read_moose_herd_results.py +153 -0
  675. pyvale/examples/mooseherder/ex7b_read_multi_herd_results.py +116 -0
  676. pyvale/examples/mooseherder/ex7c_read_multi_gmshmoose_results.py +127 -0
  677. pyvale/examples/mooseherder/ex7d_readconfig_multi_gmshmoose_results.py +143 -0
  678. pyvale/examples/mooseherder/ex8_read_existing_sweep_output.py +72 -0
  679. pyvale/examples/rasterimagedef/ex_rastenp.py +194 -0
  680. pyvale/examples/rasterimagedef/ex_rastercyth_oneframe.py +206 -0
  681. pyvale/examples/rasterimagedef/ex_rastercyth_static_cypara.py +189 -0
  682. pyvale/examples/rasterimagedef/ex_rastercyth_static_pypara.py +219 -0
  683. pyvale/examples/visualisation/ex1_visualisation_options.py +111 -0
  684. pyvale/mooseherder/__init__.py +55 -0
  685. pyvale/mooseherder/directorymanager.py +408 -0
  686. pyvale/mooseherder/exceptions.py +10 -0
  687. pyvale/mooseherder/exodusloader.py +762 -0
  688. pyvale/mooseherder/gmshrunner.py +158 -0
  689. pyvale/mooseherder/inputmodifier.py +240 -0
  690. pyvale/mooseherder/mooseconfig.py +212 -0
  691. pyvale/mooseherder/mooseherd.py +539 -0
  692. pyvale/mooseherder/mooserunner.py +307 -0
  693. pyvale/mooseherder/outputloader.py +17 -0
  694. pyvale/mooseherder/simdata.py +93 -0
  695. pyvale/mooseherder/simloaderbyfield.py +211 -0
  696. pyvale/mooseherder/simloaderbytime.py +193 -0
  697. pyvale/mooseherder/simloadopts.py +55 -0
  698. pyvale/mooseherder/simloadtools.py +465 -0
  699. pyvale/mooseherder/simrunner.py +31 -0
  700. pyvale/mooseherder/simsaver.py +401 -0
  701. pyvale/mooseherder/sweeploader.py +358 -0
  702. pyvale/mooseherder/sweeptools.py +76 -0
  703. pyvale/sensorsim/__init__.py +86 -0
  704. pyvale/sensorsim/camera.py +147 -0
  705. pyvale/sensorsim/cameradata.py +72 -0
  706. pyvale/sensorsim/cameradata2d.py +84 -0
  707. pyvale/sensorsim/camerasensor.py +147 -0
  708. pyvale/sensorsim/camerastereo.py +217 -0
  709. pyvale/sensorsim/cameratools.py +484 -0
  710. pyvale/sensorsim/cython/rastercyth.c +32404 -0
  711. pyvale/sensorsim/cython/rastercyth.html +3392 -0
  712. pyvale/sensorsim/cython/rastercyth.py +684 -0
  713. pyvale/sensorsim/enums.py +16 -0
  714. pyvale/sensorsim/errordriftcalc.py +104 -0
  715. pyvale/sensorsim/errorintegrator.py +359 -0
  716. pyvale/sensorsim/errorrand.py +105 -0
  717. pyvale/sensorsim/errorsimulator.py +137 -0
  718. pyvale/sensorsim/errorsyscalib.py +93 -0
  719. pyvale/sensorsim/errorsysdep.py +197 -0
  720. pyvale/sensorsim/errorsysfield.py +383 -0
  721. pyvale/sensorsim/errorsysindep.py +209 -0
  722. pyvale/sensorsim/exceptions.py +14 -0
  723. pyvale/sensorsim/experimentsimio.py +94 -0
  724. pyvale/sensorsim/experimentsimulator.py +615 -0
  725. pyvale/sensorsim/experimentstats.py +115 -0
  726. pyvale/sensorsim/field.py +127 -0
  727. pyvale/sensorsim/fieldconverter.py +378 -0
  728. pyvale/sensorsim/fieldinterp.py +89 -0
  729. pyvale/sensorsim/fieldinterpmesh.py +119 -0
  730. pyvale/sensorsim/fieldinterppoints.py +93 -0
  731. pyvale/sensorsim/fieldsampler.py +110 -0
  732. pyvale/sensorsim/fieldscalar.py +94 -0
  733. pyvale/sensorsim/fieldtensor.py +150 -0
  734. pyvale/sensorsim/fieldtransform.py +388 -0
  735. pyvale/sensorsim/fieldvector.py +136 -0
  736. pyvale/sensorsim/generatorsrandom.py +420 -0
  737. pyvale/sensorsim/imagedef2d.py +577 -0
  738. pyvale/sensorsim/imagetools.py +137 -0
  739. pyvale/sensorsim/integratorfactory.py +240 -0
  740. pyvale/sensorsim/integratorquadrature.py +217 -0
  741. pyvale/sensorsim/integratorrectangle.py +165 -0
  742. pyvale/sensorsim/integratorspatial.py +89 -0
  743. pyvale/sensorsim/integratortype.py +43 -0
  744. pyvale/sensorsim/logger.py +23 -0
  745. pyvale/sensorsim/plotting_logs.py +22 -0
  746. pyvale/sensorsim/raster.py +31 -0
  747. pyvale/sensorsim/rastercy.py +107 -0
  748. pyvale/sensorsim/rasternp.py +627 -0
  749. pyvale/sensorsim/rasteropts.py +58 -0
  750. pyvale/sensorsim/renderer.py +47 -0
  751. pyvale/sensorsim/rendermesh.py +137 -0
  752. pyvale/sensorsim/renderscene.py +51 -0
  753. pyvale/sensorsim/sensorarray.py +178 -0
  754. pyvale/sensorsim/sensordata.py +74 -0
  755. pyvale/sensorsim/sensordescriptor.py +275 -0
  756. pyvale/sensorsim/sensorfactory.py +179 -0
  757. pyvale/sensorsim/sensorspoint.py +308 -0
  758. pyvale/sensorsim/sensortools.py +113 -0
  759. pyvale/sensorsim/simtools.py +300 -0
  760. pyvale/sensorsim/visualexpplotter.py +201 -0
  761. pyvale/sensorsim/visualimagedef.py +74 -0
  762. pyvale/sensorsim/visualimages.py +76 -0
  763. pyvale/sensorsim/visualopts.py +507 -0
  764. pyvale/sensorsim/visualsimanimator.py +111 -0
  765. pyvale/sensorsim/visualsimplotter.py +180 -0
  766. pyvale/sensorsim/visualsimsensors.py +343 -0
  767. pyvale/sensorsim/visualtools.py +136 -0
  768. pyvale/sensorsim/visualtraceanimator.py +77 -0
  769. pyvale/sensorsim/visualtraceplotter.py +296 -0
  770. pyvale/simcases/case00_HEX20.i +242 -0
  771. pyvale/simcases/case00_HEX27.i +242 -0
  772. pyvale/simcases/case00_HEX8.i +242 -0
  773. pyvale/simcases/case00_TET10.i +242 -0
  774. pyvale/simcases/case00_TET14.i +242 -0
  775. pyvale/simcases/case00_TET4.i +242 -0
  776. pyvale/simcases/case01.i +101 -0
  777. pyvale/simcases/case02.i +156 -0
  778. pyvale/simcases/case03.i +136 -0
  779. pyvale/simcases/case04.i +181 -0
  780. pyvale/simcases/case05.i +234 -0
  781. pyvale/simcases/case06.i +305 -0
  782. pyvale/simcases/case07.geo +135 -0
  783. pyvale/simcases/case07.i +87 -0
  784. pyvale/simcases/case08.geo +144 -0
  785. pyvale/simcases/case08.i +153 -0
  786. pyvale/simcases/case09.geo +204 -0
  787. pyvale/simcases/case09.i +87 -0
  788. pyvale/simcases/case10.geo +204 -0
  789. pyvale/simcases/case10.i +257 -0
  790. pyvale/simcases/case11.geo +337 -0
  791. pyvale/simcases/case11.i +147 -0
  792. pyvale/simcases/case12.geo +388 -0
  793. pyvale/simcases/case12.i +329 -0
  794. pyvale/simcases/case13.i +140 -0
  795. pyvale/simcases/case14.i +159 -0
  796. pyvale/simcases/case15.geo +337 -0
  797. pyvale/simcases/case15.i +150 -0
  798. pyvale/simcases/case16.geo +391 -0
  799. pyvale/simcases/case16.i +357 -0
  800. pyvale/simcases/case16_d.i +360 -0
  801. pyvale/simcases/case16_u.i +360 -0
  802. pyvale/simcases/case17.geo +138 -0
  803. pyvale/simcases/case17.i +144 -0
  804. pyvale/simcases/case18.i +271 -0
  805. pyvale/simcases/case18_d.i +271 -0
  806. pyvale/simcases/case18_u.i +271 -0
  807. pyvale/simcases/case19.geo +252 -0
  808. pyvale/simcases/case19.i +99 -0
  809. pyvale/simcases/case20.geo +252 -0
  810. pyvale/simcases/case20.i +250 -0
  811. pyvale/simcases/case21.geo +74 -0
  812. pyvale/simcases/case21.i +155 -0
  813. pyvale/simcases/case22.geo +82 -0
  814. pyvale/simcases/case22.i +140 -0
  815. pyvale/simcases/case23.geo +164 -0
  816. pyvale/simcases/case23.i +140 -0
  817. pyvale/simcases/case24.geo +79 -0
  818. pyvale/simcases/case24.i +123 -0
  819. pyvale/simcases/case25.geo +82 -0
  820. pyvale/simcases/case25.i +140 -0
  821. pyvale/simcases/case26.geo +166 -0
  822. pyvale/simcases/case26.i +140 -0
  823. pyvale/simcases/cases_dictionary.yaml +336 -0
  824. pyvale/simcases/run_1case.py +60 -0
  825. pyvale/simcases/run_all_cases.py +69 -0
  826. pyvale/simcases/run_build_case.py +64 -0
  827. pyvale/simcases/run_example_cases.py +69 -0
  828. pyvale/strain/__init__.py +13 -0
  829. pyvale/strain/cpp/bindings.cpp +25 -0
  830. pyvale/strain/cpp/smooth.cpp +140 -0
  831. pyvale/strain/cpp/smooth.hpp +53 -0
  832. pyvale/strain/cpp/strain.cpp +390 -0
  833. pyvale/strain/cpp/strain.hpp +177 -0
  834. pyvale/strain/strain.py +117 -0
  835. pyvale/strain/strain_cpp.cpython-311-i386-linux-gnu.so +0 -0
  836. pyvale/strain/strainchecks.py +47 -0
  837. pyvale/strain/strainimport.py +303 -0
  838. pyvale/strain/strainresults.py +55 -0
  839. pyvale/verif/__init__.py +15 -0
  840. pyvale/verif/analyticmeshgen.py +102 -0
  841. pyvale/verif/analyticsimdatafactory.py +125 -0
  842. pyvale/verif/analyticsimdatagenerator.py +368 -0
  843. pyvale/verif/matchsimdata.py +113 -0
  844. pyvale/verif/pointsens.py +120 -0
  845. pyvale/verif/pointsensconst.py +19 -0
  846. pyvale/verif/pointsensmech.py +270 -0
  847. pyvale/verif/pointsensmultiphys.py +184 -0
  848. pyvale/verif/pointsensscalar.py +383 -0
  849. pyvale/verif/pointsenstensor.py +159 -0
  850. pyvale/verif/pointsensvector.py +157 -0
  851. pyvale-2026.1.1.dist-info/METADATA +98 -0
  852. pyvale-2026.1.1.dist-info/RECORD +860 -0
  853. pyvale-2026.1.1.dist-info/WHEEL +6 -0
  854. pyvale-2026.1.1.dist-info/licenses/LICENSE +21 -0
  855. pyvale.libs/libgomp-65f46eca.so.1.0.0 +0 -0
  856. share/eigen3/cmake/Eigen3Config.cmake +37 -0
  857. share/eigen3/cmake/Eigen3ConfigVersion.cmake +65 -0
  858. share/eigen3/cmake/Eigen3Targets.cmake +106 -0
  859. share/eigen3/cmake/UseEigen3.cmake +6 -0
  860. share/pkgconfig/eigen3.pc +9 -0
@@ -0,0 +1,3744 @@
1
+ /*
2
+ This file is part of pocketfft.
3
+
4
+ Copyright (C) 2010-2024 Max-Planck-Society
5
+ Copyright (C) 2019-2020 Peter Bell
6
+
7
+ For the odd-sized DCT-IV transforms:
8
+ Copyright (C) 2003, 2007-14 Matteo Frigo
9
+ Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology
10
+
11
+ For the prev_good_size search:
12
+ Copyright (C) 2024 Tan Ping Liang, Peter Bell
13
+
14
+ For the safeguards against integer overflow in good_size search:
15
+ Copyright (C) 2024 Cris Luengo
16
+
17
+ Authors: Martin Reinecke, Peter Bell
18
+
19
+ All rights reserved.
20
+
21
+ Redistribution and use in source and binary forms, with or without modification,
22
+ are permitted provided that the following conditions are met:
23
+
24
+ * Redistributions of source code must retain the above copyright notice, this
25
+ list of conditions and the following disclaimer.
26
+ * Redistributions in binary form must reproduce the above copyright notice, this
27
+ list of conditions and the following disclaimer in the documentation and/or
28
+ other materials provided with the distribution.
29
+ * Neither the name of the copyright holder nor the names of its contributors may
30
+ be used to endorse or promote products derived from this software without
31
+ specific prior written permission.
32
+
33
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
34
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
35
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
36
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
37
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
38
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
39
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
40
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
41
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43
+ */
44
+
45
+ #ifndef POCKETFFT_HDRONLY_H
46
+ #define POCKETFFT_HDRONLY_H
47
+
48
+ #ifndef __cplusplus
49
+ #error This file is C++ and requires a C++ compiler.
50
+ #endif
51
+
52
+ #if !(__cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L))
53
+ #error This file requires at least C++11 support.
54
+ #endif
55
+
56
+ #ifndef POCKETFFT_CACHE_SIZE
57
+ #define POCKETFFT_CACHE_SIZE 0
58
+ #endif
59
+
60
+ #include <cmath>
61
+ #include <cstdlib>
62
+ #include <cstddef>
63
+ #include <cstdint>
64
+ #include <exception>
65
+ #include <stdexcept>
66
+ #include <memory>
67
+ #include <vector>
68
+ #include <complex>
69
+ #include <algorithm>
70
+ #include <limits>
71
+ #if POCKETFFT_CACHE_SIZE!=0
72
+ #include <array>
73
+ #include <mutex>
74
+ #endif
75
+
76
+ #ifndef POCKETFFT_NO_MULTITHREADING
77
+ #include <mutex>
78
+ #include <condition_variable>
79
+ #include <thread>
80
+ #include <queue>
81
+ #include <atomic>
82
+ #include <functional>
83
+ #include <new>
84
+
85
+ #ifdef POCKETFFT_PTHREADS
86
+ # include <pthread.h>
87
+ #endif
88
+ #endif
89
+
90
+ #if defined(__GNUC__)
91
+ #define POCKETFFT_NOINLINE __attribute__((noinline))
92
+ #define POCKETFFT_RESTRICT __restrict__
93
+ #elif defined(_MSC_VER)
94
+ #define POCKETFFT_NOINLINE __declspec(noinline)
95
+ #define POCKETFFT_RESTRICT __restrict
96
+ #else
97
+ #define POCKETFFT_NOINLINE
98
+ #define POCKETFFT_RESTRICT
99
+ #endif
100
+
101
+ namespace pocketfft {
102
+
103
+ namespace detail {
104
// Make the standard size and pointer-difference types usable unqualified.
using std::ptrdiff_t;
using std::size_t;

// Poison the unqualified <cmath> names: a call written as cos(x), sin(x) or
// sqrt(x) resolves to one of these deleted templates, so the std:: versions
// have to be spelled out explicitly.
template<class T> T cos(T) = delete;
template<class T> T sin(T) = delete;
template<class T> T sqrt(T) = delete;

// Vector of unsigned sizes and vector of signed offsets.
using shape_t = std::vector<size_t>;
using stride_t = std::vector<ptrdiff_t>;

// Named boolean flags for the two transform directions.
constexpr bool FORWARD = true;
constexpr bool BACKWARD = false;
117
+
118
+ // only enable vector support for gcc>=5.0 and clang>=5.0
119
+ #ifndef POCKETFFT_NO_VECTORS
120
+ #define POCKETFFT_NO_VECTORS
121
+ #if defined(__INTEL_COMPILER)
122
+ // do nothing. This is necessary because this compiler also sets __GNUC__.
123
+ #elif defined(__clang__)
124
+ // AppleClang has their own version numbering
125
+ #ifdef __apple_build_version__
126
+ # if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1)
127
+ # undef POCKETFFT_NO_VECTORS
128
+ # endif
129
+ #elif __clang_major__ >= 5
130
+ # undef POCKETFFT_NO_VECTORS
131
+ #endif
132
+ #elif defined(__GNUC__)
133
+ #if __GNUC__>=5
134
+ #undef POCKETFFT_NO_VECTORS
135
+ #endif
136
+ #endif
137
+ #endif
138
+
139
// Per-type constant `val`. The generic case is 1; float and double get larger
// values below when vector support is enabled.
// NOTE(review): presumably the number of T elements per SIMD register - confirm
// against the code that consumes VLEN.
template<typename T> struct VLEN
  { static constexpr size_t val = 1; };

#ifndef POCKETFFT_NO_VECTORS
// Pick the float/double specializations from the instruction-set macros the
// compiler predefines; the first matching branch wins.
#  if defined(__AVX512F__)
template<> struct VLEN<float>
  { static constexpr size_t val = 16; };
template<> struct VLEN<double>
  { static constexpr size_t val = 8; };
#  elif defined(__AVX__)
template<> struct VLEN<float>
  { static constexpr size_t val = 8; };
template<> struct VLEN<double>
  { static constexpr size_t val = 4; };
#  elif defined(__SSE2__) || defined(__VSX__) || defined(__ARM_NEON__) || defined(__ARM_NEON)
// SSE2, VSX and NEON all use the same pair of values.
template<> struct VLEN<float>
  { static constexpr size_t val = 4; };
template<> struct VLEN<double>
  { static constexpr size_t val = 2; };
#  else
// No known instruction set detected: switch to the non-vector configuration.
#    define POCKETFFT_NO_VECTORS
#  endif
#endif
161
+
162
+ // std::aligned_alloc is a bit cursed ... it doesn't exist on MacOS < 10.15
163
+ // and in musl, and other OSes seem to have even more peculiarities.
164
+ // Let's unconditionally work around it for now.
165
#if 0
//#if (__cplusplus >= 201703L) && (!defined(__MINGW32__)) && (!defined(_MSC_VER)) && (__MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_15)
// Disabled variant: thin wrappers around the C library's aligned_alloc()/free().
inline void *aligned_alloc(size_t align, size_t size)
  {
  // aligned_alloc() requires that the requested size is a multiple of "align".
  // Refuse requests whose round-up would wrap around instead of silently
  // allocating a much smaller block.
  if (size > std::numeric_limits<size_t>::max()-(align-1))
    throw std::bad_alloc();
  void *ptr = ::aligned_alloc(align,(size+align-1)&(~(align-1)));
  if (!ptr) throw std::bad_alloc();
  return ptr;
  }
inline void aligned_dealloc(void *ptr)
  { free(ptr); }
#else // portable emulation
// Allocates `size` bytes whose start address is a multiple of `align`
// (raised to at least alignof(max_align_t)).
// The masking below assumes `align` is a power of two.
// Throws std::bad_alloc if the request cannot be satisfied or if size+align
// would overflow size_t.
// The returned pointer must be released with aligned_dealloc(), not free().
inline void *aligned_alloc(size_t align, size_t size)
  {
  align = std::max(align, alignof(max_align_t));
  // size+align must not wrap: otherwise malloc() would succeed with a buffer
  // far smaller than the caller asked for.
  if (size > std::numeric_limits<size_t>::max()-align)
    throw std::bad_alloc();
  void *ptr = malloc(size+align);
  if (!ptr) throw std::bad_alloc();
  // Round the raw address down to a multiple of align, then step one full
  // alignment unit forward, so the result lies strictly after ptr.
  void *res = reinterpret_cast<void *>
    ((reinterpret_cast<uintptr_t>(ptr) & ~(uintptr_t(align-1))) + uintptr_t(align));
  // Remember the pointer malloc() returned in the slot just before the
  // aligned block; aligned_dealloc() reads it back from there.
  (reinterpret_cast<void**>(res))[-1] = ptr;
  return res;
  }
// Releases a block obtained from aligned_alloc(); a null pointer is ignored.
inline void aligned_dealloc(void *ptr)
  { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
#endif
190
+
191
// Minimal owning buffer holding room for `sz` objects of type T.
// The storage is raw memory: no T constructors or destructors are run.
// The type is move-constructible only; declaring the move constructor
// suppresses the implicit copy operations.
template<typename T> class arr
  {
  private:
    T *p;      // start of the buffer, nullptr when empty
    size_t sz; // number of T slots in the buffer

    // Byte count for `num` elements; throws std::bad_alloc instead of letting
    // the multiplication wrap around.
    static size_t nbytes(size_t num)
      {
      if (num > std::numeric_limits<size_t>::max()/sizeof(T))
        throw std::bad_alloc();
      return num*sizeof(T);
      }

#if defined(POCKETFFT_NO_VECTORS)
    // Plain malloc/free when vector support is disabled.
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *res = malloc(nbytes(num));
      if (!res) throw std::bad_alloc();
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
#else
    // 64-byte aligned storage otherwise; aligned_alloc() throws on failure.
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *ptr = aligned_alloc(64, nbytes(num));
      return static_cast<T*>(ptr);
      }
    static void dealloc(T *ptr)
      { aligned_dealloc(ptr); }
#endif

  public:
    // Empty buffer.
    arr() : p(nullptr), sz(0) {}
    // Buffer with room for n elements (contents uninitialized).
    // Throws std::bad_alloc on failure.
    arr(size_t n) : p(ralloc(n)), sz(n) {}
    // Takes over other's buffer and leaves other empty.
    arr(arr &&other) noexcept
      : p(other.p), sz(other.sz)
      { other.p=nullptr; other.sz=0; }
    ~arr() { dealloc(p); }

    // Replaces the buffer by a fresh one with room for n elements; the old
    // contents are NOT preserved. Does nothing if n equals the current size.
    // If the allocation throws, the object is left empty.
    void resize(size_t n)
      {
      if (n==sz) return;
      dealloc(p);
      // Drop the stale pointer before allocating: if ralloc() throws, the
      // destructor must not free the already released block a second time.
      p = nullptr;
      sz = 0;
      p = ralloc(n);
      sz = n;
      }

    // Unchecked element access.
    T &operator[](size_t idx) { return p[idx]; }
    const T &operator[](size_t idx) const { return p[idx]; }

    T *data() { return p; }
    const T *data() const { return p; }

    size_t size() const { return sz; }
  };
242
+
243
// Lightweight complex value with real part `r` and imaginary part `i`.
// Mixed-type arithmetic is supported; the result type of the binary operators
// is derived from the component types via decltype.
template<typename T> struct cmplx
  {
  T r, i;

  // Deliberately empty: r and i are only default-initialized.
  cmplx() {}
  cmplx(T r_, T i_) : r(r_), i(i_) {}

  // Overwrite both parts.
  void Set(T r_, T i_)
    {
    r = r_;
    i = i_;
    }
  // Set a purely real value; the imaginary part becomes T(0).
  void Set(T r_)
    {
    r = r_;
    i = T(0);
    }

  cmplx &operator+= (const cmplx &other)
    {
    r += other.r;
    i += other.i;
    return *this;
    }

  // Scale both parts by a non-complex factor.
  template<typename T2> cmplx &operator*= (T2 other)
    {
    r *= other;
    i *= other;
    return *this;
    }

  // In-place complex product. The new real part is parked in a temporary so
  // that the imaginary part is still computed from the old r.
  template<typename T2> cmplx &operator*= (const cmplx<T2> &other)
    {
    const T re = r*other.r - i*other.i;
    i = r*other.i + i*other.r;
    r = re;
    return *this;
    }

  template<typename T2> cmplx &operator+= (const cmplx<T2> &other)
    {
    r += other.r;
    i += other.i;
    return *this;
    }

  template<typename T2> cmplx &operator-= (const cmplx<T2> &other)
    {
    r -= other.r;
    i -= other.i;
    return *this;
    }

  // Product with a non-complex factor.
  template<typename T2> auto operator* (const T2 &other) const
    -> cmplx<decltype(r*other)>
    {
    return {r*other, i*other};
    }

  template<typename T2> auto operator+ (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    {
    return {r+other.r, i+other.i};
    }

  template<typename T2> auto operator- (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    {
    return {r-other.r, i-other.i};
    }

  // Full complex product.
  template<typename T2> auto operator* (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    {
    return {r*other.r-i*other.i, r*other.i + i*other.r};
    }

  // fwd==true : (*this) times the complex conjugate of other
  // fwd==false: plain product (*this) times other
  template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    {
    using Tres = cmplx<decltype(r+other.r)>;
    if (fwd)
      return Tres(r*other.r+i*other.i, i*other.r-r*other.i);
    return Tres(r*other.r-i*other.i, r*other.i+i*other.r);
    }
  };
284
/// Butterfly: a receives c+d, b receives c-d.
template<typename T> inline void PM(T &a, T &b, T c, T d)
{
  a = c + d;
  b = c - d;
}
/// In-place butterfly: (a, b) becomes (a+b, a-b).
template<typename T> inline void PMINPLACE(T &a, T &b)
{
  T old_a = a;
  a += b;
  b = old_a - b;
}
/// In-place butterfly with swapped signs: (a, b) becomes (a-b, a+b).
template<typename T> inline void MPINPLACE(T &a, T &b)
{
  T old_a = a;
  a -= b;
  b = old_a + b;
}
290
+ template<typename T> cmplx<T> conj(const cmplx<T> &a)
291
+ { return {a.r, -a.i}; }
292
+ template<bool fwd, typename T, typename T2> void special_mul (const cmplx<T> &v1, const cmplx<T2> &v2, cmplx<T> &res)
293
+ {
294
+ res = fwd ? cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i)
295
+ : cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r);
296
+ }
297
+
298
+ template<typename T> void ROT90(cmplx<T> &a)
299
+ { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
300
+ template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
301
+ { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
302
+
303
+ //
304
+ // twiddle factor section
305
+ //
306
+ template<typename T> class sincos_2pibyn
307
+ {
308
+ private:
309
+ using Thigh = typename std::conditional<(sizeof(T)>sizeof(double)), T, double>::type;
310
+ size_t N, mask, shift;
311
+ arr<cmplx<Thigh>> v1, v2;
312
+
313
+ static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang)
314
+ {
315
+ x<<=3;
316
+ if (x<4*n) // first half
317
+ {
318
+ if (x<2*n) // first quadrant
319
+ {
320
+ if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), std::sin(Thigh(x)*ang));
321
+ return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), std::cos(Thigh(2*n-x)*ang));
322
+ }
323
+ else // second quadrant
324
+ {
325
+ x-=2*n;
326
+ if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), std::cos(Thigh(x)*ang));
327
+ return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), std::sin(Thigh(2*n-x)*ang));
328
+ }
329
+ }
330
+ else
331
+ {
332
+ x=8*n-x;
333
+ if (x<2*n) // third quadrant
334
+ {
335
+ if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), -std::sin(Thigh(x)*ang));
336
+ return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), -std::cos(Thigh(2*n-x)*ang));
337
+ }
338
+ else // fourth quadrant
339
+ {
340
+ x-=2*n;
341
+ if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), -std::cos(Thigh(x)*ang));
342
+ return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), -std::sin(Thigh(2*n-x)*ang));
343
+ }
344
+ }
345
+ }
346
+
347
+ public:
348
+ POCKETFFT_NOINLINE sincos_2pibyn(size_t n)
349
+ : N(n)
350
+ {
351
+ constexpr auto pi = 3.141592653589793238462643383279502884197L;
352
+ Thigh ang = Thigh(0.25L*pi/n);
353
+ size_t nval = (n+2)/2;
354
+ shift = 1;
355
+ while((size_t(1)<<shift)*(size_t(1)<<shift) < nval) ++shift;
356
+ mask = (size_t(1)<<shift)-1;
357
+ v1.resize(mask+1);
358
+ v1[0].Set(Thigh(1), Thigh(0));
359
+ for (size_t i=1; i<v1.size(); ++i)
360
+ v1[i]=calc(i,n,ang);
361
+ v2.resize((nval+mask)/(mask+1));
362
+ v2[0].Set(Thigh(1), Thigh(0));
363
+ for (size_t i=1; i<v2.size(); ++i)
364
+ v2[i]=calc(i*(mask+1),n,ang);
365
+ }
366
+
367
+ cmplx<T> operator[](size_t idx) const
368
+ {
369
+ if (2*idx<=N)
370
+ {
371
+ auto x1=v1[idx&mask], x2=v2[idx>>shift];
372
+ return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
373
+ }
374
+ idx = N-idx;
375
+ auto x1=v1[idx&mask], x2=v2[idx>>shift];
376
+ return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r));
377
+ }
378
+ };
379
+
380
+ struct util // hack to avoid duplicate symbols
381
+ {
382
/// Returns the largest prime factor of n by trial division.
/// Returns 1 for n==1 and 0 for n==0 (neither has a prime factor).
static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n)
{
  // Zero stays even forever, so the factor-2 loop below would never end.
  if (n==0) return 0;
  size_t res=1;
  while ((n&1)==0)
  { res=2; n>>=1; }
  for (size_t x=3; x*x<=n; x+=2)
    while ((n%x)==0)
    { res=x; n/=x; }
  // Whatever is left above 1 is a single prime larger than every factor
  // found so far.
  if (n>1) res=n;
  return res;
}
393
+
394
/// Heuristic cost estimate for a transform of length n: the sum of its prime
/// factors (factors above 5 weighted by 1.1) multiplied by n.
/// Returns 0 for n==0.
static POCKETFFT_NOINLINE double cost_guess (size_t n)
{
  // Zero stays even forever, so the factor-2 loop below would never end;
  // the final product with n would be 0 anyway.
  if (n==0) return 0.;
  constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
  size_t ni=n;
  double result=0.;
  while ((n&1)==0)
  { result+=2; n>>=1; }
  for (size_t x=3; x*x<=n; x+=2)
    while ((n%x)==0)
    {
      result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
      n/=x;
    }
  // A remainder above 1 is one last prime factor.
  if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
  return result*double(ni);
}
410
+
411
/* inner workings of good_size_cmplx() */
/// Searches for the smallest product of powers of 2, 3, 5, 7 and 11 that is
/// >= n, using arithmetic in UIntT. Values up to 12 are returned unchanged.
/// Throws std::runtime_error when n is too large for the search to run
/// without overflow, even after retrying in 64 bits.
template<typename UIntT>
static POCKETFFT_NOINLINE UIntT good_size_cmplx_typed(UIntT n)
{
  static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
    "type must be unsigned integer");
  if (n<=12) return n;
  if (n>std::numeric_limits<UIntT>::max()/11/2)
  {
    // The algorithm below doesn't work for this value, the multiplication can overflow.
    if (sizeof(UIntT)<sizeof(std::uint64_t))
    {
      // We can try using this algorithm with 64-bit integers:
      std::uint64_t res = good_size_cmplx_typed<std::uint64_t>(n);
      if (res<=std::numeric_limits<UIntT>::max())
        return static_cast<UIntT>(res);
    }
    // Otherwise, this size is ridiculously large, people shouldn't be computing FFTs this large.
    throw std::runtime_error("FFT size is too large.");
  }

  // 2*n is the starting upper bound for the search.
  UIntT bestfac=2*n;
  for (UIntT f11=1; f11<bestfac; f11*=11)
    for (UIntT f117=f11; f117<bestfac; f117*=7)
      for (UIntT f1175=f117; f1175<bestfac; f1175*=5)
      {
        // Start from f1175 times a power of two that reaches n, then trade
        // factors of 2 for factors of 3 until x becomes odd.
        UIntT x=f1175;
        while (x<n) x*=2;
        for (;;)
        {
          if (x<n)
            x*=3;
          else if (x>n)
          {
            if (x<bestfac) bestfac=x;
            if (x&1) break;
            x>>=1;
          }
          else
            return n;
        }
      }
  return bestfac;
}
/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n)
{
  return good_size_cmplx_typed(n);
}
/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n
   and a multiple of required_factor.
   Throws std::runtime_error if required_factor is 0 or if the result does
   not fit into size_t. */
static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n,
  size_t required_factor)
{
  if (required_factor<1)
    throw std::runtime_error("required factor must not be 0");
  // Ceiling division written so that it cannot wrap around, unlike
  // (n+required_factor-1)/required_factor.
  const size_t blocks = n/required_factor + ((n%required_factor)!=0 ? 1 : 0);
  const size_t good = good_size_cmplx(blocks);
  // Refuse to return a silently wrapped product.
  if (good > std::numeric_limits<size_t>::max()/required_factor)
    throw std::runtime_error("FFT size is too large.");
  return good * required_factor;
}
469
+
470
/* inner workings of good_size_real() */
/// Searches for the smallest product of powers of 2, 3 and 5 that is >= n,
/// using arithmetic in UIntT. Values up to 6 are returned unchanged.
/// Throws std::runtime_error when n is too large for the search to run
/// without overflow, even after retrying in 64 bits.
template<typename UIntT>
static POCKETFFT_NOINLINE UIntT good_size_real_typed(UIntT n)
{
  static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
    "type must be unsigned integer");
  if (n<=6) return n;
  if (n>std::numeric_limits<UIntT>::max()/5/2)
  {
    // The algorithm below doesn't work for this value, the multiplication can overflow.
    if (sizeof(UIntT)<sizeof(std::uint64_t))
    {
      // We can try using this algorithm with 64-bit integers:
      std::uint64_t res = good_size_real_typed<std::uint64_t>(n);
      if (res<=std::numeric_limits<UIntT>::max())
        return static_cast<UIntT>(res);
    }
    // Otherwise, this size is ridiculously large, people shouldn't be computing FFTs this large.
    throw std::runtime_error("FFT size is too large.");
  }

  // 2*n is the starting upper bound for the search.
  UIntT bestfac=2*n;
  for (UIntT f5=1; f5<bestfac; f5*=5)
  {
    // Start from f5 times a power of two that reaches n, then trade factors
    // of 2 for factors of 3 until x becomes odd.
    UIntT x = f5;
    while (x<n) x *= 2;
    for (;;)
    {
      if (x<n)
        x*=3;
      else if (x>n)
      {
        if (x<bestfac) bestfac=x;
        if (x&1) break;
        x>>=1;
      }
      else
        return n;
    }
  }
  return bestfac;
}
/* returns the smallest composite of 2, 3, 5 which is >= n */
static POCKETFFT_NOINLINE size_t good_size_real(size_t n)
{
  return good_size_real_typed(n);
}
/* returns the smallest composite of 2, 3, 5 which is >= n
   and a multiple of required_factor.
   Throws std::runtime_error if required_factor is 0 or if the result does
   not fit into size_t. */
static POCKETFFT_NOINLINE size_t good_size_real(size_t n,
  size_t required_factor)
{
  if (required_factor<1)
    throw std::runtime_error("required factor must not be 0");
  // Ceiling division written so that it cannot wrap around, unlike
  // (n+required_factor-1)/required_factor.
  const size_t blocks = n/required_factor + ((n%required_factor)!=0 ? 1 : 0);
  const size_t good = good_size_real(blocks);
  // Refuse to return a silently wrapped product.
  if (good > std::numeric_limits<size_t>::max()/required_factor)
    throw std::runtime_error("FFT size is too large.");
  return good * required_factor;
}
526
+
527
/* inner workings of prev_good_size_cmplx() */
/// Searches for the largest product of powers of 2, 3, 5, 7 and 11 that does
/// not exceed n, using arithmetic in UIntT. Values up to 12 are returned
/// unchanged. Throws std::runtime_error when n is too large for the search
/// to run without overflow, even after retrying in 64 bits.
template<typename UIntT>
static POCKETFFT_NOINLINE UIntT prev_good_size_cmplx_typed(UIntT n)
{
  static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
    "type must be unsigned integer");
  if (n<=12)
    return n;

  const bool may_overflow = n > std::numeric_limits<UIntT>::max()/11;
  if (may_overflow)
  {
    // The search below multiplies candidates by up to 11, which could wrap.
    if (sizeof(UIntT)<sizeof(std::uint64_t))
    {
      // Retry in 64 bits and accept the answer if it fits into UIntT.
      const std::uint64_t wide = prev_good_size_cmplx_typed<std::uint64_t>(n);
      if (wide<=std::numeric_limits<UIntT>::max())
        return static_cast<UIntT>(wide);
    }
    // Nothing wider to fall back on.
    throw std::runtime_error("FFT size is too large.");
  }

  UIntT bestfound = 1;
  for (UIntT p11 = 1; p11 <= n; p11 *= 11)
  {
    for (UIntT p7 = p11; p7 <= n; p7 *= 7)
    {
      for (UIntT p5 = p7; p5 <= n; p5 *= 5)
      {
        // Largest power-of-two multiple of p5 that still fits.
        UIntT cand = p5;
        while (cand*2 <= n)
          cand *= 2;
        if (bestfound < cand)
          bestfound = cand;
        // Trade factors of 2 for factors of 3 until cand becomes odd.
        for (;;)
        {
          if (cand*3 <= n)
            cand *= 3;
          else if ((cand % 2) == 0)
            cand /= 2;
          else
            break;

          if (bestfound < cand)
            bestfound = cand;
        }
      }
    }
  }
  return bestfound;
}
/* returns the largest composite of 2, 3, 5, 7 and 11 which is <= n */
static POCKETFFT_NOINLINE size_t prev_good_size_cmplx(size_t n)
{
  const size_t result = prev_good_size_cmplx_typed(n);
  return result;
}
572
+
573
/* inner workings of prev_good_size_real() */
/// Searches for the largest product of powers of 2, 3 and 5 that does not
/// exceed n, using arithmetic in UIntT. Values up to 6 are returned
/// unchanged. Throws std::runtime_error when n is too large for the search
/// to run without overflow, even after retrying in 64 bits.
template<typename UIntT>
static POCKETFFT_NOINLINE UIntT prev_good_size_real_typed(UIntT n)
{
  static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
    "type must be unsigned integer");
  if (n<=6)
    return n;

  const bool may_overflow = n > std::numeric_limits<UIntT>::max()/5;
  if (may_overflow)
  {
    // The search below multiplies candidates by up to 5, which could wrap.
    if (sizeof(UIntT)<sizeof(std::uint64_t))
    {
      // Retry in 64 bits and accept the answer if it fits into UIntT.
      const std::uint64_t wide = prev_good_size_real_typed<std::uint64_t>(n);
      if (wide<=std::numeric_limits<UIntT>::max())
        return static_cast<UIntT>(wide);
    }
    // Nothing wider to fall back on.
    throw std::runtime_error("FFT size is too large.");
  }

  UIntT bestfound = 1;
  for (UIntT p5 = 1; p5 <= n; p5 *= 5)
  {
    // Largest power-of-two multiple of p5 that still fits.
    UIntT cand = p5;
    while (cand*2 <= n)
      cand *= 2;
    if (bestfound < cand)
      bestfound = cand;
    // Trade factors of 2 for factors of 3 until cand becomes odd.
    for (;;)
    {
      if (cand*3 <= n)
        cand *= 3;
      else if ((cand % 2) == 0)
        cand /= 2;
      else
        break;

      if (bestfound < cand)
        bestfound = cand;
    }
  }
  return bestfound;
}
/* returns the largest composite of 2, 3, 5 which is <= n */
static POCKETFFT_NOINLINE size_t prev_good_size_real(size_t n)
{
  const size_t result = prev_good_size_real_typed(n);
  return result;
}
616
+
617
+ static size_t prod(const shape_t &shape)
618
+ {
619
+ size_t res=1;
620
+ for (auto sz: shape)
621
+ res*=sz;
622
+ return res;
623
+ }
624
+
625
+ static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
626
+ const stride_t &stride_in, const stride_t &stride_out, bool inplace)
627
+ {
628
+ auto ndim = shape.size();
629
+ if (ndim<1) throw std::runtime_error("ndim must be >= 1");
630
+ if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
631
+ throw std::runtime_error("stride dimension mismatch");
632
+ if (inplace && (stride_in!=stride_out))
633
+ throw std::runtime_error("stride mismatch");
634
+ }
635
+
636
+ static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
637
+ const stride_t &stride_in, const stride_t &stride_out, bool inplace,
638
+ const shape_t &axes)
639
+ {
640
+ sanity_check(shape, stride_in, stride_out, inplace);
641
+ auto ndim = shape.size();
642
+ shape_t tmp(ndim,0);
643
+ for (auto ax : axes)
644
+ {
645
+ if (ax>=ndim) throw std::invalid_argument("bad axis number");
646
+ if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly");
647
+ }
648
+ }
649
+
650
+ static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
651
+ const stride_t &stride_in, const stride_t &stride_out, bool inplace,
652
+ size_t axis)
653
+ {
654
+ sanity_check(shape, stride_in, stride_out, inplace);
655
+ if (axis>=shape.size()) throw std::invalid_argument("bad axis number");
656
+ }
657
+
658
+ #ifdef POCKETFFT_NO_MULTITHREADING
659
+ static size_t thread_count (size_t /*nthreads*/, const shape_t &/*shape*/,
660
+ size_t /*axis*/, size_t /*vlen*/)
661
+ { return 1; }
662
+ #else
663
+ static size_t thread_count (size_t nthreads, const shape_t &shape,
664
+ size_t axis, size_t vlen)
665
+ {
666
+ if (nthreads==1) return 1;
667
+ size_t size = prod(shape);
668
+ size_t parallel = size / (shape[axis] * vlen);
669
+ if (shape[axis] < 1000)
670
+ parallel /= 4;
671
+ size_t max_threads = nthreads == 0 ?
672
+ std::thread::hardware_concurrency() : nthreads;
673
+ return std::max(size_t(1), std::min(parallel, max_threads));
674
+ }
675
+ #endif
676
+ };
677
+
678
+ namespace threading {
679
+
680
+ #ifdef POCKETFFT_NO_MULTITHREADING
681
+
682
/// Single-threaded build: the calling thread always has index 0.
constexpr inline size_t thread_id()
{
  return 0;
}
/// Single-threaded build: exactly one thread takes part.
constexpr inline size_t num_threads()
{
  return 1;
}

/// Single-threaded build: runs f once on the calling thread and ignores the
/// requested thread count.
template <typename Func>
void thread_map(size_t /* nthreads */, Func f)
{
  f();
}
688
+
689
+ #else
690
+
691
/// Per-thread slot holding the index of the current thread; 0 until written.
inline size_t &thread_id()
{
  static thread_local size_t current_index = 0;
  return current_index;
}
/// Per-thread slot holding the number of participating threads; 1 until
/// written.
inline size_t &num_threads()
{
  static thread_local size_t current_count = 1;
  return current_count;
}
/// Hardware concurrency as reported by the standard library, raised to 1 if
/// it reports 0.
static const size_t max_threads = std::max(1u, std::thread::hardware_concurrency());
702
+
703
/// One-shot countdown: wait() blocks until count_down() has been called as
/// many times as the count given at construction.
class latch
{
  std::atomic<size_t> num_left_;
  std::mutex mut_;
  std::condition_variable completed_;
  using lock_t = std::unique_lock<std::mutex>;

  public:
    latch(size_t n): num_left_(n) {}

    /// Decrements the counter and wakes all waiters when it reaches zero.
    void count_down()
    {
      lock_t guard(mut_);
      const size_t remaining = --num_left_;
      if (remaining == 0)
        completed_.notify_all();
    }

    /// Blocks until the counter has reached zero.
    void wait()
    {
      lock_t guard(mut_);
      while (!is_ready())
        completed_.wait(guard);
    }
    /// True once the counter has reached zero.
    bool is_ready()
    {
      return num_left_ == 0;
    }
};
728
+
729
/// Mutex-protected FIFO with an atomic element counter that lets callers
/// test for emptiness without taking the lock.
template <typename T> class concurrent_queue
{
  std::queue<T> q_;
  std::mutex mut_;
  // Explicitly zeroed: a default-initialised std::atomic holds an
  // indeterminate value before C++20, which would make empty()/try_pop()
  // unreliable for any queue that is not a zero-initialised static.
  std::atomic<size_t> size_{0};
  using lock_t = std::lock_guard<std::mutex>;

  public:

    /// Appends val to the back of the queue.
    void push(T val)
    {
      lock_t lock(mut_);
      ++size_;
      q_.push(std::move(val));
    }

    /// Moves the front element into val and returns true, or returns false
    /// and leaves val untouched when the queue is empty.
    bool try_pop(T &val)
    {
      // Cheap early exit that avoids the lock for an empty queue.
      if (size_ == 0) return false;
      lock_t lock(mut_);
      // Queue might have been emptied while we acquired the lock
      if (q_.empty()) return false;

      val = std::move(q_.front());
      --size_;
      q_.pop();
      return true;
    }

    /// True when the counter reads zero; taken without the lock.
    bool empty() const { return size_==0; }
};
760
+
761
+ // C++ allocator with support for over-aligned types
762
+ template <typename T> struct aligned_allocator
763
+ {
764
+ using value_type = T;
765
+ template <class U>
766
+ aligned_allocator(const aligned_allocator<U>&) {}
767
+ aligned_allocator() = default;
768
+
769
+ T *allocate(size_t n)
770
+ {
771
+ void* mem = aligned_alloc(alignof(T), n*sizeof(T));
772
+ return static_cast<T*>(mem);
773
+ }
774
+
775
+ void deallocate(T *p, size_t /*n*/)
776
+ { aligned_dealloc(p); }
777
+ };
778
+
779
+ class thread_pool
780
+ {
781
+ // A reasonable guess, probably close enough for most hardware
782
+ static constexpr size_t cache_line_size = 64;
783
+ struct alignas(cache_line_size) worker
784
+ {
785
+ std::thread thread;
786
+ std::condition_variable work_ready;
787
+ std::mutex mut;
788
+ std::atomic_flag busy_flag = ATOMIC_FLAG_INIT;
789
+ std::function<void()> work;
790
+
791
+ void worker_main(
792
+ std::atomic<bool> &shutdown_flag,
793
+ std::atomic<size_t> &unscheduled_tasks,
794
+ concurrent_queue<std::function<void()>> &overflow_work)
795
+ {
796
+ using lock_t = std::unique_lock<std::mutex>;
797
+ bool expect_work = true;
798
+ while (!shutdown_flag || expect_work)
799
+ {
800
+ std::function<void()> local_work;
801
+ if (expect_work || unscheduled_tasks == 0)
802
+ {
803
+ lock_t lock(mut);
804
+ // Wait until there is work to be executed
805
+ work_ready.wait(lock, [&]{ return (work || shutdown_flag); });
806
+ local_work.swap(work);
807
+ expect_work = false;
808
+ }
809
+
810
+ bool marked_busy = false;
811
+ if (local_work)
812
+ {
813
+ marked_busy = true;
814
+ local_work();
815
+ }
816
+
817
+ if (!overflow_work.empty())
818
+ {
819
+ if (!marked_busy && busy_flag.test_and_set())
820
+ {
821
+ expect_work = true;
822
+ continue;
823
+ }
824
+ marked_busy = true;
825
+
826
+ while (overflow_work.try_pop(local_work))
827
+ {
828
+ --unscheduled_tasks;
829
+ local_work();
830
+ }
831
+ }
832
+
833
+ if (marked_busy) busy_flag.clear();
834
+ }
835
+ }
836
+ };
837
+
838
+ concurrent_queue<std::function<void()>> overflow_work_;
839
+ std::mutex mut_;
840
+ std::vector<worker, aligned_allocator<worker>> workers_;
841
+ std::atomic<bool> shutdown_;
842
+ std::atomic<size_t> unscheduled_tasks_;
843
+ using lock_t = std::lock_guard<std::mutex>;
844
+
845
+ void create_threads()
846
+ {
847
+ lock_t lock(mut_);
848
+ size_t nthreads=workers_.size();
849
+ for (size_t i=0; i<nthreads; ++i)
850
+ {
851
+ try
852
+ {
853
+ auto *worker = &workers_[i];
854
+ worker->busy_flag.clear();
855
+ worker->work = nullptr;
856
+ worker->thread = std::thread([worker, this]
857
+ {
858
+ worker->worker_main(shutdown_, unscheduled_tasks_, overflow_work_);
859
+ });
860
+ }
861
+ catch (...)
862
+ {
863
+ shutdown_locked();
864
+ throw;
865
+ }
866
+ }
867
+ }
868
+
869
+ void shutdown_locked()
870
+ {
871
+ shutdown_ = true;
872
+ for (auto &worker : workers_)
873
+ worker.work_ready.notify_all();
874
+
875
+ for (auto &worker : workers_)
876
+ if (worker.thread.joinable())
877
+ worker.thread.join();
878
+ }
879
+
880
+ public:
881
+ explicit thread_pool(size_t nthreads):
882
+ workers_(nthreads)
883
+ { create_threads(); }
884
+
885
+ thread_pool(): thread_pool(max_threads) {}
886
+
887
+ ~thread_pool() { shutdown(); }
888
+
889
+ void submit(std::function<void()> work)
890
+ {
891
+ lock_t lock(mut_);
892
+ if (shutdown_)
893
+ throw std::runtime_error("Work item submitted after shutdown");
894
+
895
+ ++unscheduled_tasks_;
896
+
897
+ // First check for any idle workers and wake those
898
+ for (auto &worker : workers_)
899
+ if (!worker.busy_flag.test_and_set())
900
+ {
901
+ --unscheduled_tasks_;
902
+ {
903
+ lock_t lock(worker.mut);
904
+ worker.work = std::move(work);
905
+ }
906
+ worker.work_ready.notify_one();
907
+ return;
908
+ }
909
+
910
+ // If no workers were idle, push onto the overflow queue for later
911
+ overflow_work_.push(std::move(work));
912
+ }
913
+
914
+ void shutdown()
915
+ {
916
+ lock_t lock(mut_);
917
+ shutdown_locked();
918
+ }
919
+
920
+ void restart()
921
+ {
922
+ shutdown_ = false;
923
+ create_threads();
924
+ }
925
+ };
926
+
927
+ inline thread_pool & get_pool()
928
+ {
929
+ static thread_pool pool;
930
+ #ifdef POCKETFFT_PTHREADS
931
+ static std::once_flag f;
932
+ std::call_once(f,
933
+ []{
934
+ pthread_atfork(
935
+ +[]{ get_pool().shutdown(); }, // prepare
936
+ +[]{ get_pool().restart(); }, // parent
937
+ +[]{ get_pool().restart(); } // child
938
+ );
939
+ });
940
+ #endif
941
+
942
+ return pool;
943
+ }
944
+
945
+ /** Map a function f over nthreads */
946
+ template <typename Func>
947
+ void thread_map(size_t nthreads, Func f)
948
+ {
949
+ if (nthreads == 0)
950
+ nthreads = max_threads;
951
+
952
+ if (nthreads == 1)
953
+ { f(); return; }
954
+
955
+ auto & pool = get_pool();
956
+ latch counter(nthreads);
957
+ std::exception_ptr ex;
958
+ std::mutex ex_mut;
959
+ for (size_t i=0; i<nthreads; ++i)
960
+ {
961
+ pool.submit(
962
+ [&f, &counter, &ex, &ex_mut, i, nthreads] {
963
+ thread_id() = i;
964
+ num_threads() = nthreads;
965
+ try { f(); }
966
+ catch (...)
967
+ {
968
+ std::lock_guard<std::mutex> lock(ex_mut);
969
+ ex = std::current_exception();
970
+ }
971
+ counter.count_down();
972
+ });
973
+ }
974
+ counter.wait();
975
+ if (ex)
976
+ std::rethrow_exception(ex);
977
+ }
978
+
979
+ #endif
980
+
981
+ }
982
+
983
+ //
984
+ // complex FFTPACK transforms
985
+ //
986
+
987
+ template<typename T0> class cfftp
988
+ {
989
+ private:
990
+ struct fctdata
991
+ {
992
+ size_t fct;
993
+ cmplx<T0> *tw, *tws;
994
+ };
995
+
996
+ size_t length;
997
+ arr<cmplx<T0>> mem;
998
+ std::vector<fctdata> fact;
999
+
1000
+ void add_factor(size_t factor)
1001
+ { fact.push_back({factor, nullptr, nullptr}); }
1002
+
1003
+ template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
1004
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1005
+ const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1006
+ {
1007
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1008
+ { return ch[a+ido*(b+l1*c)]; };
1009
+ auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1010
+ { return cc[a+ido*(b+2*c)]; };
1011
+ auto WA = [wa, ido](size_t x, size_t i)
1012
+ { return wa[i-1+x*(ido-1)]; };
1013
+
1014
+ if (ido==1)
1015
+ for (size_t k=0; k<l1; ++k)
1016
+ {
1017
+ CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
1018
+ CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
1019
+ }
1020
+ else
1021
+ for (size_t k=0; k<l1; ++k)
1022
+ {
1023
+ CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
1024
+ CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
1025
+ for (size_t i=1; i<ido; ++i)
1026
+ {
1027
+ CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
1028
+ special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1));
1029
+ }
1030
+ }
1031
+ }
1032
+
1033
+ #define POCKETFFT_PREP3(idx) \
1034
+ T t0 = CC(idx,0,k), t1, t2; \
1035
+ PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
1036
+ CH(idx,k,0)=t0+t1;
1037
+ #define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
1038
+ { \
1039
+ T ca=t0+t1*twr; \
1040
+ T cb{-t2.i*twi, t2.r*twi}; \
1041
+ PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
1042
+ }
1043
+ #define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
1044
+ { \
1045
+ T ca=t0+t1*twr; \
1046
+ T cb{-t2.i*twi, t2.r*twi}; \
1047
+ special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
1048
+ special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
1049
+ }
1050
+ template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
1051
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1052
+ const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1053
+ {
1054
+ constexpr T0 tw1r=-0.5,
1055
+ tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
1056
+
1057
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1058
+ { return ch[a+ido*(b+l1*c)]; };
1059
+ auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1060
+ { return cc[a+ido*(b+3*c)]; };
1061
+ auto WA = [wa, ido](size_t x, size_t i)
1062
+ { return wa[i-1+x*(ido-1)]; };
1063
+
1064
+ if (ido==1)
1065
+ for (size_t k=0; k<l1; ++k)
1066
+ {
1067
+ POCKETFFT_PREP3(0)
1068
+ POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
1069
+ }
1070
+ else
1071
+ for (size_t k=0; k<l1; ++k)
1072
+ {
1073
+ {
1074
+ POCKETFFT_PREP3(0)
1075
+ POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
1076
+ }
1077
+ for (size_t i=1; i<ido; ++i)
1078
+ {
1079
+ POCKETFFT_PREP3(i)
1080
+ POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
1081
+ }
1082
+ }
1083
+ }
1084
+
1085
+ #undef POCKETFFT_PARTSTEP3b
1086
+ #undef POCKETFFT_PARTSTEP3a
1087
+ #undef POCKETFFT_PREP3
1088
+
1089
+ template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
1090
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1091
+ const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1092
+ {
1093
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1094
+ { return ch[a+ido*(b+l1*c)]; };
1095
+ auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1096
+ { return cc[a+ido*(b+4*c)]; };
1097
+ auto WA = [wa, ido](size_t x, size_t i)
1098
+ { return wa[i-1+x*(ido-1)]; };
1099
+
1100
+ if (ido==1)
1101
+ for (size_t k=0; k<l1; ++k)
1102
+ {
1103
+ T t1, t2, t3, t4;
1104
+ PM(t2,t1,CC(0,0,k),CC(0,2,k));
1105
+ PM(t3,t4,CC(0,1,k),CC(0,3,k));
1106
+ ROTX90<fwd>(t4);
1107
+ PM(CH(0,k,0),CH(0,k,2),t2,t3);
1108
+ PM(CH(0,k,1),CH(0,k,3),t1,t4);
1109
+ }
1110
+ else
1111
+ for (size_t k=0; k<l1; ++k)
1112
+ {
1113
+ {
1114
+ T t1, t2, t3, t4;
1115
+ PM(t2,t1,CC(0,0,k),CC(0,2,k));
1116
+ PM(t3,t4,CC(0,1,k),CC(0,3,k));
1117
+ ROTX90<fwd>(t4);
1118
+ PM(CH(0,k,0),CH(0,k,2),t2,t3);
1119
+ PM(CH(0,k,1),CH(0,k,3),t1,t4);
1120
+ }
1121
+ for (size_t i=1; i<ido; ++i)
1122
+ {
1123
+ T t1, t2, t3, t4;
1124
+ T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
1125
+ PM(t2,t1,cc0,cc2);
1126
+ PM(t3,t4,cc1,cc3);
1127
+ ROTX90<fwd>(t4);
1128
+ CH(i,k,0) = t2+t3;
1129
+ special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1));
1130
+ special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2));
1131
+ special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3));
1132
+ }
1133
+ }
1134
+ }
1135
+
1136
+ #define POCKETFFT_PREP5(idx) \
1137
+ T t0 = CC(idx,0,k), t1, t2, t3, t4; \
1138
+ PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
1139
+ PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
1140
+ CH(idx,k,0).r=t0.r+t1.r+t2.r; \
1141
+ CH(idx,k,0).i=t0.i+t1.i+t2.i;
1142
+
1143
+ #define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
1144
+ { \
1145
+ T ca,cb; \
1146
+ ca.r=t0.r+twar*t1.r+twbr*t2.r; \
1147
+ ca.i=t0.i+twar*t1.i+twbr*t2.i; \
1148
+ cb.i=twai*t4.r twbi*t3.r; \
1149
+ cb.r=-(twai*t4.i twbi*t3.i); \
1150
+ PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \
1151
+ }
1152
+
1153
+ #define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
1154
+ { \
1155
+ T ca,cb,da,db; \
1156
+ ca.r=t0.r+twar*t1.r+twbr*t2.r; \
1157
+ ca.i=t0.i+twar*t1.i+twbr*t2.i; \
1158
+ cb.i=twai*t4.r twbi*t3.r; \
1159
+ cb.r=-(twai*t4.i twbi*t3.i); \
1160
+ special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
1161
+ special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
1162
+ }
1163
+ template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
1164
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1165
+ const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1166
+ {
1167
+ constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
1168
+ tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
1169
+ tw2r= T0(-0.8090169943749474241022934171828191L),
1170
+ tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
1171
+
1172
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1173
+ { return ch[a+ido*(b+l1*c)]; };
1174
+ auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1175
+ { return cc[a+ido*(b+5*c)]; };
1176
+ auto WA = [wa, ido](size_t x, size_t i)
1177
+ { return wa[i-1+x*(ido-1)]; };
1178
+
1179
+ if (ido==1)
1180
+ for (size_t k=0; k<l1; ++k)
1181
+ {
1182
+ POCKETFFT_PREP5(0)
1183
+ POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
1184
+ POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
1185
+ }
1186
+ else
1187
+ for (size_t k=0; k<l1; ++k)
1188
+ {
1189
+ {
1190
+ POCKETFFT_PREP5(0)
1191
+ POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
1192
+ POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
1193
+ }
1194
+ for (size_t i=1; i<ido; ++i)
1195
+ {
1196
+ POCKETFFT_PREP5(i)
1197
+ POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
1198
+ POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
1199
+ }
1200
+ }
1201
+ }
1202
+
1203
+ #undef POCKETFFT_PARTSTEP5b
1204
+ #undef POCKETFFT_PARTSTEP5a
1205
+ #undef POCKETFFT_PREP5
1206
+
1207
+ #define POCKETFFT_PREP7(idx) \
1208
+ T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
1209
+ PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
1210
+ PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
1211
+ PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
1212
+ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
1213
+ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
1214
+
1215
+ #define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
1216
+ { \
1217
+ T ca,cb; \
1218
+ ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
1219
+ ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
1220
+ cb.i=y1*t7.r y2*t6.r y3*t5.r; \
1221
+ cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
1222
+ PM(out1,out2,ca,cb); \
1223
+ }
1224
+ #define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
1225
+ POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
1226
+ #define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
1227
+ { \
1228
+ T da,db; \
1229
+ POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
1230
+ special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
1231
+ special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
1232
+ }
1233
+
1234
+ template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
1235
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1236
+ const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1237
+ {
1238
+ constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
1239
+ tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
1240
+ tw2r= T0(-0.2225209339563144042889025644967948L),
1241
+ tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
1242
+ tw3r= T0(-0.9009688679024191262361023195074451L),
1243
+ tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
1244
+
1245
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1246
+ { return ch[a+ido*(b+l1*c)]; };
1247
+ auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1248
+ { return cc[a+ido*(b+7*c)]; };
1249
+ auto WA = [wa, ido](size_t x, size_t i)
1250
+ { return wa[i-1+x*(ido-1)]; };
1251
+
1252
+ if (ido==1)
1253
+ for (size_t k=0; k<l1; ++k)
1254
+ {
1255
+ POCKETFFT_PREP7(0)
1256
+ POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1257
+ POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1258
+ POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1259
+ }
1260
+ else
1261
+ for (size_t k=0; k<l1; ++k)
1262
+ {
1263
+ {
1264
+ POCKETFFT_PREP7(0)
1265
+ POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1266
+ POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1267
+ POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1268
+ }
1269
+ for (size_t i=1; i<ido; ++i)
1270
+ {
1271
+ POCKETFFT_PREP7(i)
1272
+ POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1273
+ POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1274
+ POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1275
+ }
1276
+ }
1277
+ }
1278
+
1279
+ #undef POCKETFFT_PARTSTEP7
1280
+ #undef POCKETFFT_PARTSTEP7a0
1281
+ #undef POCKETFFT_PARTSTEP7a
1282
+ #undef POCKETFFT_PREP7
1283
+
1284
+ template <bool fwd, typename T> void ROTX45(T &a) const
1285
+ {
1286
+ constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1287
+ if (fwd)
1288
+ { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
1289
+ else
1290
+ { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
1291
+ }
1292
+ template <bool fwd, typename T> void ROTX135(T &a) const
1293
+ {
1294
+ constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1295
+ if (fwd)
1296
+ { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
1297
+ else
1298
+ { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
1299
+ }
1300
+
1301
// Radix-8 pass of the complex transform; `fwd` selects the direction of the
// ROTX* rotations and of the twiddle multiplication.
// Input `cc` is addressed as cc[a+ido*(b+8*c)], output `ch` as
// ch[a+ido*(b+l1*c)], and the twiddle table `wa` as wa[i-1+x*(ido-1)]
// (it is therefore only consulted for i>=1).
+ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
1302
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1303
+ const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1304
+ {
1305
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1306
+ { return ch[a+ido*(b+l1*c)]; };
1307
+ auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1308
+ { return cc[a+ido*(b+8*c)]; };
1309
+ auto WA = [wa, ido](size_t x, size_t i)
1310
+ { return wa[i-1+x*(ido-1)]; };
1311
+
1312
// ido==1: only index 0 exists, and no twiddle factor is applied to it.
+ if (ido==1)
1313
+ for (size_t k=0; k<l1; ++k)
1314
+ {
1315
+ T a0, a1, a2, a3, a4, a5, a6, a7;
1316
// odd-numbered inputs (1,5) and (3,7)
+ PM(a1,a5,CC(0,1,k),CC(0,5,k));
1317
+ PM(a3,a7,CC(0,3,k),CC(0,7,k));
1318
+ PMINPLACE(a1,a3);
1319
+ ROTX90<fwd>(a3);
1320
+
1321
+ ROTX90<fwd>(a7);
1322
+ PMINPLACE(a5,a7);
1323
+ ROTX45<fwd>(a5);
1324
+ ROTX135<fwd>(a7);
1325
+
1326
// even-numbered inputs (0,4) and (2,6), then combination into the 8 outputs
+ PM(a0,a4,CC(0,0,k),CC(0,4,k));
1327
+ PM(a2,a6,CC(0,2,k),CC(0,6,k));
1328
+ PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
1329
+ PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
1330
+ ROTX90<fwd>(a6);
1331
+ PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
1332
+ PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
1333
+ }
1334
+ else
1335
+ for (size_t k=0; k<l1; ++k)
1336
+ {
1337
// i==0: same statements as the ido==1 branch (no twiddle multiplication)
+ {
1338
+ T a0, a1, a2, a3, a4, a5, a6, a7;
1339
+ PM(a1,a5,CC(0,1,k),CC(0,5,k));
1340
+ PM(a3,a7,CC(0,3,k),CC(0,7,k));
1341
+ PMINPLACE(a1,a3);
1342
+ ROTX90<fwd>(a3);
1343
+
1344
+ ROTX90<fwd>(a7);
1345
+ PMINPLACE(a5,a7);
1346
+ ROTX45<fwd>(a5);
1347
+ ROTX135<fwd>(a7);
1348
+
1349
+ PM(a0,a4,CC(0,0,k),CC(0,4,k));
1350
+ PM(a2,a6,CC(0,2,k),CC(0,6,k));
1351
+ PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
1352
+ PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
1353
+ ROTX90<fwd>(a6);
1354
+ PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
1355
+ PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
1356
+ }
1357
// i>=1: output slot 0 is stored directly; slots 1..7 go through special_mul
// with twiddle WA(slot-1,i).
+ for (size_t i=1; i<ido; ++i)
1358
+ {
1359
+ T a0, a1, a2, a3, a4, a5, a6, a7;
1360
+ PM(a1,a5,CC(i,1,k),CC(i,5,k));
1361
+ PM(a3,a7,CC(i,3,k),CC(i,7,k));
1362
+ ROTX90<fwd>(a7);
1363
+ PMINPLACE(a1,a3);
1364
+ ROTX90<fwd>(a3);
1365
+ PMINPLACE(a5,a7);
1366
+ ROTX45<fwd>(a5);
1367
+ ROTX135<fwd>(a7);
1368
+ PM(a0,a4,CC(i,0,k),CC(i,4,k));
1369
+ PM(a2,a6,CC(i,2,k),CC(i,6,k));
1370
+ PMINPLACE(a0,a2);
1371
+ CH(i,k,0) = a0+a1;
1372
+ special_mul<fwd>(a0-a1,WA(3,i),CH(i,k,4));
1373
+ special_mul<fwd>(a2+a3,WA(1,i),CH(i,k,2));
1374
+ special_mul<fwd>(a2-a3,WA(5,i),CH(i,k,6));
1375
+ ROTX90<fwd>(a6);
1376
+ PMINPLACE(a4,a6);
1377
+ special_mul<fwd>(a4+a5,WA(0,i),CH(i,k,1));
1378
+ special_mul<fwd>(a4-a5,WA(4,i),CH(i,k,5));
1379
+ special_mul<fwd>(a6+a7,WA(2,i),CH(i,k,3));
1380
+ special_mul<fwd>(a6-a7,WA(6,i),CH(i,k,7));
1381
+ }
1382
+ }
1383
+ }
1384
+
1385
+
1386
// Helper macros for pass11. They expand inside the loop body and rely on the
// names CC, CH, WA, k, i, T and fwd being in scope at the expansion site.
//
// POCKETFFT_PREP11(idx): declares t1..t11, loads input 0 into t1, fills the
// pairs (t2,t11), (t3,t10), (t4,t9), (t5,t8), (t6,t7) via PM from the input
// pairs (1,10), (2,9), (3,8), (4,7), (5,6), and stores t1+...+t6 in output 0.
+ #define POCKETFFT_PREP11(idx) \
1387
+ T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
1388
+ PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
1389
+ PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
1390
+ PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
1391
+ PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
1392
+ PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
1393
+ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
1394
+ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
1395
+
1396
// POCKETFFT_PARTSTEP11a0: builds ca from t1..t6 weighted by x1..x5 and cb from
// t7..t11 weighted by y1..y5, then writes the PM combination to out1/out2.
// The y arguments must carry their own leading sign (e.g. `+tw1i`, `-tw3i`),
// because they are pasted back to back without an operator in between.
// u1/u2 are unused here; they are consumed by the two wrappers below.
+ #define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
1397
+ { \
1398
+ T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \
1399
+ cb; \
1400
+ cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
1401
+ cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
1402
+ PM(out1,out2,ca,cb); \
1403
+ }
1404
// POCKETFFT_PARTSTEP11a: variant for index 0, writes straight to CH(0,k,u1)
// and CH(0,k,u2) without a twiddle multiplication.
+ #define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
1405
+ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
1406
// POCKETFFT_PARTSTEP11: variant for i>=1, computes into temporaries da/db and
// stores them through special_mul with twiddles WA(u1-1,i) and WA(u2-1,i).
+ #define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
1407
+ { \
1408
+ T da,db; \
1409
+ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
1410
+ special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
1411
+ special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
1412
+ }
1413
+
1414
// Radix-11 pass of the complex transform, built from the POCKETFFT_PREP11 /
// POCKETFFT_PARTSTEP11* macros (which require the local names CC, CH, WA, k, i).
// Input `cc` is addressed as cc[a+ido*(b+11*c)], output `ch` as
// ch[a+ido*(b+l1*c)], twiddles as wa[i-1+x*(ido-1)].
+ template<bool fwd, typename T> void pass11 (size_t ido, size_t l1,
1415
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1416
+ const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1417
+ {
// twNr/twNi: the values are cos/sin of 2*pi*N/11 (N=1..5); the imaginary
// parts are negated for the forward direction.
1418
+ constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L),
1419
+ tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L),
1420
+ tw2r= T0(0.4154150130018864255292741492296232L),
1421
+ tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L),
1422
+ tw3r= T0(-0.1423148382732851404437926686163697L),
1423
+ tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L),
1424
+ tw4r= T0(-0.6548607339452850640569250724662936L),
1425
+ tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L),
1426
+ tw5r= T0(-0.9594929736144973898903680570663277L),
1427
+ tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L);
1428
+
1429
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1430
+ { return ch[a+ido*(b+l1*c)]; };
1431
+ auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1432
+ { return cc[a+ido*(b+11*c)]; };
1433
+ auto WA = [wa, ido](size_t x, size_t i)
1434
+ { return wa[i-1+x*(ido-1)]; };
1435
+
1436
// ido==1: only index 0 exists; the `a` macro variant applies no twiddles.
+ if (ido==1)
1437
+ for (size_t k=0; k<l1; ++k)
1438
+ {
1439
+ POCKETFFT_PREP11(0)
1440
+ POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1441
+ POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1442
+ POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1443
+ POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1444
+ POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1445
+ }
1446
+ else
1447
+ for (size_t k=0; k<l1; ++k)
1448
+ {
1449
// i==0 is handled separately, in its own scope so t1..t11 can be redeclared
+ {
1450
+ POCKETFFT_PREP11(0)
1451
+ POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1452
+ POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1453
+ POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1454
+ POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1455
+ POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1456
+ }
1457
// i>=1: same butterflies, but every output except slot 0 is twiddled
+ for (size_t i=1; i<ido; ++i)
1458
+ {
1459
+ POCKETFFT_PREP11(i)
1460
+ POCKETFFT_PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1461
+ POCKETFFT_PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1462
+ POCKETFFT_PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1463
+ POCKETFFT_PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1464
+ POCKETFFT_PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1465
+ }
1466
+ }
1467
+ }
1468
+
1469
+ #undef POCKETFFT_PARTSTEP11
1470
+ #undef POCKETFFT_PARTSTEP11a0
1471
+ #undef POCKETFFT_PARTSTEP11a
1472
+ #undef POCKETFFT_PREP11
1473
+
1474
+ template<bool fwd, typename T> void passg (size_t ido, size_t ip,
1475
+ size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1476
+ const cmplx<T0> * POCKETFFT_RESTRICT wa,
1477
+ const cmplx<T0> * POCKETFFT_RESTRICT csarr) const
1478
+ {
1479
+ const size_t cdim=ip;
1480
+ size_t ipph = (ip+1)/2;
1481
+ size_t idl1 = ido*l1;
1482
+
1483
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1484
+ { return ch[a+ido*(b+l1*c)]; };
1485
+ auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
1486
+ { return cc[a+ido*(b+cdim*c)]; };
1487
+ auto CX = [cc, ido, l1](size_t a, size_t b, size_t c) -> T&
1488
+ { return cc[a+ido*(b+l1*c)]; };
1489
+ auto CX2 = [cc, idl1](size_t a, size_t b) -> T&
1490
+ { return cc[a+idl1*b]; };
1491
+ auto CH2 = [ch, idl1](size_t a, size_t b) -> const T&
1492
+ { return ch[a+idl1*b]; };
1493
+
1494
+ arr<cmplx<T0>> wal(ip);
1495
+ wal[0] = cmplx<T0>(1., 0.);
1496
+ for (size_t i=1; i<ip; ++i)
1497
+ wal[i]=cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i);
1498
+
1499
+ for (size_t k=0; k<l1; ++k)
1500
+ for (size_t i=0; i<ido; ++i)
1501
+ CH(i,k,0) = CC(i,0,k);
1502
+ for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
1503
+ for (size_t k=0; k<l1; ++k)
1504
+ for (size_t i=0; i<ido; ++i)
1505
+ PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k));
1506
+ for (size_t k=0; k<l1; ++k)
1507
+ for (size_t i=0; i<ido; ++i)
1508
+ {
1509
+ T tmp = CH(i,k,0);
1510
+ for (size_t j=1; j<ipph; ++j)
1511
+ tmp+=CH(i,k,j);
1512
+ CX(i,k,0) = tmp;
1513
+ }
1514
+ for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
1515
+ {
1516
+ // j=0
1517
+ for (size_t ik=0; ik<idl1; ++ik)
1518
+ {
1519
+ CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
1520
+ CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
1521
+ CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
1522
+ CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
1523
+ }
1524
+
1525
+ size_t iwal=2*l;
1526
+ size_t j=3, jc=ip-3;
1527
+ for (; j<ipph-1; j+=2, jc-=2)
1528
+ {
1529
+ iwal+=l; if (iwal>ip) iwal-=ip;
1530
+ cmplx<T0> xwal=wal[iwal];
1531
+ iwal+=l; if (iwal>ip) iwal-=ip;
1532
+ cmplx<T0> xwal2=wal[iwal];
1533
+ for (size_t ik=0; ik<idl1; ++ik)
1534
+ {
1535
+ CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
1536
+ CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
1537
+ CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
1538
+ CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
1539
+ }
1540
+ }
1541
+ for (; j<ipph; ++j, --jc)
1542
+ {
1543
+ iwal+=l; if (iwal>ip) iwal-=ip;
1544
+ cmplx<T0> xwal=wal[iwal];
1545
+ for (size_t ik=0; ik<idl1; ++ik)
1546
+ {
1547
+ CX2(ik,l).r += CH2(ik,j).r*xwal.r;
1548
+ CX2(ik,l).i += CH2(ik,j).i*xwal.r;
1549
+ CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
1550
+ CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
1551
+ }
1552
+ }
1553
+ }
1554
+
1555
+ // shuffling and twiddling
1556
+ if (ido==1)
1557
+ for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
1558
+ for (size_t ik=0; ik<idl1; ++ik)
1559
+ {
1560
+ T t1=CX2(ik,j), t2=CX2(ik,jc);
1561
+ PM(CX2(ik,j),CX2(ik,jc),t1,t2);
1562
+ }
1563
+ else
1564
+ {
1565
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
1566
+ for (size_t k=0; k<l1; ++k)
1567
+ {
1568
+ T t1=CX(0,k,j), t2=CX(0,k,jc);
1569
+ PM(CX(0,k,j),CX(0,k,jc),t1,t2);
1570
+ for (size_t i=1; i<ido; ++i)
1571
+ {
1572
+ T x1, x2;
1573
+ PM(x1,x2,CX(i,k,j),CX(i,k,jc));
1574
+ size_t idij=(j-1)*(ido-1)+i-1;
1575
+ special_mul<fwd>(x1,wa[idij],CX(i,k,j));
1576
+ idij=(jc-1)*(ido-1)+i-1;
1577
+ special_mul<fwd>(x2,wa[idij],CX(i,k,jc));
1578
+ }
1579
+ }
1580
+ }
1581
+ }
1582
+
1583
+ template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
1584
+ {
1585
+ if (length==1) { c[0]*=fct; return; }
1586
+ size_t l1=1;
1587
+ arr<T> ch(length);
1588
+ T *p1=c, *p2=ch.data();
1589
+
1590
+ for(size_t k1=0; k1<fact.size(); k1++)
1591
+ {
1592
+ size_t ip=fact[k1].fct;
1593
+ size_t l2=ip*l1;
1594
+ size_t ido = length/l2;
1595
+ if (ip==4)
1596
+ pass4<fwd> (ido, l1, p1, p2, fact[k1].tw);
1597
+ else if(ip==8)
1598
+ pass8<fwd>(ido, l1, p1, p2, fact[k1].tw);
1599
+ else if(ip==2)
1600
+ pass2<fwd>(ido, l1, p1, p2, fact[k1].tw);
1601
+ else if(ip==3)
1602
+ pass3<fwd> (ido, l1, p1, p2, fact[k1].tw);
1603
+ else if(ip==5)
1604
+ pass5<fwd> (ido, l1, p1, p2, fact[k1].tw);
1605
+ else if(ip==7)
1606
+ pass7<fwd> (ido, l1, p1, p2, fact[k1].tw);
1607
+ else if(ip==11)
1608
+ pass11<fwd> (ido, l1, p1, p2, fact[k1].tw);
1609
+ else
1610
+ {
1611
+ passg<fwd>(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws);
1612
+ std::swap(p1,p2);
1613
+ }
1614
+ std::swap(p1,p2);
1615
+ l1=l2;
1616
+ }
1617
+ if (p1!=c)
1618
+ {
1619
+ if (fct!=1.)
1620
+ for (size_t i=0; i<length; ++i)
1621
+ c[i] = ch[i]*fct;
1622
+ else
1623
+ std::copy_n (p1, length, c);
1624
+ }
1625
+ else
1626
+ if (fct!=1.)
1627
+ for (size_t i=0; i<length; ++i)
1628
+ c[i] *= fct;
1629
+ }
1630
+
1631
+ public:
1632
+ template<typename T> void exec(T c[], T0 fct, bool fwd) const
1633
+ { fwd ? pass_all<true>(c, fct) : pass_all<false>(c, fct); }
1634
+
1635
+ private:
1636
+ POCKETFFT_NOINLINE void factorize()
1637
+ {
1638
+ size_t len=length;
1639
+ while ((len&7)==0)
1640
+ { add_factor(8); len>>=3; }
1641
+ while ((len&3)==0)
1642
+ { add_factor(4); len>>=2; }
1643
+ if ((len&1)==0)
1644
+ {
1645
+ len>>=1;
1646
+ // factor 2 should be at the front of the factor list
1647
+ add_factor(2);
1648
+ std::swap(fact[0].fct, fact.back().fct);
1649
+ }
1650
+ for (size_t divisor=3; divisor*divisor<=len; divisor+=2)
1651
+ while ((len%divisor)==0)
1652
+ {
1653
+ add_factor(divisor);
1654
+ len/=divisor;
1655
+ }
1656
+ if (len>1) add_factor(len);
1657
+ }
1658
+
1659
+ size_t twsize() const
1660
+ {
1661
+ size_t twsize=0, l1=1;
1662
+ for (size_t k=0; k<fact.size(); ++k)
1663
+ {
1664
+ size_t ip=fact[k].fct, ido= length/(l1*ip);
1665
+ twsize+=(ip-1)*(ido-1);
1666
+ if (ip>11)
1667
+ twsize+=ip;
1668
+ l1*=ip;
1669
+ }
1670
+ return twsize;
1671
+ }
1672
+
1673
+ void comp_twiddle()
1674
+ {
1675
+ sincos_2pibyn<T0> twiddle(length);
1676
+ size_t l1=1;
1677
+ size_t memofs=0;
1678
+ for (size_t k=0; k<fact.size(); ++k)
1679
+ {
1680
+ size_t ip=fact[k].fct, ido=length/(l1*ip);
1681
+ fact[k].tw=mem.data()+memofs;
1682
+ memofs+=(ip-1)*(ido-1);
1683
+ for (size_t j=1; j<ip; ++j)
1684
+ for (size_t i=1; i<ido; ++i)
1685
+ fact[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
1686
+ if (ip>11)
1687
+ {
1688
+ fact[k].tws=mem.data()+memofs;
1689
+ memofs+=ip;
1690
+ for (size_t j=0; j<ip; ++j)
1691
+ fact[k].tws[j] = twiddle[j*l1*ido];
1692
+ }
1693
+ l1*=ip;
1694
+ }
1695
+ }
1696
+
1697
+ public:
1698
+ POCKETFFT_NOINLINE cfftp(size_t length_)
1699
+ : length(length_)
1700
+ {
1701
+ if (length==0) throw std::runtime_error("zero-length FFT requested");
1702
+ if (length==1) return;
1703
+ factorize();
1704
+ mem.resize(twsize());
1705
+ comp_twiddle();
1706
+ }
1707
+ };
1708
+
1709
+ //
1710
+ // real-valued FFTPACK transforms
1711
+ //
1712
+
1713
+ template<typename T0> class rfftp
1714
+ {
1715
+ private:
1716
+ struct fctdata
1717
+ {
1718
+ size_t fct;
1719
+ T0 *tw, *tws;
1720
+ };
1721
+
1722
+ size_t length;
1723
+ arr<T0> mem;
1724
+ std::vector<fctdata> fact;
1725
+
1726
+ void add_factor(size_t factor)
1727
+ { fact.push_back({factor, nullptr, nullptr}); }
1728
+
1729
+ /* (a+ib) = conj(c+id) * (e+if) */
1730
+ template<typename T1, typename T2, typename T3> inline void MULPM
1731
+ (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) const
1732
+ { a=c*e+d*f; b=c*f-d*e; }
1733
+
1734
// Radix-2 kernel of the real-valued transform (the `radf` prefix presumably
// marks the forward direction, following FFTPACK naming -- confirm).
// Input `cc` is addressed as cc[a+ido*(b+l1*c)], output `ch` as
// ch[a+ido*(b+2*c)], twiddles as wa[i+x*(ido-1)].
+ template<typename T> void radf2 (size_t ido, size_t l1,
1735
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1736
+ const T0 * POCKETFFT_RESTRICT wa) const
1737
+ {
1738
+ auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1739
+ auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1740
+ { return cc[a+ido*(b+l1*c)]; };
1741
+ auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1742
+ { return ch[a+ido*(b+2*c)]; };
1743
+
1744
// element 0 of every k: no twiddle factor involved
+ for (size_t k=0; k<l1; k++)
1745
+ PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1));
1746
// even ido: the last element (ido-1) is handled on its own
+ if ((ido&1)==0)
1747
+ for (size_t k=0; k<l1; k++)
1748
+ {
1749
+ CH( 0,1,k) = -CC(ido-1,k,1);
1750
+ CH(ido-1,0,k) = CC(ido-1,k,0);
1751
+ }
1752
+ if (ido<=2) return;
1753
// remaining elements are processed as pairs (i-1,i) and written to the
// positions i and the mirrored positions ic=ido-i
+ for (size_t k=0; k<l1; k++)
1754
+ for (size_t i=2; i<ido; i+=2)
1755
+ {
1756
+ size_t ic=ido-i;
1757
+ T tr2, ti2;
1758
+ MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1759
+ PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2);
1760
+ PM (CH(i ,0,k),CH(ic ,1,k),ti2,CC(i ,k,0));
1761
+ }
1762
+ }
1763
+
1764
// In-place helper for the radf3/radf5 kernels. With a=(rx,ix) and b=(ry,iy)
// it leaves (rx,ix) = (rx+ry, ix+iy) and (ry,iy) = (ix-iy, ry-rx).
+ // a2=a+b; b2=i*(b-a);
1765
+ #define POCKETFFT_REARRANGE(rx, ix, ry, iy) \
1766
+ {\
1767
+ auto t1=rx+ry, t2=ry-rx, t3=ix+iy, t4=ix-iy; \
1768
+ rx=t1; ix=t3; ry=t4; iy=t2; \
1769
+ }
1770
+
1771
// Radix-3 kernel of the real-valued transform (`radf` presumably = forward
// direction, following FFTPACK naming -- confirm).
// Input `cc` is addressed as cc[a+ido*(b+l1*c)], output `ch` as
// ch[a+ido*(b+3*c)], twiddles as wa[i+x*(ido-1)].
+ template<typename T> void radf3(size_t ido, size_t l1,
1772
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1773
+ const T0 * POCKETFFT_RESTRICT wa) const
1774
+ {
// taur = -1/2 and taui = sqrt(3)/2, i.e. cos and sin of 2*pi/3
1775
+ constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
1776
+
1777
+ auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1778
+ auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1779
+ { return cc[a+ido*(b+l1*c)]; };
1780
+ auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1781
+ { return ch[a+ido*(b+3*c)]; };
1782
+
1783
// element 0 of every k: no twiddle factor involved
+ for (size_t k=0; k<l1; k++)
1784
+ {
1785
+ T cr2=CC(0,k,1)+CC(0,k,2);
1786
+ CH(0,0,k) = CC(0,k,0)+cr2;
1787
+ CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1788
+ CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1789
+ }
1790
+ if (ido==1) return;
1791
// remaining elements are processed as pairs (i-1,i); ic=ido-i is the
// mirrored output position
+ for (size_t k=0; k<l1; k++)
1792
+ for (size_t i=2; i<ido; i+=2)
1793
+ {
1794
+ size_t ic=ido-i;
1795
+ T di2, di3, dr2, dr3;
1796
+ MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); // d2=conj(WA0)*CC1
1797
+ MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); // d3=conj(WA1)*CC2
1798
+ POCKETFFT_REARRANGE(dr2, di2, dr3, di3);
1799
+ CH(i-1,0,k) = CC(i-1,k,0)+dr2; // c add
1800
+ CH(i ,0,k) = CC(i ,k,0)+di2;
1801
+ T tr2 = CC(i-1,k,0)+taur*dr2; // c add
1802
+ T ti2 = CC(i ,k,0)+taur*di2;
1803
+ T tr3 = taui*dr3; // t3 = taui*i*(d3-d2)?
1804
+ T ti3 = taui*di3;
1805
+ PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3); // PM(i) = t2+t3
1806
+ PM(CH(i ,2,k),CH(ic ,1,k),ti3,ti2); // PM(ic) = conj(t2-t3)
1807
+ }
1808
+ }
1809
+
1810
// Radix-4 kernel of the real-valued transform (`radf` presumably = forward
// direction, following FFTPACK naming -- confirm).
// Input `cc` is addressed as cc[a+ido*(b+l1*c)], output `ch` as
// ch[a+ido*(b+4*c)], twiddles as wa[i+x*(ido-1)].
+ template<typename T> void radf4(size_t ido, size_t l1,
1811
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1812
+ const T0 * POCKETFFT_RESTRICT wa) const
1813
+ {
// hsqt2 = sqrt(2)/2
1814
+ constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1815
+
1816
+ auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1817
+ auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1818
+ { return cc[a+ido*(b+l1*c)]; };
1819
+ auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1820
+ { return ch[a+ido*(b+4*c)]; };
1821
+
1822
// element 0 of every k: no twiddle factor involved
+ for (size_t k=0; k<l1; k++)
1823
+ {
1824
+ T tr1,tr2;
1825
+ PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1));
1826
+ PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2));
1827
+ PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1);
1828
+ }
1829
// even ido: the last element (ido-1) is handled on its own
+ if ((ido&1)==0)
1830
+ for (size_t k=0; k<l1; k++)
1831
+ {
1832
+ T ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
1833
+ T tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
1834
+ PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1);
1835
+ PM (CH( 0,3,k),CH( 0,1,k),ti1,CC(ido-1,k,2));
1836
+ }
1837
+ if (ido<=2) return;
1838
// remaining elements are processed as pairs (i-1,i); ic=ido-i is the
// mirrored output position
+ for (size_t k=0; k<l1; k++)
1839
+ for (size_t i=2; i<ido; i+=2)
1840
+ {
1841
+ size_t ic=ido-i;
1842
+ T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1843
+ MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1844
+ MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
1845
+ MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
1846
+ PM(tr1,tr4,cr4,cr2);
1847
+ PM(ti1,ti4,ci2,ci4);
1848
+ PM(tr2,tr3,CC(i-1,k,0),cr3);
1849
+ PM(ti2,ti3,CC(i ,k,0),ci3);
1850
+ PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1);
1851
+ PM(CH(i ,0,k),CH(ic ,3,k),ti1,ti2);
1852
+ PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4);
1853
+ PM(CH(i ,2,k),CH(ic ,1,k),tr4,ti3);
1854
+ }
1855
+ }
1856
+
1857
// Radix-5 kernel of the real-valued transform (`radf` presumably = forward
// direction, following FFTPACK naming -- confirm).
// Input `cc` is addressed as cc[a+ido*(b+l1*c)], output `ch` as
// ch[a+ido*(b+5*c)], twiddles as wa[i+x*(ido-1)].
+ template<typename T> void radf5(size_t ido, size_t l1,
1858
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1859
+ const T0 * POCKETFFT_RESTRICT wa) const
1860
+ {
// tr11/ti11 and tr12/ti12 are cos/sin of 2*pi/5 and 4*pi/5
1861
+ constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
1862
+ ti11= T0(0.9510565162951535721164393333793821L),
1863
+ tr12= T0(-0.8090169943749474241022934171828191L),
1864
+ ti12= T0(0.5877852522924731291687059546390728L);
1865
+
1866
+ auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1867
+ auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1868
+ { return cc[a+ido*(b+l1*c)]; };
1869
+ auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1870
+ { return ch[a+ido*(b+5*c)]; };
1871
+
1872
// element 0 of every k: no twiddle factor involved
+ for (size_t k=0; k<l1; k++)
1873
+ {
1874
+ T cr2, cr3, ci4, ci5;
1875
+ PM (cr2,ci5,CC(0,k,4),CC(0,k,1));
1876
+ PM (cr3,ci4,CC(0,k,3),CC(0,k,2));
1877
+ CH(0,0,k)=CC(0,k,0)+cr2+cr3;
1878
+ CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
1879
+ CH(0,2,k)=ti11*ci5+ti12*ci4;
1880
+ CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
1881
+ CH(0,4,k)=ti12*ci5-ti11*ci4;
1882
+ }
1883
+ if (ido==1) return;
1884
// remaining elements are processed as pairs (i-1,i); ic counts down from
// ido-2 in step with i and is the mirrored output position
+ for (size_t k=0; k<l1;++k)
1885
+ for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
1886
+ {
1887
+ T di2, di3, di4, di5, dr2, dr3, dr4, dr5;
1888
+ MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1889
+ MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
1890
+ MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
1891
+ MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4));
1892
+ POCKETFFT_REARRANGE(dr2, di2, dr5, di5);
1893
+ POCKETFFT_REARRANGE(dr3, di3, dr4, di4);
1894
+ CH(i-1,0,k)=CC(i-1,k,0)+dr2+dr3;
1895
+ CH(i ,0,k)=CC(i ,k,0)+di2+di3;
1896
+ T tr2=CC(i-1,k,0)+tr11*dr2+tr12*dr3;
1897
+ T ti2=CC(i ,k,0)+tr11*di2+tr12*di3;
1898
+ T tr3=CC(i-1,k,0)+tr12*dr2+tr11*dr3;
1899
+ T ti3=CC(i ,k,0)+tr12*di2+tr11*di3;
1900
+ T tr5 = ti11*dr5 + ti12*dr4;
1901
+ T ti5 = ti11*di5 + ti12*di4;
1902
+ T tr4 = ti12*dr5 - ti11*dr4;
1903
+ T ti4 = ti12*di5 - ti11*di4;
1904
+ PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5);
1905
+ PM(CH(i ,2,k),CH(ic ,1,k),ti5,ti2);
1906
+ PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4);
1907
+ PM(CH(i ,4,k),CH(ic ,3,k),ti4,ti3);
1908
+ }
1909
+ }
1910
+
1911
+ #undef POCKETFFT_REARRANGE
1912
+
1913
// Generic radix-`ip` kernel of the real-valued transform (`radf` presumably
// = forward direction, following FFTPACK naming -- confirm).
// Both buffers are written: `cc` is modified in place first, intermediate
// sums are built in `ch`, and the final result is stored back into `cc`.
// `wa` holds the twiddles as interleaved pairs (wa[idij], wa[idij+1]);
// `csarr` is read as interleaved pairs (csarr[2*n], csarr[2*n+1]).
// The trailing numeric comments (e.g. "// 114") are kept from the original;
// they look like statement labels of the Fortran routine this was ported from.
+ template<typename T> void radfg(size_t ido, size_t ip, size_t l1,
1914
+ T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1915
+ const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const
1916
+ {
1917
+ const size_t cdim=ip;
1918
+ size_t ipph=(ip+1)/2;
1919
+ size_t idl1 = ido*l1;
1920
+
// CC/C1/C2 are three index views of `cc`; CH/CH2 are two views of `ch`
1921
+ auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> T&
1922
+ { return cc[a+ido*(b+cdim*c)]; };
1923
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> const T&
1924
+ { return ch[a+ido*(b+l1*c)]; };
1925
+ auto C1 = [cc,ido,l1] (size_t a, size_t b, size_t c) -> T&
1926
+ { return cc[a+ido*(b+l1*c)]; };
1927
+ auto C2 = [cc,idl1] (size_t a, size_t b) -> T&
1928
+ { return cc[a+idl1*b]; };
1929
+ auto CH2 = [ch,idl1] (size_t a, size_t b) -> T&
1930
+ { return ch[a+idl1*b]; };
1931
+
1932
// step 1 (only when ido>1): apply the twiddles to the pairs (i,i+1) of the
// mirrored slots j and jc, and store their PM combinations back into cc
+ if (ido>1)
1933
+ {
1934
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 114
1935
+ {
1936
+ size_t is=(j-1)*(ido-1),
1937
+ is2=(jc-1)*(ido-1);
1938
+ for (size_t k=0; k<l1; ++k) // 113
1939
+ {
1940
+ size_t idij=is;
1941
+ size_t idij2=is2;
1942
+ for (size_t i=1; i<=ido-2; i+=2) // 112
1943
+ {
1944
+ T t1=C1(i,k,j ), t2=C1(i+1,k,j ),
1945
+ t3=C1(i,k,jc), t4=C1(i+1,k,jc);
1946
+ T x1=wa[idij]*t1 + wa[idij+1]*t2,
1947
+ x2=wa[idij]*t2 - wa[idij+1]*t1,
1948
+ x3=wa[idij2]*t3 + wa[idij2+1]*t4,
1949
+ x4=wa[idij2]*t4 - wa[idij2+1]*t3;
1950
+ PM(C1(i,k,j),C1(i+1,k,jc),x3,x1);
1951
+ PM(C1(i+1,k,j),C1(i,k,jc),x2,x4);
1952
+ idij+=2;
1953
+ idij2+=2;
1954
+ }
1955
+ }
1956
+ }
1957
+ }
1958
+
1959
// step 2: element 0 of the mirrored slots is combined in place
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 123
1960
+ for (size_t k=0; k<l1; ++k) // 122
1961
+ MPINPLACE(C1(0,k,jc), C1(0,k,j));
1962
+
1963
+ //everything in C
1964
+ //memset(ch,0,ip*l1*ido*sizeof(double));
1965
+
1966
// step 3: for every output slot l (and its mirror lc) accumulate the
// csarr-weighted sum over the slots of cc into ch; iang tracks (j*l) mod ip
+ for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc) // 127
1967
+ {
1968
+ for (size_t ik=0; ik<idl1; ++ik) // 124
1969
+ {
1970
+ CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
1971
+ CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
1972
+ }
1973
+ size_t iang = 2*l;
1974
+ size_t j=3, jc=ip-3;
// four terms per iteration
+ for (; j<ipph-3; j+=4,jc-=4) // 126
1975
+ {
1976
+ iang+=l; if (iang>=ip) iang-=ip;
1977
+ T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1978
+ iang+=l; if (iang>=ip) iang-=ip;
1979
+ T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1980
+ iang+=l; if (iang>=ip) iang-=ip;
1981
+ T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1982
+ iang+=l; if (iang>=ip) iang-=ip;
1983
+ T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1984
+ for (size_t ik=0; ik<idl1; ++ik) // 125
1985
+ {
1986
+ CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
1987
+ +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
1988
+ CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
1989
+ +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
1990
+ }
1991
+ }
// two terms per iteration
1992
+ for (; j<ipph-1; j+=2,jc-=2) // 126
1993
+ {
1994
+ iang+=l; if (iang>=ip) iang-=ip;
1995
+ T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1996
+ iang+=l; if (iang>=ip) iang-=ip;
1997
+ T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1998
+ for (size_t ik=0; ik<idl1; ++ik) // 125
1999
+ {
2000
+ CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
2001
+ CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
2002
+ }
2003
+ }
// remaining single terms
2004
+ for (; j<ipph; ++j,--jc) // 126
2005
+ {
2006
+ iang+=l; if (iang>=ip) iang-=ip;
2007
+ T0 ar=csarr[2*iang], ai=csarr[2*iang+1];
2008
+ for (size_t ik=0; ik<idl1; ++ik) // 125
2009
+ {
2010
+ CH2(ik,l ) += ar*C2(ik,j );
2011
+ CH2(ik,lc) += ai*C2(ik,jc);
2012
+ }
2013
+ }
2014
+ }
2015
// slot 0 of ch is the plain sum of slots 0..ipph-1 of cc
+ for (size_t ik=0; ik<idl1; ++ik) // 101
2016
+ CH2(ik,0) = C2(ik,0);
2017
+ for (size_t j=1; j<ipph; ++j) // 129
2018
+ for (size_t ik=0; ik<idl1; ++ik) // 128
2019
+ CH2(ik,0) += C2(ik,j);
2020
+
2021
+ // everything in CH at this point!
2022
+ //memset(cc,0,ip*l1*ido*sizeof(double));
2023
+
2024
// step 4: copy/shuffle the result from ch back into cc (CC view)
+ for (size_t k=0; k<l1; ++k) // 131
2025
+ for (size_t i=0; i<ido; ++i) // 130
2026
+ CC(i,0,k) = CH(i,k,0);
2027
+
2028
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 137
2029
+ {
2030
+ size_t j2=2*j-1;
2031
+ for (size_t k=0; k<l1; ++k) // 136
2032
+ {
2033
+ CC(ido-1,j2,k) = CH(0,k,j);
2034
+ CC(0,j2+1,k) = CH(0,k,jc);
2035
+ }
2036
+ }
2037
+
2038
+ if (ido==1) return;
2039
+
2040
// elements i>=1 in pairs (i,i+1); ic starts at ido-3 and runs downwards
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 140
2041
+ {
2042
+ size_t j2=2*j-1;
2043
+ for(size_t k=0; k<l1; ++k) // 139
2044
+ for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 138
2045
+ {
2046
+ CC(i ,j2+1,k) = CH(i ,k,j )+CH(i ,k,jc);
2047
+ CC(ic ,j2 ,k) = CH(i ,k,j )-CH(i ,k,jc);
2048
+ CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
2049
+ CC(ic+1,j2 ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
2050
+ }
2051
+ }
2052
+ }
2054
+
2055
// Radix-2 kernel of the real-valued transform (`radb` presumably = backward
// direction, following FFTPACK naming -- confirm). The index layouts mirror
// radf2: input `cc` is addressed as cc[a+ido*(b+2*c)], output `ch` as
// ch[a+ido*(b+l1*c)], twiddles as wa[i+x*(ido-1)].
+ template<typename T> void radb2(size_t ido, size_t l1,
2056
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2057
+ const T0 * POCKETFFT_RESTRICT wa) const
2058
+ {
2059
+ auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
2060
+ auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
2061
+ { return cc[a+ido*(b+2*c)]; };
2062
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2063
+ { return ch[a+ido*(b+l1*c)]; };
2064
+
2065
// element 0 of every k: no twiddle factor involved
+ for (size_t k=0; k<l1; k++)
2066
+ PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k));
2067
// even ido: the last element (ido-1) is handled on its own
+ if ((ido&1)==0)
2068
+ for (size_t k=0; k<l1; k++)
2069
+ {
2070
+ CH(ido-1,k,0) = 2*CC(ido-1,0,k);
2071
+ CH(ido-1,k,1) =-2*CC(0 ,1,k);
2072
+ }
2073
+ if (ido<=2) return;
2074
// remaining elements: pairs (i-1,i) combined with the mirrored input ic=ido-i
+ for (size_t k=0; k<l1;++k)
2075
+ for (size_t i=2; i<ido; i+=2)
2076
+ {
2077
+ size_t ic=ido-i;
2078
+ T ti2, tr2;
2079
+ PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k));
2080
+ PM (ti2,CH(i ,k,0),CC(i ,0,k),CC(ic ,1,k));
2081
+ MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2);
2082
+ }
2083
+ }
2084
+
2085
// Radix-3 kernel of the real-valued transform (`radb` presumably = backward
// direction, following FFTPACK naming -- confirm).
// Input `cc` is addressed as cc[a+ido*(b+3*c)], output `ch` as
// ch[a+ido*(b+l1*c)], twiddles as wa[i+x*(ido-1)].
+ template<typename T> void radb3(size_t ido, size_t l1,
2086
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2087
+ const T0 * POCKETFFT_RESTRICT wa) const
2088
+ {
// taur = -1/2 and taui = sqrt(3)/2, i.e. cos and sin of 2*pi/3
2089
+ constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
2090
+
2091
+ auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
2092
+ auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
2093
+ { return cc[a+ido*(b+3*c)]; };
2094
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2095
+ { return ch[a+ido*(b+l1*c)]; };
2096
+
2097
// element 0 of every k: no twiddle factor involved
+ for (size_t k=0; k<l1; k++)
2098
+ {
2099
+ T tr2=2*CC(ido-1,1,k);
2100
+ T cr2=CC(0,0,k)+taur*tr2;
2101
+ CH(0,k,0)=CC(0,0,k)+tr2;
2102
+ T ci3=2*taui*CC(0,2,k);
2103
+ PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
2104
+ }
2105
+ if (ido==1) return;
2106
// remaining elements: pairs (i-1,i) combined with the mirrored input ic
+ for (size_t k=0; k<l1; k++)
2107
+ for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2108
+ {
2109
+ T tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
2110
+ T ti2=CC(i ,2,k)-CC(ic ,1,k);
2111
+ T cr2=CC(i-1,0,k)+taur*tr2; // c2=CC +taur*t2
2112
+ T ci2=CC(i ,0,k)+taur*ti2;
2113
+ CH(i-1,k,0)=CC(i-1,0,k)+tr2; // CH=CC+t2
2114
+ CH(i ,k,0)=CC(i ,0,k)+ti2;
2115
+ T cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
2116
+ T ci3=taui*(CC(i ,2,k)+CC(ic ,1,k));
2117
+ T di2, di3, dr2, dr3;
2118
+ PM(dr3,dr2,cr2,ci3); // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
2119
+ PM(di2,di3,ci2,cr3); // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
2120
+ MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2); // ch = WA*d2
2121
+ MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
2122
+ }
2123
+ }
2124
+
2125
// Radix-4 kernel of the real-valued transform (`radb` presumably = backward
// direction, following FFTPACK naming -- confirm).
// Input `cc` is addressed as cc[a+ido*(b+4*c)], output `ch` as
// ch[a+ido*(b+l1*c)], twiddles as wa[i+x*(ido-1)].
+ template<typename T> void radb4(size_t ido, size_t l1,
2126
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2127
+ const T0 * POCKETFFT_RESTRICT wa) const
2128
+ {
2129
+ constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2130
+
2131
+ auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
2132
+ auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
2133
+ { return cc[a+ido*(b+4*c)]; };
2134
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2135
+ { return ch[a+ido*(b+l1*c)]; };
2136
+
2137
// element 0 of every k: no twiddle factor involved
+ for (size_t k=0; k<l1; k++)
2138
+ {
2139
+ T tr1, tr2;
2140
+ PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k));
2141
+ T tr3=2*CC(ido-1,1,k);
2142
+ T tr4=2*CC(0,2,k);
2143
+ PM (CH(0,k,0),CH(0,k,2),tr2,tr3);
2144
+ PM (CH(0,k,3),CH(0,k,1),tr1,tr4);
2145
+ }
2146
// even ido: the last element (ido-1) is handled on its own
+ if ((ido&1)==0)
2147
+ for (size_t k=0; k<l1; k++)
2148
+ {
2149
+ T tr1,tr2,ti1,ti2;
2150
+ PM (ti1,ti2,CC(0 ,3,k),CC(0 ,1,k));
2151
+ PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k));
2152
+ CH(ido-1,k,0)=tr2+tr2;
2153
+ CH(ido-1,k,1)=sqrt2*(tr1-ti1);
2154
+ CH(ido-1,k,2)=ti2+ti2;
2155
+ CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
2156
+ }
2157
+ if (ido<=2) return;
2158
// remaining elements: pairs (i-1,i) combined with the mirrored input ic=ido-i
+ for (size_t k=0; k<l1;++k)
2159
+ for (size_t i=2; i<ido; i+=2)
2160
+ {
2161
+ T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
2162
+ size_t ic=ido-i;
2163
+ PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k));
2164
+ PM (ti1,ti2,CC(i ,0,k),CC(ic ,3,k));
2165
+ PM (tr4,ti3,CC(i ,2,k),CC(ic ,1,k));
2166
+ PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k));
2167
+ PM (CH(i-1,k,0),cr3,tr2,tr3);
2168
+ PM (CH(i ,k,0),ci3,ti2,ti3);
2169
+ PM (cr4,cr2,tr1,tr4);
2170
+ PM (ci2,ci4,ti1,ti4);
2171
+ MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2);
2172
+ MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3);
2173
+ MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4);
2174
+ }
2175
+ }
2176
+
2177
// Radix-5 kernel of the real-valued transform (`radb` presumably = backward
// direction, following FFTPACK naming -- confirm).
// Input `cc` is addressed as cc[a+ido*(b+5*c)], output `ch` as
// ch[a+ido*(b+l1*c)], twiddles as wa[i+x*(ido-1)].
+ template<typename T> void radb5(size_t ido, size_t l1,
2178
+ const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2179
+ const T0 * POCKETFFT_RESTRICT wa) const
2180
+ {
// tr11/ti11 and tr12/ti12 are cos/sin of 2*pi/5 and 4*pi/5
2181
+ constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
2182
+ ti11= T0(0.9510565162951535721164393333793821L),
2183
+ tr12= T0(-0.8090169943749474241022934171828191L),
2184
+ ti12= T0(0.5877852522924731291687059546390728L);
2185
+
2186
+ auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
2187
+ auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
2188
+ { return cc[a+ido*(b+5*c)]; };
2189
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2190
+ { return ch[a+ido*(b+l1*c)]; };
2191
+
2192
// element 0 of every k: no twiddle factor involved
+ for (size_t k=0; k<l1; k++)
2193
+ {
2194
+ T ti5=CC(0,2,k)+CC(0,2,k);
2195
+ T ti4=CC(0,4,k)+CC(0,4,k);
2196
+ T tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
2197
+ T tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
2198
+ CH(0,k,0)=CC(0,0,k)+tr2+tr3;
2199
+ T cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
2200
+ T cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
2201
+ T ci4, ci5;
2202
+ MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2203
+ PM(CH(0,k,4),CH(0,k,1),cr2,ci5);
2204
+ PM(CH(0,k,3),CH(0,k,2),cr3,ci4);
2205
+ }
2206
+ if (ido==1) return;
2207
// remaining elements: pairs (i-1,i) combined with the mirrored input ic
+ for (size_t k=0; k<l1;++k)
2208
+ for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2209
+ {
2210
+ T tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
2211
+ PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k));
2212
+ PM(ti5,ti2,CC(i ,2,k),CC(ic ,1,k));
2213
+ PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k));
2214
+ PM(ti4,ti3,CC(i ,4,k),CC(ic ,3,k));
2215
+ CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
2216
+ CH(i ,k,0)=CC(i ,0,k)+ti2+ti3;
2217
+ T cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
2218
+ T ci2=CC(i ,0,k)+tr11*ti2+tr12*ti3;
2219
+ T cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
2220
+ T ci3=CC(i ,0,k)+tr12*ti2+tr11*ti3;
2221
+ T ci4, ci5, cr5, cr4;
2222
+ MULPM(cr5,cr4,tr5,tr4,ti11,ti12);
2223
+ MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2224
+ T dr2, dr3, dr4, dr5, di2, di3, di4, di5;
2225
+ PM(dr4,dr3,cr3,ci4);
2226
+ PM(di3,di4,ci3,cr4);
2227
+ PM(dr5,dr2,cr2,ci5);
2228
+ PM(di2,di5,ci2,cr5);
2229
+ MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2);
2230
+ MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
2231
+ MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4);
2232
+ MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5);
2233
+ }
2234
+ }
2235
+
2236
+ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
2237
+ T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2238
+ const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const
2239
+ {
2240
+ const size_t cdim=ip;
2241
+ size_t ipph=(ip+1)/ 2;
2242
+ size_t idl1 = ido*l1;
2243
+
2244
+ auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
2245
+ { return cc[a+ido*(b+cdim*c)]; };
2246
+ auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2247
+ { return ch[a+ido*(b+l1*c)]; };
2248
+ auto C1 = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
2249
+ { return cc[a+ido*(b+l1*c)]; };
2250
+ auto C2 = [cc,idl1](size_t a, size_t b) -> T&
2251
+ { return cc[a+idl1*b]; };
2252
+ auto CH2 = [ch,idl1](size_t a, size_t b) -> T&
2253
+ { return ch[a+idl1*b]; };
2254
+
2255
+ for (size_t k=0; k<l1; ++k) // 102
2256
+ for (size_t i=0; i<ido; ++i) // 101
2257
+ CH(i,k,0) = CC(i,0,k);
2258
+ for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 108
2259
+ {
2260
+ size_t j2=2*j-1;
2261
+ for (size_t k=0; k<l1; ++k)
2262
+ {
2263
+ CH(0,k,j ) = 2*CC(ido-1,j2,k);
2264
+ CH(0,k,jc) = 2*CC(0,j2+1,k);
2265
+ }
2266
+ }
2267
+
2268
+ if (ido!=1)
2269
+ {
2270
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 111
2271
+ {
2272
+ size_t j2=2*j-1;
2273
+ for (size_t k=0; k<l1; ++k)
2274
+ for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 109
2275
+ {
2276
+ CH(i ,k,j ) = CC(i ,j2+1,k)+CC(ic ,j2,k);
2277
+ CH(i ,k,jc) = CC(i ,j2+1,k)-CC(ic ,j2,k);
2278
+ CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
2279
+ CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
2280
+ }
2281
+ }
2282
+ }
2283
+ for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
2284
+ {
2285
+ for (size_t ik=0; ik<idl1; ++ik)
2286
+ {
2287
+ C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
2288
+ C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
2289
+ }
2290
+ size_t iang=2*l;
2291
+ size_t j=3,jc=ip-3;
2292
+ for(; j<ipph-3; j+=4,jc-=4)
2293
+ {
2294
+ iang+=l; if(iang>ip) iang-=ip;
2295
+ T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2296
+ iang+=l; if(iang>ip) iang-=ip;
2297
+ T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2298
+ iang+=l; if(iang>ip) iang-=ip;
2299
+ T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
2300
+ iang+=l; if(iang>ip) iang-=ip;
2301
+ T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
2302
+ for (size_t ik=0; ik<idl1; ++ik)
2303
+ {
2304
+ C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
2305
+ +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
2306
+ C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
2307
+ +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
2308
+ }
2309
+ }
2310
+ for(; j<ipph-1; j+=2,jc-=2)
2311
+ {
2312
+ iang+=l; if(iang>ip) iang-=ip;
2313
+ T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2314
+ iang+=l; if(iang>ip) iang-=ip;
2315
+ T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2316
+ for (size_t ik=0; ik<idl1; ++ik)
2317
+ {
2318
+ C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
2319
+ C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
2320
+ }
2321
+ }
2322
+ for(; j<ipph; ++j,--jc)
2323
+ {
2324
+ iang+=l; if(iang>ip) iang-=ip;
2325
+ T0 war=csarr[2*iang], wai=csarr[2*iang+1];
2326
+ for (size_t ik=0; ik<idl1; ++ik)
2327
+ {
2328
+ C2(ik,l ) += war*CH2(ik,j );
2329
+ C2(ik,lc) += wai*CH2(ik,jc);
2330
+ }
2331
+ }
2332
+ }
2333
+ for (size_t j=1; j<ipph; ++j)
2334
+ for (size_t ik=0; ik<idl1; ++ik)
2335
+ CH2(ik,0) += CH2(ik,j);
2336
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 124
2337
+ for (size_t k=0; k<l1; ++k)
2338
+ PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc));
2339
+
2340
+ if (ido==1) return;
2341
+
2342
+ for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 127
2343
+ for (size_t k=0; k<l1; ++k)
2344
+ for (size_t i=1; i<=ido-2; i+=2)
2345
+ {
2346
+ CH(i ,k,j ) = C1(i ,k,j)-C1(i+1,k,jc);
2347
+ CH(i ,k,jc) = C1(i ,k,j)+C1(i+1,k,jc);
2348
+ CH(i+1,k,j ) = C1(i+1,k,j)+C1(i ,k,jc);
2349
+ CH(i+1,k,jc) = C1(i+1,k,j)-C1(i ,k,jc);
2350
+ }
2351
+
2352
+ // All in CH
2353
+
2354
+ for (size_t j=1; j<ip; ++j)
2355
+ {
2356
+ size_t is = (j-1)*(ido-1);
2357
+ for (size_t k=0; k<l1; ++k)
2358
+ {
2359
+ size_t idij = is;
2360
+ for (size_t i=1; i<=ido-2; i+=2)
2361
+ {
2362
+ T t1=CH(i,k,j), t2=CH(i+1,k,j);
2363
+ CH(i ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
2364
+ CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
2365
+ idij+=2;
2366
+ }
2367
+ }
2368
+ }
2369
+ }
2370
+
2371
+ template<typename T> void copy_and_norm(T *c, T *p1, T0 fct) const
2372
+ {
2373
+ if (p1!=c)
2374
+ {
2375
+ if (fct!=1.)
2376
+ for (size_t i=0; i<length; ++i)
2377
+ c[i] = fct*p1[i];
2378
+ else
2379
+ std::copy_n (p1, length, c);
2380
+ }
2381
+ else
2382
+ if (fct!=1.)
2383
+ for (size_t i=0; i<length; ++i)
2384
+ c[i] *= fct;
2385
+ }
2386
+
2387
+ public:
2388
+ template<typename T> void exec(T c[], T0 fct, bool r2hc) const
2389
+ {
2390
+ if (length==1) { c[0]*=fct; return; }
2391
+ size_t nf=fact.size();
2392
+ arr<T> ch(length);
2393
+ T *p1=c, *p2=ch.data();
2394
+
2395
+ if (r2hc)
2396
+ for(size_t k1=0, l1=length; k1<nf;++k1)
2397
+ {
2398
+ size_t k=nf-k1-1;
2399
+ size_t ip=fact[k].fct;
2400
+ size_t ido=length / l1;
2401
+ l1 /= ip;
2402
+ if(ip==4)
2403
+ radf4(ido, l1, p1, p2, fact[k].tw);
2404
+ else if(ip==2)
2405
+ radf2(ido, l1, p1, p2, fact[k].tw);
2406
+ else if(ip==3)
2407
+ radf3(ido, l1, p1, p2, fact[k].tw);
2408
+ else if(ip==5)
2409
+ radf5(ido, l1, p1, p2, fact[k].tw);
2410
+ else
2411
+ { radfg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws); std::swap (p1,p2); }
2412
+ std::swap (p1,p2);
2413
+ }
2414
+ else
2415
+ for(size_t k=0, l1=1; k<nf; k++)
2416
+ {
2417
+ size_t ip = fact[k].fct,
2418
+ ido= length/(ip*l1);
2419
+ if(ip==4)
2420
+ radb4(ido, l1, p1, p2, fact[k].tw);
2421
+ else if(ip==2)
2422
+ radb2(ido, l1, p1, p2, fact[k].tw);
2423
+ else if(ip==3)
2424
+ radb3(ido, l1, p1, p2, fact[k].tw);
2425
+ else if(ip==5)
2426
+ radb5(ido, l1, p1, p2, fact[k].tw);
2427
+ else
2428
+ radbg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws);
2429
+ std::swap (p1,p2);
2430
+ l1*=ip;
2431
+ }
2432
+
2433
+ copy_and_norm(c,p1,fct);
2434
+ }
2435
+
2436
+ private:
2437
+ void factorize()
2438
+ {
2439
+ size_t len=length;
2440
+ while ((len%4)==0)
2441
+ { add_factor(4); len>>=2; }
2442
+ if ((len%2)==0)
2443
+ {
2444
+ len>>=1;
2445
+ // factor 2 should be at the front of the factor list
2446
+ add_factor(2);
2447
+ std::swap(fact[0].fct, fact.back().fct);
2448
+ }
2449
+ for (size_t divisor=3; divisor*divisor<=len; divisor+=2)
2450
+ while ((len%divisor)==0)
2451
+ {
2452
+ add_factor(divisor);
2453
+ len/=divisor;
2454
+ }
2455
+ if (len>1) add_factor(len);
2456
+ }
2457
+
2458
+ size_t twsize() const
2459
+ {
2460
+ size_t twsz=0, l1=1;
2461
+ for (size_t k=0; k<fact.size(); ++k)
2462
+ {
2463
+ size_t ip=fact[k].fct, ido=length/(l1*ip);
2464
+ twsz+=(ip-1)*(ido-1);
2465
+ if (ip>5) twsz+=2*ip;
2466
+ l1*=ip;
2467
+ }
2468
+ return twsz;
2469
+ }
2470
+
2471
+ void comp_twiddle()
2472
+ {
2473
+ sincos_2pibyn<T0> twid(length);
2474
+ size_t l1=1;
2475
+ T0 *ptr=mem.data();
2476
+ for (size_t k=0; k<fact.size(); ++k)
2477
+ {
2478
+ size_t ip=fact[k].fct, ido=length/(l1*ip);
2479
+ if (k<fact.size()-1) // last factor doesn't need twiddles
2480
+ {
2481
+ fact[k].tw=ptr; ptr+=(ip-1)*(ido-1);
2482
+ for (size_t j=1; j<ip; ++j)
2483
+ for (size_t i=1; i<=(ido-1)/2; ++i)
2484
+ {
2485
+ fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r;
2486
+ fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i;
2487
+ }
2488
+ }
2489
+ if (ip>5) // special factors required by *g functions
2490
+ {
2491
+ fact[k].tws=ptr; ptr+=2*ip;
2492
+ fact[k].tws[0] = 1.;
2493
+ fact[k].tws[1] = 0.;
2494
+ for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2)
2495
+ {
2496
+ fact[k].tws[i ] = twid[i/2*(length/ip)].r;
2497
+ fact[k].tws[i+1] = twid[i/2*(length/ip)].i;
2498
+ fact[k].tws[ic] = twid[i/2*(length/ip)].r;
2499
+ fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i;
2500
+ }
2501
+ }
2502
+ l1*=ip;
2503
+ }
2504
+ }
2505
+
2506
+ public:
2507
+ POCKETFFT_NOINLINE rfftp(size_t length_)
2508
+ : length(length_)
2509
+ {
2510
+ if (length==0) throw std::runtime_error("zero-length FFT requested");
2511
+ if (length==1) return;
2512
+ factorize();
2513
+ mem.resize(twsize());
2514
+ comp_twiddle();
2515
+ }
2516
+ };
2517
+
2518
+ //
2519
+ // complex Bluestein transforms
2520
+ //
2521
+
2522
+ template<typename T0> class fftblue
2523
+ {
2524
+ private:
2525
+ size_t n, n2;
2526
+ cfftp<T0> plan;
2527
+ arr<cmplx<T0>> mem;
2528
+ cmplx<T0> *bk, *bkf;
2529
+
2530
+ template<bool fwd, typename T> void fft(cmplx<T> c[], T0 fct) const
2531
+ {
2532
+ arr<cmplx<T>> akf(n2);
2533
+
2534
+ /* initialize a_k and FFT it */
2535
+ for (size_t m=0; m<n; ++m)
2536
+ special_mul<fwd>(c[m],bk[m],akf[m]);
2537
+ auto zero = akf[0]*T0(0);
2538
+ for (size_t m=n; m<n2; ++m)
2539
+ akf[m]=zero;
2540
+
2541
+ plan.exec (akf.data(),1.,true);
2542
+
2543
+ /* do the convolution */
2544
+ akf[0] = akf[0].template special_mul<!fwd>(bkf[0]);
2545
+ for (size_t m=1; m<(n2+1)/2; ++m)
2546
+ {
2547
+ akf[m] = akf[m].template special_mul<!fwd>(bkf[m]);
2548
+ akf[n2-m] = akf[n2-m].template special_mul<!fwd>(bkf[m]);
2549
+ }
2550
+ if ((n2&1)==0)
2551
+ akf[n2/2] = akf[n2/2].template special_mul<!fwd>(bkf[n2/2]);
2552
+
2553
+ /* inverse FFT */
2554
+ plan.exec (akf.data(),1.,false);
2555
+
2556
+ /* multiply by b_k */
2557
+ for (size_t m=0; m<n; ++m)
2558
+ c[m] = akf[m].template special_mul<fwd>(bk[m])*fct;
2559
+ }
2560
+
2561
+ public:
2562
+ POCKETFFT_NOINLINE fftblue(size_t length)
2563
+ : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2/2+1),
2564
+ bk(mem.data()), bkf(mem.data()+n)
2565
+ {
2566
+ /* initialize b_k */
2567
+ sincos_2pibyn<T0> tmp(2*n);
2568
+ bk[0].Set(1, 0);
2569
+
2570
+ size_t coeff=0;
2571
+ for (size_t m=1; m<n; ++m)
2572
+ {
2573
+ coeff+=2*m-1;
2574
+ if (coeff>=2*n) coeff-=2*n;
2575
+ bk[m] = tmp[coeff];
2576
+ }
2577
+
2578
+ /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
2579
+ arr<cmplx<T0>> tbkf(n2);
2580
+ T0 xn2 = T0(1)/T0(n2);
2581
+ tbkf[0] = bk[0]*xn2;
2582
+ for (size_t m=1; m<n; ++m)
2583
+ tbkf[m] = tbkf[n2-m] = bk[m]*xn2;
2584
+ for (size_t m=n;m<=(n2-n);++m)
2585
+ tbkf[m].Set(0.,0.);
2586
+ plan.exec(tbkf.data(),1.,true);
2587
+ for (size_t i=0; i<n2/2+1; ++i)
2588
+ bkf[i] = tbkf[i];
2589
+ }
2590
+
2591
+ template<typename T> void exec(cmplx<T> c[], T0 fct, bool fwd) const
2592
+ { fwd ? fft<true>(c,fct) : fft<false>(c,fct); }
2593
+
2594
+ template<typename T> void exec_r(T c[], T0 fct, bool fwd)
2595
+ {
2596
+ arr<cmplx<T>> tmp(n);
2597
+ if (fwd)
2598
+ {
2599
+ auto zero = T0(0)*c[0];
2600
+ for (size_t m=0; m<n; ++m)
2601
+ tmp[m].Set(c[m], zero);
2602
+ fft<true>(tmp.data(),fct);
2603
+ c[0] = tmp[0].r;
2604
+ std::copy_n (&tmp[1].r, n-1, &c[1]);
2605
+ }
2606
+ else
2607
+ {
2608
+ tmp[0].Set(c[0],c[0]*0);
2609
+ std::copy_n (c+1, n-1, &tmp[1].r);
2610
+ if ((n&1)==0) tmp[n/2].i=T0(0)*c[0];
2611
+ for (size_t m=1; 2*m<n; ++m)
2612
+ tmp[n-m].Set(tmp[m].r, -tmp[m].i);
2613
+ fft<false>(tmp.data(),fct);
2614
+ for (size_t m=0; m<n; ++m)
2615
+ c[m] = tmp[m].r;
2616
+ }
2617
+ }
2618
+ };
2619
+
2620
+ //
2621
+ // flexible (FFTPACK/Bluestein) complex 1D transform
2622
+ //
2623
+
2624
+ template<typename T0> class pocketfft_c
2625
+ {
2626
+ private:
2627
+ std::unique_ptr<cfftp<T0>> packplan;
2628
+ std::unique_ptr<fftblue<T0>> blueplan;
2629
+ size_t len;
2630
+
2631
+ public:
2632
+ POCKETFFT_NOINLINE pocketfft_c(size_t length)
2633
+ : len(length)
2634
+ {
2635
+ if (length==0) throw std::runtime_error("zero-length FFT requested");
2636
+ size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length);
2637
+ if (tmp*tmp <= length)
2638
+ {
2639
+ packplan=std::unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
2640
+ return;
2641
+ }
2642
+ double comp1 = util::cost_guess(length);
2643
+ double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
2644
+ comp2*=1.5; /* fudge factor that appears to give good overall performance */
2645
+ if (comp2<comp1) // use Bluestein
2646
+ blueplan=std::unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
2647
+ else
2648
+ packplan=std::unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
2649
+ }
2650
+
2651
+ template<typename T> POCKETFFT_NOINLINE void exec(cmplx<T> c[], T0 fct, bool fwd) const
2652
+ { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec(c,fct,fwd); }
2653
+
2654
+ size_t length() const { return len; }
2655
+ };
2656
+
2657
+ //
2658
+ // flexible (FFTPACK/Bluestein) real-valued 1D transform
2659
+ //
2660
+
2661
+ template<typename T0> class pocketfft_r
2662
+ {
2663
+ private:
2664
+ std::unique_ptr<rfftp<T0>> packplan;
2665
+ std::unique_ptr<fftblue<T0>> blueplan;
2666
+ size_t len;
2667
+
2668
+ public:
2669
+ POCKETFFT_NOINLINE pocketfft_r(size_t length)
2670
+ : len(length)
2671
+ {
2672
+ if (length==0) throw std::runtime_error("zero-length FFT requested");
2673
+ size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length);
2674
+ if (tmp*tmp <= length)
2675
+ {
2676
+ packplan=std::unique_ptr<rfftp<T0>>(new rfftp<T0>(length));
2677
+ return;
2678
+ }
2679
+ double comp1 = 0.5*util::cost_guess(length);
2680
+ double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
2681
+ comp2*=1.5; /* fudge factor that appears to give good overall performance */
2682
+ if (comp2<comp1) // use Bluestein
2683
+ blueplan=std::unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
2684
+ else
2685
+ packplan=std::unique_ptr<rfftp<T0>>(new rfftp<T0>(length));
2686
+ }
2687
+
2688
+ template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool fwd) const
2689
+ { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec_r(c,fct,fwd); }
2690
+
2691
+ size_t length() const { return len; }
2692
+ };
2693
+
2694
+
2695
+ //
2696
+ // sine/cosine transforms
2697
+ //
2698
+
2699
+ template<typename T0> class T_dct1
2700
+ {
2701
+ private:
2702
+ pocketfft_r<T0> fftplan;
2703
+
2704
+ public:
2705
+ POCKETFFT_NOINLINE T_dct1(size_t length)
2706
+ : fftplan(2*(length-1)) {}
2707
+
2708
+ template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
2709
+ int /*type*/, bool /*cosine*/) const
2710
+ {
2711
+ constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2712
+ size_t N=fftplan.length(), n=N/2+1;
2713
+ if (ortho)
2714
+ { c[0]*=sqrt2; c[n-1]*=sqrt2; }
2715
+ arr<T> tmp(N);
2716
+ tmp[0] = c[0];
2717
+ for (size_t i=1; i<n; ++i)
2718
+ tmp[i] = tmp[N-i] = c[i];
2719
+ fftplan.exec(tmp.data(), fct, true);
2720
+ c[0] = tmp[0];
2721
+ for (size_t i=1; i<n; ++i)
2722
+ c[i] = tmp[2*i-1];
2723
+ if (ortho)
2724
+ { c[0]*=sqrt2*T0(0.5); c[n-1]*=sqrt2*T0(0.5); }
2725
+ }
2726
+
2727
+ size_t length() const { return fftplan.length()/2+1; }
2728
+ };
2729
+
2730
+ template<typename T0> class T_dst1
2731
+ {
2732
+ private:
2733
+ pocketfft_r<T0> fftplan;
2734
+
2735
+ public:
2736
+ POCKETFFT_NOINLINE T_dst1(size_t length)
2737
+ : fftplan(2*(length+1)) {}
2738
+
2739
+ template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
2740
+ bool /*ortho*/, int /*type*/, bool /*cosine*/) const
2741
+ {
2742
+ size_t N=fftplan.length(), n=N/2-1;
2743
+ arr<T> tmp(N);
2744
+ tmp[0] = tmp[n+1] = c[0]*0;
2745
+ for (size_t i=0; i<n; ++i)
2746
+ { tmp[i+1]=c[i]; tmp[N-1-i]=-c[i]; }
2747
+ fftplan.exec(tmp.data(), fct, true);
2748
+ for (size_t i=0; i<n; ++i)
2749
+ c[i] = -tmp[2*i+2];
2750
+ }
2751
+
2752
+ size_t length() const { return fftplan.length()/2-1; }
2753
+ };
2754
+
2755
+ template<typename T0> class T_dcst23
2756
+ {
2757
+ private:
2758
+ pocketfft_r<T0> fftplan;
2759
+ std::vector<T0> twiddle;
2760
+
2761
+ public:
2762
+ POCKETFFT_NOINLINE T_dcst23(size_t length)
2763
+ : fftplan(length), twiddle(length)
2764
+ {
2765
+ sincos_2pibyn<T0> tw(4*length);
2766
+ for (size_t i=0; i<length; ++i)
2767
+ twiddle[i] = tw[i+1].r;
2768
+ }
2769
+
2770
+ template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
2771
+ int type, bool cosine) const
2772
+ {
2773
+ constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2774
+ size_t N=length();
2775
+ size_t NS2 = (N+1)/2;
2776
+ if (type==2)
2777
+ {
2778
+ if (!cosine)
2779
+ for (size_t k=1; k<N; k+=2)
2780
+ c[k] = -c[k];
2781
+ c[0] *= 2;
2782
+ if ((N&1)==0) c[N-1]*=2;
2783
+ for (size_t k=1; k<N-1; k+=2)
2784
+ MPINPLACE(c[k+1], c[k]);
2785
+ fftplan.exec(c, fct, false);
2786
+ for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
2787
+ {
2788
+ T t1 = twiddle[k-1]*c[kc]+twiddle[kc-1]*c[k];
2789
+ T t2 = twiddle[k-1]*c[k]-twiddle[kc-1]*c[kc];
2790
+ c[k] = T0(0.5)*(t1+t2); c[kc]=T0(0.5)*(t1-t2);
2791
+ }
2792
+ if ((N&1)==0)
2793
+ c[NS2] *= twiddle[NS2-1];
2794
+ if (!cosine)
2795
+ for (size_t k=0, kc=N-1; k<kc; ++k, --kc)
2796
+ std::swap(c[k], c[kc]);
2797
+ if (ortho)
2798
+ cosine ? c[0]*=sqrt2*T0(0.5) : c[N-1]*=sqrt2*T0(0.5);
2799
+ }
2800
+ else
2801
+ {
2802
+ if (ortho)
2803
+ cosine ? c[0]*=sqrt2 : c[N-1]*=sqrt2;
2804
+ if (!cosine)
2805
+ for (size_t k=0, kc=N-1; k<NS2; ++k, --kc)
2806
+ std::swap(c[k], c[kc]);
2807
+ for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
2808
+ {
2809
+ T t1=c[k]+c[kc], t2=c[k]-c[kc];
2810
+ c[k] = twiddle[k-1]*t2+twiddle[kc-1]*t1;
2811
+ c[kc]= twiddle[k-1]*t1-twiddle[kc-1]*t2;
2812
+ }
2813
+ if ((N&1)==0)
2814
+ c[NS2] *= 2*twiddle[NS2-1];
2815
+ fftplan.exec(c, fct, true);
2816
+ for (size_t k=1; k<N-1; k+=2)
2817
+ MPINPLACE(c[k], c[k+1]);
2818
+ if (!cosine)
2819
+ for (size_t k=1; k<N; k+=2)
2820
+ c[k] = -c[k];
2821
+ }
2822
+ }
2823
+
2824
+ size_t length() const { return fftplan.length(); }
2825
+ };
2826
+
2827
+ template<typename T0> class T_dcst4
2828
+ {
2829
+ private:
2830
+ size_t N;
2831
+ std::unique_ptr<pocketfft_c<T0>> fft;
2832
+ std::unique_ptr<pocketfft_r<T0>> rfft;
2833
+ arr<cmplx<T0>> C2;
2834
+
2835
+ public:
2836
+ POCKETFFT_NOINLINE T_dcst4(size_t length)
2837
+ : N(length),
2838
+ fft((N&1) ? nullptr : new pocketfft_c<T0>(N/2)),
2839
+ rfft((N&1)? new pocketfft_r<T0>(N) : nullptr),
2840
+ C2((N&1) ? 0 : N/2)
2841
+ {
2842
+ if ((N&1)==0)
2843
+ {
2844
+ sincos_2pibyn<T0> tw(16*N);
2845
+ for (size_t i=0; i<N/2; ++i)
2846
+ C2[i] = conj(tw[8*i+1]);
2847
+ }
2848
+ }
2849
+
2850
+ template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
2851
+ bool /*ortho*/, int /*type*/, bool cosine) const
2852
+ {
2853
+ size_t n2 = N/2;
2854
+ if (!cosine)
2855
+ for (size_t k=0, kc=N-1; k<n2; ++k, --kc)
2856
+ std::swap(c[k], c[kc]);
2857
+ if (N&1)
2858
+ {
2859
+ // The following code is derived from the FFTW3 function apply_re11()
2860
+ // and is released under the 3-clause BSD license with friendly
2861
+ // permission of Matteo Frigo and Steven G. Johnson.
2862
+
2863
+ arr<T> y(N);
2864
+ {
2865
+ size_t i=0, m=n2;
2866
+ for (; m<N; ++i, m+=4)
2867
+ y[i] = c[m];
2868
+ for (; m<2*N; ++i, m+=4)
2869
+ y[i] = -c[2*N-m-1];
2870
+ for (; m<3*N; ++i, m+=4)
2871
+ y[i] = -c[m-2*N];
2872
+ for (; m<4*N; ++i, m+=4)
2873
+ y[i] = c[4*N-m-1];
2874
+ for (; i<N; ++i, m+=4)
2875
+ y[i] = c[m-4*N];
2876
+ }
2877
+ rfft->exec(y.data(), fct, true);
2878
+ {
2879
+ auto SGN = [](size_t i)
2880
+ {
2881
+ constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2882
+ return (i&2) ? -sqrt2 : sqrt2;
2883
+ };
2884
+ c[n2] = y[0]*SGN(n2+1);
2885
+ size_t i=0, i1=1, k=1;
2886
+ for (; k<n2; ++i, ++i1, k+=2)
2887
+ {
2888
+ c[i ] = y[2*k-1]*SGN(i1) + y[2*k ]*SGN(i);
2889
+ c[N -i1] = y[2*k-1]*SGN(N -i) - y[2*k ]*SGN(N -i1);
2890
+ c[n2-i1] = y[2*k+1]*SGN(n2-i) - y[2*k+2]*SGN(n2-i1);
2891
+ c[n2+i1] = y[2*k+1]*SGN(n2+i+2) + y[2*k+2]*SGN(n2+i1);
2892
+ }
2893
+ if (k == n2)
2894
+ {
2895
+ c[i ] = y[2*k-1]*SGN(i+1) + y[2*k]*SGN(i);
2896
+ c[N-i1] = y[2*k-1]*SGN(i+2) + y[2*k]*SGN(i1);
2897
+ }
2898
+ }
2899
+
2900
+ // FFTW-derived code ends here
2901
+ }
2902
+ else
2903
+ {
2904
+ // even length algorithm from
2905
+ // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/
2906
+ arr<cmplx<T>> y(n2);
2907
+ for(size_t i=0; i<n2; ++i)
2908
+ {
2909
+ y[i].Set(c[2*i],c[N-1-2*i]);
2910
+ y[i] *= C2[i];
2911
+ }
2912
+ fft->exec(y.data(), fct, true);
2913
+ for(size_t i=0, ic=n2-1; i<n2; ++i, --ic)
2914
+ {
2915
+ c[2*i ] = 2*(y[i ].r*C2[i ].r-y[i ].i*C2[i ].i);
2916
+ c[2*i+1] = -2*(y[ic].i*C2[ic].r+y[ic].r*C2[ic].i);
2917
+ }
2918
+ }
2919
+ if (!cosine)
2920
+ for (size_t k=1; k<N; k+=2)
2921
+ c[k] = -c[k];
2922
+ }
2923
+
2924
+ size_t length() const { return N; }
2925
+ };
2926
+
2927
+
2928
+ //
2929
+ // multi-D infrastructure
2930
+ //
2931
+
2932
// Returns a shared plan of type T for the given length.
//
// With POCKETFFT_CACHE_SIZE==0 a fresh plan is created on every call.
// Otherwise up to POCKETFFT_CACHE_SIZE plans per plan type are kept in a
// mutex-protected cache; when a new plan has to be stored, the slot with the
// smallest access stamp is overwritten.
//
// T must be constructible from a size_t and provide length().
template<typename T> std::shared_ptr<T> get_plan(size_t length)
  {
#if POCKETFFT_CACHE_SIZE==0
  return std::make_shared<T>(length);
#else
  constexpr size_t nmax=POCKETFFT_CACHE_SIZE;
  static std::array<std::shared_ptr<T>, nmax> cache;
  static std::array<size_t, nmax> last_access{{0}};
  static size_t access_counter = 0;
  static std::mutex mut;

  // Stamps slot i as the most recently used one. Must be called with `mut`
  // held. If the counter wraps around to 0, every stamp is reset so that
  // stale large stamps cannot make newer entries look like the oldest ones.
  // Both the lookup and the insertion path go through this helper, so the
  // wrap-around handling is the same in both places.
  auto touch = [&](size_t i)
    {
    last_access[i] = ++access_counter;
    // Guard against overflow
    if (access_counter == 0)
      last_access.fill(0);
    };

  // Looks for a cached plan of the requested length; returns nullptr when
  // there is none. Must be called with `mut` held.
  auto find_in_cache = [&]() -> std::shared_ptr<T>
    {
    for (size_t i=0; i<nmax; ++i)
      if (cache[i] && (cache[i]->length()==length))
        {
        // no need to update if this is already the most recent entry
        if (last_access[i]!=access_counter)
          touch(i);
        return cache[i];
        }

    return nullptr;
    };

  {
  std::lock_guard<std::mutex> lock(mut);
  auto p = find_in_cache();
  if (p) return p;
  }

  // Build the plan without holding the lock.
  auto plan = std::make_shared<T>(length);

  {
  std::lock_guard<std::mutex> lock(mut);
  // Another thread may have stored the same plan in the meantime.
  auto p = find_in_cache();
  if (p) return p;

  // pick the slot with the smallest stamp
  size_t lru = 0;
  for (size_t i=1; i<nmax; ++i)
    if (last_access[i] < last_access[lru])
      lru = i;

  cache[lru] = plan;
  touch(lru);
  }
  return plan;
#endif
  }
2984
+
2985
+ class arr_info
2986
+ {
2987
+ protected:
2988
+ shape_t shp;
2989
+ stride_t str;
2990
+
2991
+ public:
2992
+ arr_info(const shape_t &shape_, const stride_t &stride_)
2993
+ : shp(shape_), str(stride_) {}
2994
+ size_t ndim() const { return shp.size(); }
2995
+ size_t size() const { return util::prod(shp); }
2996
+ const shape_t &shape() const { return shp; }
2997
+ size_t shape(size_t i) const { return shp[i]; }
2998
+ const stride_t &stride() const { return str; }
2999
+ const ptrdiff_t &stride(size_t i) const { return str[i]; }
3000
+ };
3001
+
3002
+ template<typename T> class cndarr: public arr_info
3003
+ {
3004
+ protected:
3005
+ const char *d;
3006
+
3007
+ public:
3008
+ cndarr(const void *data_, const shape_t &shape_, const stride_t &stride_)
3009
+ : arr_info(shape_, stride_),
3010
+ d(reinterpret_cast<const char *>(data_)) {}
3011
+ const T &operator[](ptrdiff_t ofs) const
3012
+ { return *reinterpret_cast<const T *>(d+ofs); }
3013
+ };
3014
+
3015
+ template<typename T> class ndarr: public cndarr<T>
3016
+ {
3017
+ public:
3018
+ ndarr(void *data_, const shape_t &shape_, const stride_t &stride_)
3019
+ : cndarr<T>::cndarr(const_cast<const void *>(data_), shape_, stride_)
3020
+ {}
3021
+ T &operator[](ptrdiff_t ofs)
3022
+ { return *reinterpret_cast<T *>(const_cast<char *>(cndarr<T>::d+ofs)); }
3023
+ };
3024
+
3025
+ template<size_t N> class multi_iter
3026
+ {
3027
+ private:
3028
+ shape_t pos;
3029
+ const arr_info &iarr, &oarr;
3030
+ ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o;
3031
+ size_t idim, rem;
3032
+
3033
+ void advance_i()
3034
+ {
3035
+ for (int i_=int(pos.size())-1; i_>=0; --i_)
3036
+ {
3037
+ auto i = size_t(i_);
3038
+ if (i==idim) continue;
3039
+ p_ii += iarr.stride(i);
3040
+ p_oi += oarr.stride(i);
3041
+ if (++pos[i] < iarr.shape(i))
3042
+ return;
3043
+ pos[i] = 0;
3044
+ p_ii -= ptrdiff_t(iarr.shape(i))*iarr.stride(i);
3045
+ p_oi -= ptrdiff_t(oarr.shape(i))*oarr.stride(i);
3046
+ }
3047
+ }
3048
+
3049
+ public:
3050
+ multi_iter(const arr_info &iarr_, const arr_info &oarr_, size_t idim_)
3051
+ : pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0),
3052
+ str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)),
3053
+ idim(idim_), rem(iarr.size()/iarr.shape(idim))
3054
+ {
3055
+ auto nshares = threading::num_threads();
3056
+ if (nshares==1) return;
3057
+ if (nshares==0) throw std::runtime_error("can't run with zero threads");
3058
+ auto myshare = threading::thread_id();
3059
+ if (myshare>=nshares) throw std::runtime_error("impossible share requested");
3060
+ size_t nbase = rem/nshares;
3061
+ size_t additional = rem%nshares;
3062
+ size_t lo = myshare*nbase + ((myshare<additional) ? myshare : additional);
3063
+ size_t hi = lo+nbase+(myshare<additional);
3064
+ size_t todo = hi-lo;
3065
+
3066
+ size_t chunk = rem;
3067
+ for (size_t i=0; i<pos.size(); ++i)
3068
+ {
3069
+ if (i==idim) continue;
3070
+ chunk /= iarr.shape(i);
3071
+ size_t n_advance = lo/chunk;
3072
+ pos[i] += n_advance;
3073
+ p_ii += ptrdiff_t(n_advance)*iarr.stride(i);
3074
+ p_oi += ptrdiff_t(n_advance)*oarr.stride(i);
3075
+ lo -= n_advance*chunk;
3076
+ }
3077
+ rem = todo;
3078
+ }
3079
+ void advance(size_t n)
3080
+ {
3081
+ if (rem<n) throw std::runtime_error("underrun");
3082
+ for (size_t i=0; i<n; ++i)
3083
+ {
3084
+ p_i[i] = p_ii;
3085
+ p_o[i] = p_oi;
3086
+ advance_i();
3087
+ }
3088
+ rem -= n;
3089
+ }
3090
+ ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*str_i; }
3091
+ ptrdiff_t iofs(size_t j, size_t i) const { return p_i[j] + ptrdiff_t(i)*str_i; }
3092
+ ptrdiff_t oofs(size_t i) const { return p_o[0] + ptrdiff_t(i)*str_o; }
3093
+ ptrdiff_t oofs(size_t j, size_t i) const { return p_o[j] + ptrdiff_t(i)*str_o; }
3094
+ size_t length_in() const { return iarr.shape(idim); }
3095
+ size_t length_out() const { return oarr.shape(idim); }
3096
+ ptrdiff_t stride_in() const { return str_i; }
3097
+ ptrdiff_t stride_out() const { return str_o; }
3098
+ size_t remaining() const { return rem; }
3099
+ };
3100
+
3101
+ class simple_iter
3102
+ {
3103
+ private:
3104
+ shape_t pos;
3105
+ const arr_info &arr;
3106
+ ptrdiff_t p;
3107
+ size_t rem;
3108
+
3109
+ public:
3110
+ simple_iter(const arr_info &arr_)
3111
+ : pos(arr_.ndim(), 0), arr(arr_), p(0), rem(arr_.size()) {}
3112
+ void advance()
3113
+ {
3114
+ --rem;
3115
+ for (int i_=int(pos.size())-1; i_>=0; --i_)
3116
+ {
3117
+ auto i = size_t(i_);
3118
+ p += arr.stride(i);
3119
+ if (++pos[i] < arr.shape(i))
3120
+ return;
3121
+ pos[i] = 0;
3122
+ p -= ptrdiff_t(arr.shape(i))*arr.stride(i);
3123
+ }
3124
+ }
3125
+ ptrdiff_t ofs() const { return p; }
3126
+ size_t remaining() const { return rem; }
3127
+ };
3128
+
3129
+ class rev_iter
3130
+ {
3131
+ private:
3132
+ shape_t pos;
3133
+ const arr_info &arr;
3134
+ std::vector<char> rev_axis;
3135
+ std::vector<char> rev_jump;
3136
+ size_t last_axis, last_size;
3137
+ shape_t shp;
3138
+ ptrdiff_t p, rp;
3139
+ size_t rem;
3140
+
3141
+ public:
3142
+ rev_iter(const arr_info &arr_, const shape_t &axes)
3143
+ : pos(arr_.ndim(), 0), arr(arr_), rev_axis(arr_.ndim(), 0),
3144
+ rev_jump(arr_.ndim(), 1), p(0), rp(0)
3145
+ {
3146
+ for (auto ax: axes)
3147
+ rev_axis[ax]=1;
3148
+ last_axis = axes.back();
3149
+ last_size = arr.shape(last_axis)/2 + 1;
3150
+ shp = arr.shape();
3151
+ shp[last_axis] = last_size;
3152
+ rem=1;
3153
+ for (auto i: shp)
3154
+ rem *= i;
3155
+ }
3156
+ void advance()
3157
+ {
3158
+ --rem;
3159
+ for (int i_=int(pos.size())-1; i_>=0; --i_)
3160
+ {
3161
+ auto i = size_t(i_);
3162
+ p += arr.stride(i);
3163
+ if (!rev_axis[i])
3164
+ rp += arr.stride(i);
3165
+ else
3166
+ {
3167
+ rp -= arr.stride(i);
3168
+ if (rev_jump[i])
3169
+ {
3170
+ rp += ptrdiff_t(arr.shape(i))*arr.stride(i);
3171
+ rev_jump[i] = 0;
3172
+ }
3173
+ }
3174
+ if (++pos[i] < shp[i])
3175
+ return;
3176
+ pos[i] = 0;
3177
+ p -= ptrdiff_t(shp[i])*arr.stride(i);
3178
+ if (rev_axis[i])
3179
+ {
3180
+ rp -= ptrdiff_t(arr.shape(i)-shp[i])*arr.stride(i);
3181
+ rev_jump[i] = 1;
3182
+ }
3183
+ else
3184
+ rp -= ptrdiff_t(shp[i])*arr.stride(i);
3185
+ }
3186
+ }
3187
+ ptrdiff_t ofs() const { return p; }
3188
+ ptrdiff_t rev_ofs() const { return rp; }
3189
+ size_t remaining() const { return rem; }
3190
+ };
3191
+
3192
+ template<typename T> struct VTYPE {};
3193
+ template <typename T> using vtype_t = typename VTYPE<T>::type;
3194
+
3195
+ #ifndef POCKETFFT_NO_VECTORS
3196
+ template<> struct VTYPE<float>
3197
+ {
3198
+ using type = float __attribute__ ((vector_size (VLEN<float>::val*sizeof(float))));
3199
+ };
3200
+ template<> struct VTYPE<double>
3201
+ {
3202
+ using type = double __attribute__ ((vector_size (VLEN<double>::val*sizeof(double))));
3203
+ };
3204
+ template<> struct VTYPE<long double>
3205
+ {
3206
+ using type = long double __attribute__ ((vector_size (VLEN<long double>::val*sizeof(long double))));
3207
+ };
3208
+ #endif
3209
+
3210
+ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
3211
+ size_t axsize, size_t elemsize)
3212
+ {
3213
+ auto othersize = util::prod(shape)/axsize;
3214
+ auto tmpsize = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
3215
+ return arr<char>(tmpsize*elemsize);
3216
+ }
3217
+ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
3218
+ const shape_t &axes, size_t elemsize)
3219
+ {
3220
+ size_t fullsize=util::prod(shape);
3221
+ size_t tmpsize=0;
3222
+ for (size_t i=0; i<axes.size(); ++i)
3223
+ {
3224
+ auto axsize = shape[axes[i]];
3225
+ auto othersize = fullsize/axsize;
3226
+ auto sz = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
3227
+ if (sz>tmpsize) tmpsize=sz;
3228
+ }
3229
+ return arr<char>(tmpsize*elemsize);
3230
+ }
3231
+
3232
+ template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3233
+ const cndarr<cmplx<T>> &src, cmplx<vtype_t<T>> *POCKETFFT_RESTRICT dst)
3234
+ {
3235
+ for (size_t i=0; i<it.length_in(); ++i)
3236
+ for (size_t j=0; j<vlen; ++j)
3237
+ {
3238
+ dst[i].r[j] = src[it.iofs(j,i)].r;
3239
+ dst[i].i[j] = src[it.iofs(j,i)].i;
3240
+ }
3241
+ }
3242
+
3243
+ template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3244
+ const cndarr<T> &src, vtype_t<T> *POCKETFFT_RESTRICT dst)
3245
+ {
3246
+ for (size_t i=0; i<it.length_in(); ++i)
3247
+ for (size_t j=0; j<vlen; ++j)
3248
+ dst[i][j] = src[it.iofs(j,i)];
3249
+ }
3250
+
3251
+ template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3252
+ const cndarr<T> &src, T *POCKETFFT_RESTRICT dst)
3253
+ {
3254
+ if (dst == &src[it.iofs(0)]) return; // in-place
3255
+ for (size_t i=0; i<it.length_in(); ++i)
3256
+ dst[i] = src[it.iofs(i)];
3257
+ }
3258
+
3259
+ template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3260
+ const cmplx<vtype_t<T>> *POCKETFFT_RESTRICT src, ndarr<cmplx<T>> &dst)
3261
+ {
3262
+ for (size_t i=0; i<it.length_out(); ++i)
3263
+ for (size_t j=0; j<vlen; ++j)
3264
+ dst[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
3265
+ }
3266
+
3267
+ template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3268
+ const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3269
+ {
3270
+ for (size_t i=0; i<it.length_out(); ++i)
3271
+ for (size_t j=0; j<vlen; ++j)
3272
+ dst[it.oofs(j,i)] = src[i][j];
3273
+ }
3274
+
3275
+ template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3276
+ const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3277
+ {
3278
+ if (src == &dst[it.oofs(0)]) return; // in-place
3279
+ for (size_t i=0; i<it.length_out(); ++i)
3280
+ dst[it.oofs(i)] = src[i];
3281
+ }
3282
+
3283
+ template <typename T> struct add_vec { using type = vtype_t<T>; };
3284
+ template <typename T> struct add_vec<cmplx<T>>
3285
+ { using type = cmplx<vtype_t<T>>; };
3286
+ template <typename T> using add_vec_t = typename add_vec<T>::type;
3287
+
3288
+ template<typename Tplan, typename T, typename T0, typename Exec>
3289
+ POCKETFFT_NOINLINE void general_nd(const cndarr<T> &in, ndarr<T> &out,
3290
+ const shape_t &axes, T0 fct, size_t nthreads, const Exec & exec,
3291
+ const bool allow_inplace=true)
3292
+ {
3293
+ std::shared_ptr<Tplan> plan;
3294
+
3295
+ for (size_t iax=0; iax<axes.size(); ++iax)
3296
+ {
3297
+ size_t len=in.shape(axes[iax]);
3298
+ if ((!plan) || (len!=plan->length()))
3299
+ plan = get_plan<Tplan>(len);
3300
+
3301
+ threading::thread_map(
3302
+ util::thread_count(nthreads, in.shape(), axes[iax], VLEN<T>::val),
3303
+ [&] {
3304
+ constexpr auto vlen = VLEN<T0>::val;
3305
+ auto storage = alloc_tmp<T0>(in.shape(), len, sizeof(T));
3306
+ const auto &tin(iax==0? in : out);
3307
+ multi_iter<vlen> it(tin, out, axes[iax]);
3308
+ #ifndef POCKETFFT_NO_VECTORS
3309
+ if (vlen>1)
3310
+ while (it.remaining()>=vlen)
3311
+ {
3312
+ it.advance(vlen);
3313
+ auto tdatav = reinterpret_cast<add_vec_t<T> *>(storage.data());
3314
+ exec(it, tin, out, tdatav, *plan, fct);
3315
+ }
3316
+ #endif
3317
+ while (it.remaining()>0)
3318
+ {
3319
+ it.advance(1);
3320
+ auto buf = allow_inplace && it.stride_out() == sizeof(T) ?
3321
+ &out[it.oofs(0)] : reinterpret_cast<T *>(storage.data());
3322
+ exec(it, tin, out, buf, *plan, fct);
3323
+ }
3324
+ }); // end of parallel region
3325
+ fct = T0(1); // factor has been applied, use 1 for remaining axes
3326
+ }
3327
+ }
3328
+
3329
+ struct ExecC2C
3330
+ {
3331
+ bool forward;
3332
+
3333
+ template <typename T0, typename T, size_t vlen> void operator () (
3334
+ const multi_iter<vlen> &it, const cndarr<cmplx<T0>> &in,
3335
+ ndarr<cmplx<T0>> &out, T * buf, const pocketfft_c<T0> &plan, T0 fct) const
3336
+ {
3337
+ copy_input(it, in, buf);
3338
+ plan.exec(buf, fct, forward);
3339
+ copy_output(it, buf, out);
3340
+ }
3341
+ };
3342
+
3343
+ template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
3344
+ const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3345
+ {
3346
+ for (size_t j=0; j<vlen; ++j)
3347
+ dst[it.oofs(j,0)] = src[0][j];
3348
+ size_t i=1, i1=1, i2=it.length_out()-1;
3349
+ for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
3350
+ for (size_t j=0; j<vlen; ++j)
3351
+ {
3352
+ dst[it.oofs(j,i1)] = src[i][j]+src[i+1][j];
3353
+ dst[it.oofs(j,i2)] = src[i][j]-src[i+1][j];
3354
+ }
3355
+ if (i<it.length_out())
3356
+ for (size_t j=0; j<vlen; ++j)
3357
+ dst[it.oofs(j,i1)] = src[i][j];
3358
+ }
3359
+
3360
+ template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
3361
+ const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3362
+ {
3363
+ dst[it.oofs(0)] = src[0];
3364
+ size_t i=1, i1=1, i2=it.length_out()-1;
3365
+ for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
3366
+ {
3367
+ dst[it.oofs(i1)] = src[i]+src[i+1];
3368
+ dst[it.oofs(i2)] = src[i]-src[i+1];
3369
+ }
3370
+ if (i<it.length_out())
3371
+ dst[it.oofs(i1)] = src[i];
3372
+ }
3373
+
3374
+ struct ExecHartley
3375
+ {
3376
+ template <typename T0, typename T, size_t vlen> void operator () (
3377
+ const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out,
3378
+ T * buf, const pocketfft_r<T0> &plan, T0 fct) const
3379
+ {
3380
+ copy_input(it, in, buf);
3381
+ plan.exec(buf, fct, true);
3382
+ copy_hartley(it, buf, out);
3383
+ }
3384
+ };
3385
+
3386
+ struct ExecDcst
3387
+ {
3388
+ bool ortho;
3389
+ int type;
3390
+ bool cosine;
3391
+
3392
+ template <typename T0, typename T, typename Tplan, size_t vlen>
3393
+ void operator () (const multi_iter<vlen> &it, const cndarr<T0> &in,
3394
+ ndarr<T0> &out, T * buf, const Tplan &plan, T0 fct) const
3395
+ {
3396
+ copy_input(it, in, buf);
3397
+ plan.exec(buf, fct, ortho, type, cosine);
3398
+ copy_output(it, buf, out);
3399
+ }
3400
+ };
3401
+
3402
+ template<typename T> POCKETFFT_NOINLINE void general_r2c(
3403
+ const cndarr<T> &in, ndarr<cmplx<T>> &out, size_t axis, bool forward, T fct,
3404
+ size_t nthreads)
3405
+ {
3406
+ auto plan = get_plan<pocketfft_r<T>>(in.shape(axis));
3407
+ size_t len=in.shape(axis);
3408
+ threading::thread_map(
3409
+ util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
3410
+ [&] {
3411
+ constexpr auto vlen = VLEN<T>::val;
3412
+ auto storage = alloc_tmp<T>(in.shape(), len, sizeof(T));
3413
+ multi_iter<vlen> it(in, out, axis);
3414
+ #ifndef POCKETFFT_NO_VECTORS
3415
+ if (vlen>1)
3416
+ while (it.remaining()>=vlen)
3417
+ {
3418
+ it.advance(vlen);
3419
+ auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
3420
+ copy_input(it, in, tdatav);
3421
+ plan->exec(tdatav, fct, true);
3422
+ for (size_t j=0; j<vlen; ++j)
3423
+ out[it.oofs(j,0)].Set(tdatav[0][j]);
3424
+ size_t i=1, ii=1;
3425
+ if (forward)
3426
+ for (; i<len-1; i+=2, ++ii)
3427
+ for (size_t j=0; j<vlen; ++j)
3428
+ out[it.oofs(j,ii)].Set(tdatav[i][j], tdatav[i+1][j]);
3429
+ else
3430
+ for (; i<len-1; i+=2, ++ii)
3431
+ for (size_t j=0; j<vlen; ++j)
3432
+ out[it.oofs(j,ii)].Set(tdatav[i][j], -tdatav[i+1][j]);
3433
+ if (i<len)
3434
+ for (size_t j=0; j<vlen; ++j)
3435
+ out[it.oofs(j,ii)].Set(tdatav[i][j]);
3436
+ }
3437
+ #endif
3438
+ while (it.remaining()>0)
3439
+ {
3440
+ it.advance(1);
3441
+ auto tdata = reinterpret_cast<T *>(storage.data());
3442
+ copy_input(it, in, tdata);
3443
+ plan->exec(tdata, fct, true);
3444
+ out[it.oofs(0)].Set(tdata[0]);
3445
+ size_t i=1, ii=1;
3446
+ if (forward)
3447
+ for (; i<len-1; i+=2, ++ii)
3448
+ out[it.oofs(ii)].Set(tdata[i], tdata[i+1]);
3449
+ else
3450
+ for (; i<len-1; i+=2, ++ii)
3451
+ out[it.oofs(ii)].Set(tdata[i], -tdata[i+1]);
3452
+ if (i<len)
3453
+ out[it.oofs(ii)].Set(tdata[i]);
3454
+ }
3455
+ }); // end of parallel region
3456
+ }
3457
+ template<typename T> POCKETFFT_NOINLINE void general_c2r(
3458
+ const cndarr<cmplx<T>> &in, ndarr<T> &out, size_t axis, bool forward, T fct,
3459
+ size_t nthreads)
3460
+ {
3461
+ auto plan = get_plan<pocketfft_r<T>>(out.shape(axis));
3462
+ size_t len=out.shape(axis);
3463
+ threading::thread_map(
3464
+ util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
3465
+ [&] {
3466
+ constexpr auto vlen = VLEN<T>::val;
3467
+ auto storage = alloc_tmp<T>(out.shape(), len, sizeof(T));
3468
+ multi_iter<vlen> it(in, out, axis);
3469
+ #ifndef POCKETFFT_NO_VECTORS
3470
+ if (vlen>1)
3471
+ while (it.remaining()>=vlen)
3472
+ {
3473
+ it.advance(vlen);
3474
+ auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
3475
+ for (size_t j=0; j<vlen; ++j)
3476
+ tdatav[0][j]=in[it.iofs(j,0)].r;
3477
+ {
3478
+ size_t i=1, ii=1;
3479
+ if (forward)
3480
+ for (; i<len-1; i+=2, ++ii)
3481
+ for (size_t j=0; j<vlen; ++j)
3482
+ {
3483
+ tdatav[i ][j] = in[it.iofs(j,ii)].r;
3484
+ tdatav[i+1][j] = -in[it.iofs(j,ii)].i;
3485
+ }
3486
+ else
3487
+ for (; i<len-1; i+=2, ++ii)
3488
+ for (size_t j=0; j<vlen; ++j)
3489
+ {
3490
+ tdatav[i ][j] = in[it.iofs(j,ii)].r;
3491
+ tdatav[i+1][j] = in[it.iofs(j,ii)].i;
3492
+ }
3493
+ if (i<len)
3494
+ for (size_t j=0; j<vlen; ++j)
3495
+ tdatav[i][j] = in[it.iofs(j,ii)].r;
3496
+ }
3497
+ plan->exec(tdatav, fct, false);
3498
+ copy_output(it, tdatav, out);
3499
+ }
3500
+ #endif
3501
+ while (it.remaining()>0)
3502
+ {
3503
+ it.advance(1);
3504
+ auto tdata = reinterpret_cast<T *>(storage.data());
3505
+ tdata[0]=in[it.iofs(0)].r;
3506
+ {
3507
+ size_t i=1, ii=1;
3508
+ if (forward)
3509
+ for (; i<len-1; i+=2, ++ii)
3510
+ {
3511
+ tdata[i ] = in[it.iofs(ii)].r;
3512
+ tdata[i+1] = -in[it.iofs(ii)].i;
3513
+ }
3514
+ else
3515
+ for (; i<len-1; i+=2, ++ii)
3516
+ {
3517
+ tdata[i ] = in[it.iofs(ii)].r;
3518
+ tdata[i+1] = in[it.iofs(ii)].i;
3519
+ }
3520
+ if (i<len)
3521
+ tdata[i] = in[it.iofs(ii)].r;
3522
+ }
3523
+ plan->exec(tdata, fct, false);
3524
+ copy_output(it, tdata, out);
3525
+ }
3526
+ }); // end of parallel region
3527
+ }
3528
+
3529
+ struct ExecR2R
3530
+ {
3531
+ bool r2h, forward;
3532
+
3533
+ template <typename T0, typename T, size_t vlen> void operator () (
3534
+ const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out, T * buf,
3535
+ const pocketfft_r<T0> &plan, T0 fct) const
3536
+ {
3537
+ copy_input(it, in, buf);
3538
+ if ((!r2h) && forward)
3539
+ for (size_t i=2; i<it.length_out(); i+=2)
3540
+ buf[i] = -buf[i];
3541
+ plan.exec(buf, fct, r2h);
3542
+ if (r2h && (!forward))
3543
+ for (size_t i=2; i<it.length_out(); i+=2)
3544
+ buf[i] = -buf[i];
3545
+ copy_output(it, buf, out);
3546
+ }
3547
+ };
3548
+
3549
+ template<typename T> void c2c(const shape_t &shape, const stride_t &stride_in,
3550
+ const stride_t &stride_out, const shape_t &axes, bool forward,
3551
+ const std::complex<T> *data_in, std::complex<T> *data_out, T fct,
3552
+ size_t nthreads=1)
3553
+ {
3554
+ if (util::prod(shape)==0) return;
3555
+ util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3556
+ cndarr<cmplx<T>> ain(data_in, shape, stride_in);
3557
+ ndarr<cmplx<T>> aout(data_out, shape, stride_out);
3558
+ general_nd<pocketfft_c<T>>(ain, aout, axes, fct, nthreads, ExecC2C{forward});
3559
+ }
3560
+
3561
+ template<typename T> void dct(const shape_t &shape,
3562
+ const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3563
+ int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1)
3564
+ {
3565
+ if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type");
3566
+ if (util::prod(shape)==0) return;
3567
+ util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3568
+ cndarr<T> ain(data_in, shape, stride_in);
3569
+ ndarr<T> aout(data_out, shape, stride_out);
3570
+ const ExecDcst exec{ortho, type, true};
3571
+ if (type==1)
3572
+ general_nd<T_dct1<T>>(ain, aout, axes, fct, nthreads, exec);
3573
+ else if (type==4)
3574
+ general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec);
3575
+ else
3576
+ general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec);
3577
+ }
3578
+
3579
+ template<typename T> void dst(const shape_t &shape,
3580
+ const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3581
+ int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1)
3582
+ {
3583
+ if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type");
3584
+ if (util::prod(shape)==0) return;
3585
+ util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3586
+ cndarr<T> ain(data_in, shape, stride_in);
3587
+ ndarr<T> aout(data_out, shape, stride_out);
3588
+ const ExecDcst exec{ortho, type, false};
3589
+ if (type==1)
3590
+ general_nd<T_dst1<T>>(ain, aout, axes, fct, nthreads, exec);
3591
+ else if (type==4)
3592
+ general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec);
3593
+ else
3594
+ general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec);
3595
+ }
3596
+
3597
+ template<typename T> void r2c(const shape_t &shape_in,
3598
+ const stride_t &stride_in, const stride_t &stride_out, size_t axis,
3599
+ bool forward, const T *data_in, std::complex<T> *data_out, T fct,
3600
+ size_t nthreads=1)
3601
+ {
3602
+ if (util::prod(shape_in)==0) return;
3603
+ util::sanity_check(shape_in, stride_in, stride_out, false, axis);
3604
+ cndarr<T> ain(data_in, shape_in, stride_in);
3605
+ shape_t shape_out(shape_in);
3606
+ shape_out[axis] = shape_in[axis]/2 + 1;
3607
+ ndarr<cmplx<T>> aout(data_out, shape_out, stride_out);
3608
+ general_r2c(ain, aout, axis, forward, fct, nthreads);
3609
+ }
3610
+
3611
+ template<typename T> void r2c(const shape_t &shape_in,
3612
+ const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3613
+ bool forward, const T *data_in, std::complex<T> *data_out, T fct,
3614
+ size_t nthreads=1)
3615
+ {
3616
+ if (util::prod(shape_in)==0) return;
3617
+ util::sanity_check(shape_in, stride_in, stride_out, false, axes);
3618
+ r2c(shape_in, stride_in, stride_out, axes.back(), forward, data_in, data_out,
3619
+ fct, nthreads);
3620
+ if (axes.size()==1) return;
3621
+
3622
+ shape_t shape_out(shape_in);
3623
+ shape_out[axes.back()] = shape_in[axes.back()]/2 + 1;
3624
+ auto newaxes = shape_t{axes.begin(), --axes.end()};
3625
+ c2c(shape_out, stride_out, stride_out, newaxes, forward, data_out, data_out,
3626
+ T(1), nthreads);
3627
+ }
3628
+
3629
+ template<typename T> void c2r(const shape_t &shape_out,
3630
+ const stride_t &stride_in, const stride_t &stride_out, size_t axis,
3631
+ bool forward, const std::complex<T> *data_in, T *data_out, T fct,
3632
+ size_t nthreads=1)
3633
+ {
3634
+ if (util::prod(shape_out)==0) return;
3635
+ util::sanity_check(shape_out, stride_in, stride_out, false, axis);
3636
+ shape_t shape_in(shape_out);
3637
+ shape_in[axis] = shape_out[axis]/2 + 1;
3638
+ cndarr<cmplx<T>> ain(data_in, shape_in, stride_in);
3639
+ ndarr<T> aout(data_out, shape_out, stride_out);
3640
+ general_c2r(ain, aout, axis, forward, fct, nthreads);
3641
+ }
3642
+
3643
+ template<typename T> void c2r(const shape_t &shape_out,
3644
+ const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3645
+ bool forward, const std::complex<T> *data_in, T *data_out, T fct,
3646
+ size_t nthreads=1)
3647
+ {
3648
+ if (util::prod(shape_out)==0) return;
3649
+ if (axes.size()==1)
3650
+ return c2r(shape_out, stride_in, stride_out, axes[0], forward,
3651
+ data_in, data_out, fct, nthreads);
3652
+ util::sanity_check(shape_out, stride_in, stride_out, false, axes);
3653
+ auto shape_in = shape_out;
3654
+ shape_in[axes.back()] = shape_out[axes.back()]/2 + 1;
3655
+ auto nval = util::prod(shape_in);
3656
+ stride_t stride_inter(shape_in.size());
3657
+ stride_inter.back() = sizeof(cmplx<T>);
3658
+ for (int i=int(shape_in.size())-2; i>=0; --i)
3659
+ stride_inter[size_t(i)] =
3660
+ stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[size_t(i+1)]);
3661
+ arr<std::complex<T>> tmp(nval);
3662
+ auto newaxes = shape_t{axes.begin(), --axes.end()};
3663
+ c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.data(),
3664
+ T(1), nthreads);
3665
+ c2r(shape_out, stride_inter, stride_out, axes.back(), forward,
3666
+ tmp.data(), data_out, fct, nthreads);
3667
+ }
3668
+
3669
+ template<typename T> void r2r_fftpack(const shape_t &shape,
3670
+ const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3671
+ bool real2hermitian, bool forward, const T *data_in, T *data_out, T fct,
3672
+ size_t nthreads=1)
3673
+ {
3674
+ if (util::prod(shape)==0) return;
3675
+ util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3676
+ cndarr<T> ain(data_in, shape, stride_in);
3677
+ ndarr<T> aout(data_out, shape, stride_out);
3678
+ general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads,
3679
+ ExecR2R{real2hermitian, forward});
3680
+ }
3681
+
3682
+ template<typename T> void r2r_separable_hartley(const shape_t &shape,
3683
+ const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3684
+ const T *data_in, T *data_out, T fct, size_t nthreads=1)
3685
+ {
3686
+ if (util::prod(shape)==0) return;
3687
+ util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3688
+ cndarr<T> ain(data_in, shape, stride_in);
3689
+ ndarr<T> aout(data_out, shape, stride_out);
3690
+ general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, ExecHartley{},
3691
+ false);
3692
+ }
3693
+
3694
+ template<typename T> void r2r_genuine_hartley(const shape_t &shape,
3695
+ const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3696
+ const T *data_in, T *data_out, T fct, size_t nthreads=1)
3697
+ {
3698
+ if (util::prod(shape)==0) return;
3699
+ if (axes.size()==1)
3700
+ return r2r_separable_hartley(shape, stride_in, stride_out, axes, data_in,
3701
+ data_out, fct, nthreads);
3702
+ util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3703
+ shape_t tshp(shape);
3704
+ tshp[axes.back()] = tshp[axes.back()]/2+1;
3705
+ arr<std::complex<T>> tdata(util::prod(tshp));
3706
+ stride_t tstride(shape.size());
3707
+ tstride.back()=sizeof(std::complex<T>);
3708
+ for (size_t i=tstride.size()-1; i>0; --i)
3709
+ tstride[i-1]=tstride[i]*ptrdiff_t(tshp[i]);
3710
+ r2c(shape, stride_in, tstride, axes, true, data_in, tdata.data(), fct, nthreads);
3711
+ cndarr<cmplx<T>> atmp(tdata.data(), tshp, tstride);
3712
+ ndarr<T> aout(data_out, shape, stride_out);
3713
+ simple_iter iin(atmp);
3714
+ rev_iter iout(aout, axes);
3715
+ while(iin.remaining()>0)
3716
+ {
3717
+ auto v = atmp[iin.ofs()];
3718
+ aout[iout.ofs()] = v.r+v.i;
3719
+ aout[iout.rev_ofs()] = v.r-v.i;
3720
+ iin.advance(); iout.advance();
3721
+ }
3722
+ }
3723
+
3724
+ } // namespace detail
3725
+
3726
+ using detail::FORWARD;
3727
+ using detail::BACKWARD;
3728
+ using detail::shape_t;
3729
+ using detail::stride_t;
3730
+ using detail::c2c;
3731
+ using detail::c2r;
3732
+ using detail::r2c;
3733
+ using detail::r2r_fftpack;
3734
+ using detail::r2r_separable_hartley;
3735
+ using detail::r2r_genuine_hartley;
3736
+ using detail::dct;
3737
+ using detail::dst;
3738
+
3739
+ } // namespace pocketfft
3740
+
3741
+ #undef POCKETFFT_NOINLINE
3742
+ #undef POCKETFFT_RESTRICT
3743
+
3744
+ #endif // POCKETFFT_HDRONLY_H