casadi 3.6.7__cp313-none-macosx_11_0_arm64.whl → 3.7.1__cp313-none-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- casadi/_casadi.so +0 -0
- casadi/casadi-cli +0 -0
- casadi/casadi.py +6531 -3039
- casadi/cbc +0 -0
- casadi/clp +0 -0
- casadi/cmake/casadi-config-version.cmake +1 -1
- casadi/cmake/casadi-config.cmake +1 -1
- casadi/cmake/casadi-targets-release.cmake +5 -5
- casadi/cmake/casadi-targets.cmake +7 -7
- casadi/{lib/cmake/tinyxml2/tinyxml2-config-version.cmake → cmake/ghc_filesystem/ghc_filesystem-config-version.cmake} +30 -10
- casadi/cmake/ghc_filesystem/ghc_filesystem-config.cmake +30 -0
- casadi/{lib/cmake/tinyxml2/tinyxml2-static-targets.cmake → cmake/ghc_filesystem/ghc_filesystem-targets.cmake} +10 -10
- casadi/cmake/highs/highs-targets-release.cmake +2 -2
- casadi/cmake/highs/highs-targets.cmake +3 -3
- casadi/cmake/libzip/libzip-config-version.cmake +43 -0
- casadi/cmake/libzip/libzip-config.cmake +69 -0
- casadi/cmake/libzip/libzip-targets-release.cmake +19 -0
- casadi/cmake/libzip/libzip-targets.cmake +107 -0
- casadi/cmake/libzip/modules/FindMbedTLS.cmake +141 -0
- casadi/cmake/libzip/modules/FindNettle.cmake +141 -0
- casadi/cmake/libzip/modules/Findzstd.cmake +186 -0
- casadi/cmake/osqp/osqp-targets.cmake +2 -2
- casadi/cmake/proxsuite/proxsuiteTargets.cmake +2 -2
- casadi/cmake/qdldl/qdldl-targets.cmake +2 -2
- casadi/cmake/sleqp/sleqp-targets.cmake +2 -2
- casadi/cmake/trlib/trlib-config.cmake +2 -2
- casadi/highs +0 -0
- casadi/include/casadi/casadi.i +276 -47
- casadi/include/casadi/config.h +11 -11
- casadi/include/casadi/core/archiver.hpp +58 -0
- casadi/include/casadi/core/blazing_spline.hpp +47 -0
- casadi/include/casadi/core/calculus.hpp +57 -2
- casadi/include/casadi/core/callback.hpp +9 -0
- casadi/include/casadi/core/casadi_common.hpp +37 -0
- casadi/include/casadi/core/casadi_meta.hpp +15 -0
- casadi/include/casadi/core/casadi_misc.hpp +21 -0
- casadi/include/casadi/core/code_generator.hpp +115 -19
- casadi/include/casadi/core/core.hpp +5 -0
- casadi/include/casadi/core/dae_builder.hpp +303 -141
- casadi/include/casadi/core/dm.hpp +3 -0
- casadi/include/casadi/core/filesystem.hpp +58 -0
- casadi/include/casadi/core/fmu.hpp +62 -16
- casadi/include/casadi/core/function.hpp +24 -0
- casadi/include/casadi/core/generic_matrix.hpp +214 -7
- casadi/include/casadi/core/generic_shared.hpp +395 -0
- casadi/include/casadi/core/generic_shared_impl.hpp +218 -0
- casadi/include/casadi/core/generic_shared_internal.hpp +215 -0
- casadi/include/casadi/core/generic_type.hpp +3 -0
- casadi/include/casadi/core/global_options.hpp +10 -0
- casadi/include/casadi/core/integrator.hpp +41 -7
- casadi/include/casadi/core/matrix_decl.hpp +71 -0
- casadi/include/casadi/core/mx.hpp +63 -2
- casadi/include/casadi/core/nlp_builder.hpp +2 -1
- casadi/include/casadi/core/options.hpp +6 -3
- casadi/include/casadi/core/optistack.hpp +43 -9
- casadi/include/casadi/core/printable.hpp +8 -0
- casadi/include/casadi/core/resource.hpp +107 -0
- casadi/include/casadi/core/runtime/casadi_blazing_1d_boor_eval.hpp +112 -0
- casadi/include/casadi/core/runtime/casadi_blazing_2d_boor_eval.hpp +311 -0
- casadi/include/casadi/core/runtime/casadi_blazing_3d_boor_eval.hpp +645 -0
- casadi/include/casadi/core/runtime/casadi_blazing_de_boor.hpp +101 -0
- casadi/include/casadi/core/runtime/casadi_finite_diff.hpp +1 -1
- casadi/include/casadi/core/runtime/casadi_nlp.hpp +8 -2
- casadi/include/casadi/core/runtime/casadi_print_canonical.hpp +55 -0
- casadi/include/casadi/core/runtime/casadi_print_scalar.hpp +25 -0
- casadi/include/casadi/core/runtime/casadi_print_vector.hpp +32 -0
- casadi/include/casadi/core/runtime/casadi_printme.hpp +26 -0
- casadi/include/casadi/core/serializer.hpp +13 -5
- casadi/include/casadi/core/serializing_stream.hpp +9 -2
- casadi/include/casadi/core/shared_object.hpp +73 -161
- casadi/include/casadi/core/sparsity.hpp +13 -1
- casadi/include/casadi/core/sparsity_interface.hpp +19 -1
- casadi/include/casadi/core/sx.hpp +41 -0
- casadi/include/casadi/core/sx_elem.hpp +25 -0
- casadi/include/casadi/core/xml_node.hpp +5 -0
- casadi/include/casadi/doc.i +10026 -6513
- casadi/include/casadi/doc_merged.i +6744 -4449
- casadi/include/casadi/valgrind-casadi.supp +138 -0
- casadi/include/casadi/valgrind-python.supp +2470 -0
- casadi/include/fatrop/blasfeo_wrapper/LinearAlgebraBlasfeo.hpp +4 -1
- casadi/include/ghc/filesystem.hpp +6083 -0
- casadi/include/ghc/fs_fwd.hpp +38 -0
- casadi/include/ghc/fs_impl.hpp +35 -0
- casadi/include/ghc/fs_std.hpp +60 -0
- casadi/include/ghc/fs_std_fwd.hpp +63 -0
- casadi/include/ghc/fs_std_impl.hpp +46 -0
- casadi/include/highs/HConfig.h +6 -4
- casadi/include/highs/Highs.h +260 -86
- casadi/include/highs/filereaderlp/reader.hpp +10 -10
- casadi/include/highs/interfaces/highs_c_api.h +195 -22
- casadi/include/highs/io/Filereader.h +4 -6
- casadi/include/highs/io/FilereaderEms.h +0 -3
- casadi/include/highs/io/FilereaderLp.h +0 -3
- casadi/include/highs/io/FilereaderMps.h +0 -3
- casadi/include/highs/io/HMPSIO.h +1 -4
- casadi/include/highs/io/HMpsFF.h +8 -5
- casadi/include/highs/io/HighsIO.h +10 -5
- casadi/include/highs/io/LoadOptions.h +0 -3
- casadi/include/highs/ipm/IpxSolution.h +0 -3
- casadi/include/highs/ipm/IpxWrapper.h +4 -7
- casadi/include/highs/ipm/ipx/control.h +1 -0
- casadi/include/highs/ipm/ipx/ipx_parameters.h +1 -0
- casadi/include/highs/ipm/ipx/lp_solver.h +3 -2
- casadi/include/highs/ipm/ipx/model.h +7 -3
- casadi/include/highs/lp_data/HConst.h +21 -6
- casadi/include/highs/lp_data/HStruct.h +40 -4
- casadi/include/highs/lp_data/HighsAnalysis.h +0 -3
- casadi/include/highs/lp_data/HighsCallback.h +10 -3
- casadi/include/highs/lp_data/HighsCallbackStruct.h +3 -3
- casadi/include/highs/lp_data/HighsDebug.h +0 -3
- casadi/include/highs/lp_data/HighsIis.h +62 -0
- casadi/include/highs/lp_data/HighsInfo.h +50 -43
- casadi/include/highs/lp_data/HighsInfoDebug.h +0 -3
- casadi/include/highs/lp_data/HighsLp.h +2 -3
- casadi/include/highs/lp_data/HighsLpSolverObject.h +0 -3
- casadi/include/highs/lp_data/HighsLpUtils.h +38 -6
- casadi/include/highs/lp_data/HighsModelUtils.h +21 -18
- casadi/include/highs/lp_data/HighsOptions.h +134 -22
- casadi/include/highs/lp_data/HighsRanging.h +0 -3
- casadi/include/highs/lp_data/HighsSolution.h +10 -3
- casadi/include/highs/lp_data/HighsSolutionDebug.h +0 -3
- casadi/include/highs/lp_data/HighsSolve.h +0 -3
- casadi/include/highs/lp_data/HighsStatus.h +0 -3
- casadi/include/highs/mip/HighsCliqueTable.h +3 -3
- casadi/include/highs/mip/HighsConflictPool.h +0 -3
- casadi/include/highs/mip/HighsCutGeneration.h +13 -3
- casadi/include/highs/mip/HighsCutPool.h +0 -3
- casadi/include/highs/mip/HighsDebugSol.h +0 -3
- casadi/include/highs/mip/HighsDomain.h +16 -4
- casadi/include/highs/mip/HighsDomainChange.h +0 -3
- casadi/include/highs/mip/HighsDynamicRowMatrix.h +0 -3
- casadi/include/highs/mip/HighsGFkSolve.h +3 -6
- casadi/include/highs/mip/HighsImplications.h +29 -5
- casadi/include/highs/mip/HighsLpAggregator.h +0 -3
- casadi/include/highs/mip/HighsLpRelaxation.h +0 -3
- casadi/include/highs/mip/HighsMipAnalysis.h +52 -0
- casadi/include/highs/mip/HighsMipSolver.h +11 -4
- casadi/include/highs/mip/HighsMipSolverData.h +67 -6
- casadi/include/highs/mip/HighsModkSeparator.h +1 -4
- casadi/include/highs/mip/HighsNodeQueue.h +0 -3
- casadi/include/highs/mip/HighsObjectiveFunction.h +0 -3
- casadi/include/highs/mip/HighsPathSeparator.h +1 -4
- casadi/include/highs/mip/HighsPrimalHeuristics.h +7 -5
- casadi/include/highs/mip/HighsPseudocost.h +0 -3
- casadi/include/highs/mip/HighsRedcostFixing.h +0 -3
- casadi/include/highs/mip/HighsSearch.h +0 -3
- casadi/include/highs/mip/HighsSeparation.h +0 -3
- casadi/include/highs/mip/HighsSeparator.h +1 -5
- casadi/include/highs/mip/HighsTableauSeparator.h +1 -4
- casadi/include/highs/mip/HighsTransformedLp.h +0 -3
- casadi/include/highs/mip/MipTimer.h +471 -0
- casadi/include/highs/parallel/HighsBinarySemaphore.h +1 -1
- casadi/include/highs/parallel/HighsCacheAlign.h +1 -1
- casadi/include/highs/parallel/HighsCombinable.h +1 -1
- casadi/include/highs/parallel/HighsMutex.h +1 -1
- casadi/include/highs/parallel/HighsRaceTimer.h +1 -1
- casadi/include/highs/parallel/HighsSchedulerConstants.h +1 -1
- casadi/include/highs/parallel/HighsSplitDeque.h +1 -1
- casadi/include/highs/parallel/HighsTaskExecutor.h +68 -55
- casadi/include/highs/pdlp/CupdlpWrapper.h +18 -7
- casadi/include/highs/pdlp/cupdlp/cupdlp_cs.h +2 -3
- casadi/include/highs/pdlp/cupdlp/cupdlp_defs.h +16 -6
- casadi/include/highs/pdlp/cupdlp/cupdlp_linalg.h +10 -4
- casadi/include/highs/pdlp/cupdlp/cupdlp_proj.h +3 -3
- casadi/include/highs/pdlp/cupdlp/cupdlp_restart.h +6 -6
- casadi/include/highs/pdlp/cupdlp/cupdlp_scaling.h +26 -0
- casadi/include/highs/pdlp/cupdlp/cupdlp_solver.h +11 -11
- casadi/include/highs/pdlp/cupdlp/cupdlp_step.h +8 -4
- casadi/include/highs/pdlp/cupdlp/cupdlp_utils.c +253 -172
- casadi/include/highs/presolve/HPresolve.h +50 -10
- casadi/include/highs/presolve/HPresolveAnalysis.h +0 -3
- casadi/include/highs/presolve/HighsPostsolveStack.h +39 -6
- casadi/include/highs/presolve/HighsSymmetry.h +0 -3
- casadi/include/highs/presolve/ICrash.h +0 -3
- casadi/include/highs/presolve/ICrashUtil.h +1 -4
- casadi/include/highs/presolve/ICrashX.h +0 -3
- casadi/include/highs/presolve/PresolveComponent.h +0 -3
- casadi/include/highs/qpsolver/a_asm.hpp +20 -17
- casadi/include/highs/qpsolver/a_quass.hpp +5 -9
- casadi/include/highs/qpsolver/basis.hpp +12 -10
- casadi/include/highs/qpsolver/crashsolution.hpp +4 -3
- casadi/include/highs/qpsolver/dantzigpricing.hpp +3 -2
- casadi/include/highs/qpsolver/devexpricing.hpp +3 -1
- casadi/include/highs/qpsolver/factor.hpp +6 -5
- casadi/include/highs/qpsolver/feasibility_bounded.hpp +31 -29
- casadi/include/highs/qpsolver/feasibility_highs.hpp +94 -70
- casadi/include/highs/qpsolver/gradient.hpp +1 -1
- casadi/include/highs/qpsolver/instance.hpp +1 -1
- casadi/include/highs/qpsolver/pricing.hpp +2 -2
- casadi/include/highs/qpsolver/qpconst.hpp +2 -2
- casadi/include/highs/qpsolver/quass.hpp +2 -2
- casadi/include/highs/qpsolver/runtime.hpp +2 -2
- casadi/include/highs/qpsolver/settings.hpp +20 -8
- casadi/include/highs/qpsolver/steepestedgepricing.hpp +38 -39
- casadi/include/highs/simplex/HApp.h +88 -34
- casadi/include/highs/simplex/HEkk.h +24 -11
- casadi/include/highs/simplex/HEkkDual.h +2 -4
- casadi/include/highs/simplex/HEkkDualRHS.h +0 -3
- casadi/include/highs/simplex/HEkkDualRow.h +0 -3
- casadi/include/highs/simplex/HEkkPrimal.h +1 -5
- casadi/include/highs/simplex/HSimplex.h +0 -3
- casadi/include/highs/simplex/HSimplexDebug.h +0 -3
- casadi/include/highs/simplex/HSimplexNla.h +0 -26
- casadi/include/highs/simplex/HSimplexReport.h +0 -3
- casadi/include/highs/simplex/HighsSimplexAnalysis.h +5 -8
- casadi/include/highs/simplex/SimplexConst.h +10 -11
- casadi/include/highs/simplex/SimplexStruct.h +9 -12
- casadi/include/highs/simplex/SimplexTimer.h +83 -101
- casadi/include/highs/test/DevKkt.h +0 -3
- casadi/include/highs/test/KktCh2.h +0 -3
- casadi/include/highs/util/FactorTimer.h +39 -64
- casadi/include/highs/util/HFactor.h +4 -4
- casadi/include/highs/util/HFactorConst.h +0 -3
- casadi/include/highs/util/HFactorDebug.h +0 -3
- casadi/include/highs/util/HSet.h +0 -3
- casadi/include/highs/util/HVector.h +0 -3
- casadi/include/highs/util/HVectorBase.h +1 -4
- casadi/include/highs/util/HighsCDouble.h +12 -3
- casadi/include/highs/util/HighsComponent.h +0 -3
- casadi/include/highs/util/HighsDataStack.h +0 -3
- casadi/include/highs/util/HighsDisjointSets.h +0 -3
- casadi/include/highs/util/HighsHash.h +9 -12
- casadi/include/highs/util/HighsHashTree.h +15 -11
- casadi/include/highs/util/HighsInt.h +0 -3
- casadi/include/highs/util/HighsIntegers.h +0 -3
- casadi/include/highs/util/HighsLinearSumBounds.h +0 -3
- casadi/include/highs/util/HighsMatrixPic.h +0 -3
- casadi/include/highs/util/HighsMatrixSlice.h +3 -6
- casadi/include/highs/util/HighsMatrixUtils.h +0 -3
- casadi/include/highs/util/HighsMemoryAllocation.h +11 -3
- casadi/include/highs/util/HighsRandom.h +3 -6
- casadi/include/highs/util/HighsRbTree.h +0 -3
- casadi/include/highs/util/HighsSort.h +0 -3
- casadi/include/highs/util/HighsSparseMatrix.h +6 -3
- casadi/include/highs/util/HighsSparseVectorSum.h +0 -3
- casadi/include/highs/util/HighsSplay.h +0 -3
- casadi/include/highs/util/HighsTimer.h +94 -56
- casadi/include/highs/util/HighsUtils.h +10 -3
- casadi/include/highs/util/stringutil.h +14 -12
- casadi/include/licenses/LICENSE.bzip2.txt +42 -0
- casadi/include/licenses/ghc-external/LICENSE +19 -0
- casadi/include/licenses/libz-external/LICENSE +22 -0
- casadi/include/licenses/libz-external/contrib/dotzlib/LICENSE_1_0.txt +23 -0
- casadi/include/licenses/libzip-external/LICENSE +31 -0
- casadi/include/zconf.h +545 -0
- casadi/include/zip.h +528 -0
- casadi/include/zipconf.h +48 -0
- casadi/include/zlib.h +1938 -0
- casadi/libCbc.3.10.11.dylib +0 -0
- casadi/libCbc.3.dylib +0 -0
- casadi/libCbc.dylib +0 -0
- casadi/libCbc.la +1 -1
- casadi/libCbcSolver.3.10.11.dylib +0 -0
- casadi/libCbcSolver.3.dylib +0 -0
- casadi/libCbcSolver.dylib +0 -0
- casadi/libCbcSolver.la +1 -1
- casadi/libCgl.1.10.8.dylib +0 -0
- casadi/libCgl.1.dylib +0 -0
- casadi/libCgl.dylib +0 -0
- casadi/libCgl.la +1 -1
- casadi/libClp.1.14.9.dylib +0 -0
- casadi/libClp.1.dylib +0 -0
- casadi/libClp.dylib +0 -0
- casadi/libClp.la +1 -1
- casadi/libClpSolver.1.14.9.dylib +0 -0
- casadi/libClpSolver.1.dylib +0 -0
- casadi/libClpSolver.dylib +0 -0
- casadi/libClpSolver.la +1 -1
- casadi/libCoinUtils.3.11.10.dylib +0 -0
- casadi/libCoinUtils.3.dylib +0 -0
- casadi/libCoinUtils.dylib +0 -0
- casadi/libOsi.1.13.9.dylib +0 -0
- casadi/libOsi.1.dylib +0 -0
- casadi/libOsi.dylib +0 -0
- casadi/libOsiCbc.3.10.11.dylib +0 -0
- casadi/libOsiCbc.3.dylib +0 -0
- casadi/libOsiCbc.dylib +0 -0
- casadi/libOsiCbc.la +1 -1
- casadi/libOsiClp.1.14.9.dylib +0 -0
- casadi/libOsiClp.1.dylib +0 -0
- casadi/libOsiClp.dylib +0 -0
- casadi/libOsiClp.la +1 -1
- casadi/libOsiCommonTests.1.13.9.dylib +0 -0
- casadi/libOsiCommonTests.1.dylib +0 -0
- casadi/libOsiCommonTests.dylib +0 -0
- casadi/libblasfeo.dylib +0 -0
- casadi/libbonmin.4.8.9.dylib +0 -0
- casadi/libbonmin.4.dylib +0 -0
- casadi/libbonmin.dylib +0 -0
- casadi/libbonmin.la +1 -1
- casadi/libbz2.1.0.8.dylib +0 -0
- casadi/libc++.1.0.dylib +0 -0
- casadi/libcasadi.3.7.dylib +0 -0
- casadi/libcasadi.dylib +0 -0
- casadi/libcasadi_archiver_libzip.3.7.dylib +0 -0
- casadi/libcasadi_archiver_libzip.dylib +0 -0
- casadi/libcasadi_conic_cbc.3.7.dylib +0 -0
- casadi/libcasadi_conic_cbc.dylib +0 -0
- casadi/libcasadi_conic_clp.3.7.dylib +0 -0
- casadi/libcasadi_conic_clp.dylib +0 -0
- casadi/libcasadi_conic_cplex.3.7.dylib +0 -0
- casadi/libcasadi_conic_cplex.dylib +0 -0
- casadi/libcasadi_conic_daqp.3.7.dylib +0 -0
- casadi/libcasadi_conic_daqp.dylib +0 -0
- casadi/libcasadi_conic_fatrop.3.7.dylib +0 -0
- casadi/libcasadi_conic_fatrop.dylib +0 -0
- casadi/libcasadi_conic_gurobi.3.7.dylib +0 -0
- casadi/libcasadi_conic_gurobi.dylib +0 -0
- casadi/libcasadi_conic_highs.3.7.dylib +0 -0
- casadi/libcasadi_conic_highs.dylib +0 -0
- casadi/libcasadi_conic_ipqp.3.7.dylib +0 -0
- casadi/libcasadi_conic_ipqp.dylib +0 -0
- casadi/libcasadi_conic_nlpsol.3.7.dylib +0 -0
- casadi/libcasadi_conic_nlpsol.dylib +0 -0
- casadi/libcasadi_conic_osqp.3.7.dylib +0 -0
- casadi/libcasadi_conic_osqp.dylib +0 -0
- casadi/libcasadi_conic_proxqp.3.7.dylib +0 -0
- casadi/libcasadi_conic_proxqp.dylib +0 -0
- casadi/libcasadi_conic_qpoases.3.7.dylib +0 -0
- casadi/libcasadi_conic_qpoases.dylib +0 -0
- casadi/libcasadi_conic_qrqp.3.7.dylib +0 -0
- casadi/libcasadi_conic_qrqp.dylib +0 -0
- casadi/libcasadi_conic_superscs.3.7.dylib +0 -0
- casadi/libcasadi_conic_superscs.dylib +0 -0
- casadi/libcasadi_filesystem_ghc.3.7.dylib +0 -0
- casadi/libcasadi_filesystem_ghc.dylib +0 -0
- casadi/libcasadi_importer_shell.3.7.dylib +0 -0
- casadi/libcasadi_importer_shell.dylib +0 -0
- casadi/libcasadi_integrator_collocation.3.7.dylib +0 -0
- casadi/libcasadi_integrator_collocation.dylib +0 -0
- casadi/libcasadi_integrator_cvodes.3.7.dylib +0 -0
- casadi/libcasadi_integrator_cvodes.dylib +0 -0
- casadi/libcasadi_integrator_idas.3.7.dylib +0 -0
- casadi/libcasadi_integrator_idas.dylib +0 -0
- casadi/libcasadi_integrator_rk.3.7.dylib +0 -0
- casadi/libcasadi_integrator_rk.dylib +0 -0
- casadi/libcasadi_interpolant_bspline.3.7.dylib +0 -0
- casadi/libcasadi_interpolant_bspline.dylib +0 -0
- casadi/libcasadi_interpolant_linear.3.7.dylib +0 -0
- casadi/libcasadi_interpolant_linear.dylib +0 -0
- casadi/libcasadi_linsol_csparse.3.7.dylib +0 -0
- casadi/libcasadi_linsol_csparse.dylib +0 -0
- casadi/libcasadi_linsol_csparsecholesky.3.7.dylib +0 -0
- casadi/libcasadi_linsol_csparsecholesky.dylib +0 -0
- casadi/libcasadi_linsol_lapacklu.3.7.dylib +0 -0
- casadi/libcasadi_linsol_lapacklu.dylib +0 -0
- casadi/libcasadi_linsol_lapackqr.3.7.dylib +0 -0
- casadi/libcasadi_linsol_lapackqr.dylib +0 -0
- casadi/libcasadi_linsol_ldl.3.7.dylib +0 -0
- casadi/libcasadi_linsol_ldl.dylib +0 -0
- casadi/libcasadi_linsol_lsqr.3.7.dylib +0 -0
- casadi/libcasadi_linsol_lsqr.dylib +0 -0
- casadi/libcasadi_linsol_ma27.3.7.dylib +0 -0
- casadi/libcasadi_linsol_ma27.dylib +0 -0
- casadi/libcasadi_linsol_mumps.3.7.dylib +0 -0
- casadi/libcasadi_linsol_mumps.dylib +0 -0
- casadi/libcasadi_linsol_qr.3.7.dylib +0 -0
- casadi/libcasadi_linsol_qr.dylib +0 -0
- casadi/libcasadi_linsol_symbolicqr.3.7.dylib +0 -0
- casadi/libcasadi_linsol_symbolicqr.dylib +0 -0
- casadi/libcasadi_linsol_tridiag.3.7.dylib +0 -0
- casadi/libcasadi_linsol_tridiag.dylib +0 -0
- casadi/libcasadi_nlpsol_ampl.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_ampl.dylib +0 -0
- casadi/libcasadi_nlpsol_blocksqp.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_blocksqp.dylib +0 -0
- casadi/libcasadi_nlpsol_bonmin.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_bonmin.dylib +0 -0
- casadi/libcasadi_nlpsol_fatrop.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_fatrop.dylib +0 -0
- casadi/libcasadi_nlpsol_feasiblesqpmethod.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_feasiblesqpmethod.dylib +0 -0
- casadi/libcasadi_nlpsol_ipopt.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_ipopt.dylib +0 -0
- casadi/libcasadi_nlpsol_knitro.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_knitro.dylib +0 -0
- casadi/libcasadi_nlpsol_madnlp.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_madnlp.dylib +0 -0
- casadi/libcasadi_nlpsol_qrsqp.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_qrsqp.dylib +0 -0
- casadi/libcasadi_nlpsol_scpgen.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_scpgen.dylib +0 -0
- casadi/libcasadi_nlpsol_sleqp.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_sleqp.dylib +0 -0
- casadi/libcasadi_nlpsol_snopt.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_snopt.dylib +0 -0
- casadi/libcasadi_nlpsol_sqpmethod.3.7.dylib +0 -0
- casadi/libcasadi_nlpsol_sqpmethod.dylib +0 -0
- casadi/libcasadi_rootfinder_fast_newton.3.7.dylib +0 -0
- casadi/libcasadi_rootfinder_fast_newton.dylib +0 -0
- casadi/libcasadi_rootfinder_kinsol.3.7.dylib +0 -0
- casadi/libcasadi_rootfinder_kinsol.dylib +0 -0
- casadi/libcasadi_rootfinder_newton.3.7.dylib +0 -0
- casadi/libcasadi_rootfinder_newton.dylib +0 -0
- casadi/libcasadi_rootfinder_nlpsol.3.7.dylib +0 -0
- casadi/libcasadi_rootfinder_nlpsol.dylib +0 -0
- casadi/libcasadi_sundials_common.3.7.dylib +0 -0
- casadi/libcasadi_sundials_common.dylib +0 -0
- casadi/libcasadi_xmlfile_tinyxml.3.7.dylib +0 -0
- casadi/libcasadi_xmlfile_tinyxml.dylib +0 -0
- casadi/libcoinmetis.2.dylib +0 -0
- casadi/libcoinmetis.dylib +0 -0
- casadi/libcoinmumps.3.dylib +0 -0
- casadi/libcoinmumps.dylib +0 -0
- casadi/libcoinmumps.la +1 -1
- casadi/libcplex_adaptor.dylib +0 -0
- casadi/libdaqp.dylib +0 -0
- casadi/libdaqpstat.a +0 -0
- casadi/libfatrop.dylib +0 -0
- casadi/libgcc_s.1.1.dylib +0 -0
- casadi/libgfortran.5.dylib +0 -0
- casadi/libgurobi_adaptor.dylib +0 -0
- casadi/libhighs.1.10.dylib +0 -0
- casadi/libhighs.1.dylib +0 -0
- casadi/libhighs.dylib +0 -0
- casadi/libindirect.a +0 -0
- casadi/libipopt.3.dylib +0 -0
- casadi/libipopt.dylib +0 -0
- casadi/libipopt.la +1 -1
- casadi/liblinsys.a +0 -0
- casadi/libmatlab_ipc.dylib +0 -0
- casadi/libosqp.a +0 -0
- casadi/libosqp.dylib +0 -0
- casadi/libqdldl.a +0 -0
- casadi/libqdldl.dylib +0 -0
- casadi/libquadmath.0.dylib +0 -0
- casadi/libsipopt.3.dylib +0 -0
- casadi/libsipopt.dylib +0 -0
- casadi/libsipopt.la +1 -1
- casadi/libsleqp.1.0.1.dylib +0 -0
- casadi/libsleqp.dylib +0 -0
- casadi/libsuperscs.a +0 -0
- casadi/libtrlib.0.4.dylib +0 -0
- casadi/libtrlib.dylib +0 -0
- casadi/libz.1.2.13.dylib +0 -0
- casadi/libz.1.3.1.dylib +0 -0
- casadi/libz.1.dylib +0 -0
- casadi/libz.a +0 -0
- casadi/libz.dylib +0 -0
- casadi/libzip.a +0 -0
- casadi/pkgconfig/casadi.pc +1 -1
- casadi/pkgconfig/coinmumps.pc +1 -1
- casadi/pkgconfig/highs.pc +1 -1
- casadi/pkgconfig/libzip.pc +14 -0
- casadi/tools/__init__.py +3 -1
- casadi/tools/graph/graph.py +1 -1
- casadi/tools/structure3.py +7 -7
- {casadi-3.6.7.dist-info → casadi-3.7.1.dist-info}/METADATA +1 -1
- {casadi-3.6.7.dist-info → casadi-3.7.1.dist-info}/RECORD +450 -405
- casadi/include/highs/lp_data/HighsRuntimeOptions.h +0 -276
- casadi/include/highs/pdlp/cupdlp/cupdlp_scaling_cuda.h +0 -28
- casadi/include/tinyxml2.h +0 -2380
- casadi/lib/cmake/tinyxml2/tinyxml2-config.cmake +0 -57
- casadi/lib/cmake/tinyxml2/tinyxml2-static-targets-release.cmake +0 -19
- casadi/lib/libtinyxml2.a +0 -0
- casadi/lib/pkgconfig/tinyxml2.pc +0 -10
- casadi/libhighs.1.7.dylib +0 -0
- casadi/tools/structure.py +0 -1446
- {casadi-3.6.7.dist-info → casadi-3.7.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,645 @@
|
|
1
|
+
//
|
2
|
+
// MIT No Attribution
|
3
|
+
//
|
4
|
+
// Copyright (C) 2010-2023 Joel Andersson, Joris Gillis, Moritz Diehl, KU Leuven.
|
5
|
+
//
|
6
|
+
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
|
7
|
+
// software and associated documentation files (the "Software"), to deal in the Software
|
8
|
+
// without restriction, including without limitation the rights to use, copy, modify,
|
9
|
+
// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
// permit persons to whom the Software is furnished to do so.
|
11
|
+
//
|
12
|
+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
13
|
+
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
14
|
+
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
15
|
+
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
16
|
+
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
17
|
+
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
18
|
+
//
|
19
|
+
|
20
|
+
// SYMBOL "blazing_3d_boor_eval"
|
21
|
+
template<typename T1>
|
22
|
+
void casadi_blazing_3d_boor_eval(T1* f, T1* J, T1* H, const T1* all_knots, const casadi_int* offset, const T1* c, const T1* dc, const T1* ddc, const T1* all_x, const casadi_int* lookup_mode, casadi_int* iw, T1* w) { // NOLINT(whitespace/line_length)
|
23
|
+
casadi_int n_dims = 3;
|
24
|
+
casadi_int m = 1;
|
25
|
+
casadi_int n_iter, k, i, pivot;
|
26
|
+
casadi_int *boor_offset, *starts, *index, *coeff_offset;
|
27
|
+
T1 *cumprod;
|
28
|
+
boor_offset = iw; iw+=n_dims+1;
|
29
|
+
starts = iw; iw+=n_dims;
|
30
|
+
index = iw; iw+=n_dims;
|
31
|
+
coeff_offset = iw;
|
32
|
+
cumprod = w; w+= n_dims+1;
|
33
|
+
boor_offset[0] = 0;
|
34
|
+
cumprod[n_dims] = 1;
|
35
|
+
coeff_offset[n_dims] = 0;
|
36
|
+
|
37
|
+
casadi_int stride1 = offset[1]-offset[0]-4;
|
38
|
+
casadi_int stride2 = (offset[2]-offset[1]-4)*stride1;
|
39
|
+
|
40
|
+
simde__m256d zero = simde_mm256_set1_pd(0.0);
|
41
|
+
|
42
|
+
simde__m256d boor_start_0000 = zero;
|
43
|
+
simde__m256d boor_start_1111 = simde_mm256_set1_pd(1.0);
|
44
|
+
simde__m256d boor_start_0001 = simde_mm256_set_pd(1.0, 0.0, 0.0, 0.0);
|
45
|
+
simde__m256d boor_start_0010 = simde_mm256_set_pd(0.0, 1.0, 0.0, 0.0);
|
46
|
+
|
47
|
+
simde__m256d boor0_d3;
|
48
|
+
simde__m256d boor0_d2;
|
49
|
+
simde__m256d boor0_d1;
|
50
|
+
simde__m256d boor0_d0;
|
51
|
+
|
52
|
+
simde__m256d boor1_d3;
|
53
|
+
simde__m256d boor1_d2;
|
54
|
+
simde__m256d boor1_d1;
|
55
|
+
simde__m256d boor1_d0;
|
56
|
+
|
57
|
+
simde__m256d boor2_d3;
|
58
|
+
simde__m256d boor2_d2;
|
59
|
+
simde__m256d boor2_d1;
|
60
|
+
simde__m256d boor2_d0;
|
61
|
+
|
62
|
+
const T1* knots;
|
63
|
+
T1 x;
|
64
|
+
casadi_int degree, n_knots, n_b, L, start;
|
65
|
+
degree = 3;
|
66
|
+
knots = all_knots + offset[0];
|
67
|
+
n_knots = offset[0+1]-offset[0];
|
68
|
+
n_b = n_knots-degree-1;
|
69
|
+
x = all_x[0];
|
70
|
+
L = casadi_low(x, knots+degree, n_knots-2*degree, lookup_mode[0]);
|
71
|
+
start = L;
|
72
|
+
if (start>n_b-degree-1) start = n_b-degree-1;
|
73
|
+
starts[0] = start;
|
74
|
+
boor0_d3 = boor_start_0000;
|
75
|
+
if (x>=knots[0] && x<=knots[n_knots-1]) {
|
76
|
+
if (x==knots[1]) {
|
77
|
+
boor0_d3 = boor_start_1111;
|
78
|
+
} else if (x==knots[n_knots-1]) {
|
79
|
+
boor0_d3 = boor_start_0001;
|
80
|
+
} else if (knots[L+degree]==x) {
|
81
|
+
boor0_d3 = boor_start_0010;
|
82
|
+
} else {
|
83
|
+
boor0_d3 = boor_start_0001;
|
84
|
+
}
|
85
|
+
}
|
86
|
+
casadi_blazing_de_boor(x, knots+start, &boor0_d0, &boor0_d1, &boor0_d2, &boor0_d3);
|
87
|
+
|
88
|
+
knots = all_knots + offset[1];
|
89
|
+
n_knots = offset[1+1]-offset[1];
|
90
|
+
n_b = n_knots-degree-1;
|
91
|
+
x = all_x[1];
|
92
|
+
L = casadi_low(x, knots+degree, n_knots-2*degree, lookup_mode[1]);
|
93
|
+
start = L;
|
94
|
+
if (start>n_b-degree-1) start = n_b-degree-1;
|
95
|
+
starts[1] = start;
|
96
|
+
boor1_d3 = boor_start_0000;
|
97
|
+
if (x>=knots[0] && x<=knots[n_knots-1]) {
|
98
|
+
if (x==knots[1]) {
|
99
|
+
boor1_d3 = boor_start_1111;
|
100
|
+
} else if (x==knots[n_knots-1]) {
|
101
|
+
boor1_d3 = boor_start_0001;
|
102
|
+
} else if (knots[L+degree]==x) {
|
103
|
+
boor1_d3 = boor_start_0010;
|
104
|
+
} else {
|
105
|
+
boor1_d3 = boor_start_0001;
|
106
|
+
}
|
107
|
+
}
|
108
|
+
casadi_blazing_de_boor(x, knots+start, &boor1_d0, &boor1_d1, &boor1_d2, &boor1_d3);
|
109
|
+
|
110
|
+
knots = all_knots + offset[2];
|
111
|
+
n_knots = offset[2+1]-offset[2];
|
112
|
+
n_b = n_knots-degree-1;
|
113
|
+
x = all_x[2];
|
114
|
+
L = casadi_low(x, knots+degree, n_knots-2*degree, lookup_mode[2]);
|
115
|
+
start = L;
|
116
|
+
if (start>n_b-degree-1) start = n_b-degree-1;
|
117
|
+
starts[2] = start;
|
118
|
+
boor2_d3 = boor_start_0000;
|
119
|
+
if (x>=knots[0] && x<=knots[n_knots-1]) {
|
120
|
+
if (x==knots[1]) {
|
121
|
+
boor2_d3 = boor_start_1111;
|
122
|
+
} else if (x==knots[n_knots-1]) {
|
123
|
+
boor2_d3 = boor_start_0001;
|
124
|
+
} else if (knots[L+degree]==x) {
|
125
|
+
boor2_d3 = boor_start_0010;
|
126
|
+
} else {
|
127
|
+
boor2_d3 = boor_start_0001;
|
128
|
+
}
|
129
|
+
}
|
130
|
+
casadi_blazing_de_boor(x, knots+start, &boor2_d0, &boor2_d1, &boor2_d2, &boor2_d3);
|
131
|
+
|
132
|
+
simde__m256d C[16];
|
133
|
+
|
134
|
+
for (int j=0;j<4;++j) {
|
135
|
+
for (int k=0;k<4;++k) {
|
136
|
+
C[j+4*k] = simde_mm256_loadu_pd(c+(starts[1]+j)*stride1+(starts[2]+k)*stride2+starts[0]);
|
137
|
+
}
|
138
|
+
}
|
139
|
+
|
140
|
+
simde__m256d a, b0, b1, b2, b3, c0, c1, c2, c3, r;
|
141
|
+
simde__m256d ab[4], cab[4];
|
142
|
+
simde__m128d r0, r1;
|
143
|
+
|
144
|
+
a = boor0_d0;
|
145
|
+
b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
146
|
+
b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
147
|
+
b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
148
|
+
b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
149
|
+
|
150
|
+
c0 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
151
|
+
c1 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
152
|
+
c2 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
153
|
+
c3 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
154
|
+
|
155
|
+
// Need to compute sum_abc C_abc A_a B_b C_c
|
156
|
+
|
157
|
+
// Step 1: Outer product a b: A_a B_b
|
158
|
+
ab[0] = simde_mm256_mul_pd(a, b0);
|
159
|
+
ab[1] = simde_mm256_mul_pd(a, b1);
|
160
|
+
ab[2] = simde_mm256_mul_pd(a, b2);
|
161
|
+
ab[3] = simde_mm256_mul_pd(a, b3);
|
162
|
+
|
163
|
+
// Sum over b axis: sum_b C_abc * (A_a B_b)_b
|
164
|
+
// cab <- cab + ab[i]*C[i]
|
165
|
+
for (int i=0;i<4;++i) {
|
166
|
+
cab[i] = simde_mm256_set1_pd(0);
|
167
|
+
cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
|
168
|
+
cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
|
169
|
+
cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
|
170
|
+
cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
|
171
|
+
}
|
172
|
+
|
173
|
+
if (f) {
|
174
|
+
// Reduce over the c direction
|
175
|
+
r = simde_mm256_set1_pd(0);
|
176
|
+
r = simde_mm256_fmadd_pd(cab[0], c0, r);
|
177
|
+
r = simde_mm256_fmadd_pd(cab[1], c1, r);
|
178
|
+
r = simde_mm256_fmadd_pd(cab[2], c2, r);
|
179
|
+
r = simde_mm256_fmadd_pd(cab[3], c3, r);
|
180
|
+
|
181
|
+
// Sum all r entries
|
182
|
+
r0 = simde_mm256_castpd256_pd128(r);
|
183
|
+
r1 = simde_mm256_extractf128_pd(r, 1);
|
184
|
+
r0 = simde_mm_add_pd(r0, r1);
|
185
|
+
f[0] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
|
186
|
+
}
|
187
|
+
|
188
|
+
// First derivative
|
189
|
+
if (dc && J) {
|
190
|
+
stride1 = offset[1]-offset[0]-4-1;
|
191
|
+
stride2 = (offset[2]-offset[1]-4)*stride1;
|
192
|
+
for (int j=0;j<4;++j) {
|
193
|
+
for (int k=0;k<4;++k) {
|
194
|
+
C[j+4*k] = simde_mm256_loadu_pd(
|
195
|
+
dc+(starts[1]+j)*stride1+(starts[2]+k)*stride2+starts[0]-1);
|
196
|
+
}
|
197
|
+
}
|
198
|
+
dc += stride2*(offset[3]-offset[2]-4);
|
199
|
+
|
200
|
+
a = boor0_d1;
|
201
|
+
ab[0] = simde_mm256_mul_pd(a, b0);
|
202
|
+
ab[1] = simde_mm256_mul_pd(a, b1);
|
203
|
+
ab[2] = simde_mm256_mul_pd(a, b2);
|
204
|
+
ab[3] = simde_mm256_mul_pd(a, b3);
|
205
|
+
// Sum over b axis: sum_b C_abc * (A_a B_b)_b
|
206
|
+
// cab <- cab + ab[i]*C[i]
|
207
|
+
for (int i=0;i<4;++i) {
|
208
|
+
cab[i] = simde_mm256_set1_pd(0);
|
209
|
+
cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
|
210
|
+
cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
|
211
|
+
cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
|
212
|
+
cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
|
213
|
+
}
|
214
|
+
|
215
|
+
// Reduce over the c direction
|
216
|
+
r = simde_mm256_set1_pd(0);
|
217
|
+
r = simde_mm256_fmadd_pd(cab[0], c0, r);
|
218
|
+
r = simde_mm256_fmadd_pd(cab[1], c1, r);
|
219
|
+
r = simde_mm256_fmadd_pd(cab[2], c2, r);
|
220
|
+
r = simde_mm256_fmadd_pd(cab[3], c3, r);
|
221
|
+
|
222
|
+
// Sum all r entries
|
223
|
+
r0 = simde_mm256_castpd256_pd128(r);
|
224
|
+
r1 = simde_mm256_extractf128_pd(r, 1);
|
225
|
+
r0 = simde_mm_add_pd(r0, r1);
|
226
|
+
J[0] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
|
227
|
+
|
228
|
+
|
229
|
+
stride1 = offset[1]-offset[0]-4;
|
230
|
+
stride2 = (offset[2]-offset[1]-4-1)*stride1;
|
231
|
+
for (int j=0;j<4;++j) {
|
232
|
+
for (int k=0;k<4;++k) {
|
233
|
+
if (j==0) {
|
234
|
+
C[j+4*k] = zero;
|
235
|
+
} else {
|
236
|
+
C[j+4*k] = simde_mm256_loadu_pd(
|
237
|
+
dc+(starts[1]+j-1)*stride1+(starts[2]+k)*stride2+starts[0]);
|
238
|
+
}
|
239
|
+
}
|
240
|
+
}
|
241
|
+
dc += stride2*(offset[3]-offset[2]-4);
|
242
|
+
|
243
|
+
a = boor0_d0;
|
244
|
+
|
245
|
+
b0 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
246
|
+
b1 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
247
|
+
b2 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
248
|
+
b3 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
249
|
+
|
250
|
+
ab[0] = simde_mm256_mul_pd(a, b0);
|
251
|
+
ab[1] = simde_mm256_mul_pd(a, b1);
|
252
|
+
ab[2] = simde_mm256_mul_pd(a, b2);
|
253
|
+
ab[3] = simde_mm256_mul_pd(a, b3);
|
254
|
+
|
255
|
+
// Sum over b axis: sum_b C_abc * (A_a B_b)_b
|
256
|
+
// cab <- cab + ab[i]*C[i]
|
257
|
+
for (int i=0;i<4;++i) {
|
258
|
+
cab[i] = simde_mm256_set1_pd(0);
|
259
|
+
cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
|
260
|
+
cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
|
261
|
+
cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
|
262
|
+
cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
|
263
|
+
}
|
264
|
+
|
265
|
+
// Reduce over the c direction
|
266
|
+
r = simde_mm256_set1_pd(0);
|
267
|
+
r = simde_mm256_fmadd_pd(cab[0], c0, r);
|
268
|
+
r = simde_mm256_fmadd_pd(cab[1], c1, r);
|
269
|
+
r = simde_mm256_fmadd_pd(cab[2], c2, r);
|
270
|
+
r = simde_mm256_fmadd_pd(cab[3], c3, r);
|
271
|
+
|
272
|
+
// Sum all r entries
|
273
|
+
r0 = simde_mm256_castpd256_pd128(r);
|
274
|
+
r1 = simde_mm256_extractf128_pd(r, 1);
|
275
|
+
r0 = simde_mm_add_pd(r0, r1);
|
276
|
+
J[1] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
|
277
|
+
|
278
|
+
stride1 = offset[1]-offset[0]-4;
|
279
|
+
stride2 = (offset[2]-offset[1]-4)*stride1;
|
280
|
+
for (int j=0;j<4;++j) {
|
281
|
+
for (int k=0;k<4;++k) {
|
282
|
+
if (k==0) {
|
283
|
+
C[j+4*k] = zero;
|
284
|
+
} else {
|
285
|
+
C[j+4*k] = simde_mm256_loadu_pd(
|
286
|
+
dc+(starts[1]+j)*stride1+(starts[2]+k-1)*stride2+starts[0]);
|
287
|
+
}
|
288
|
+
}
|
289
|
+
}
|
290
|
+
|
291
|
+
b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
292
|
+
b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
293
|
+
b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
294
|
+
b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
295
|
+
|
296
|
+
c0 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
297
|
+
c1 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
298
|
+
c2 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
299
|
+
c3 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
300
|
+
|
301
|
+
ab[0] = simde_mm256_mul_pd(a, b0);
|
302
|
+
ab[1] = simde_mm256_mul_pd(a, b1);
|
303
|
+
ab[2] = simde_mm256_mul_pd(a, b2);
|
304
|
+
ab[3] = simde_mm256_mul_pd(a, b3);
|
305
|
+
|
306
|
+
// Sum over b axis: sum_b C_abc * (A_a B_b)_b
|
307
|
+
// cab <- cab + ab[i]*C[i]
|
308
|
+
for (int i=0;i<4;++i) {
|
309
|
+
cab[i] = simde_mm256_set1_pd(0);
|
310
|
+
cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
|
311
|
+
cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
|
312
|
+
cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
|
313
|
+
cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
|
314
|
+
}
|
315
|
+
|
316
|
+
// Reduce over the c direction
|
317
|
+
r = simde_mm256_set1_pd(0);
|
318
|
+
r = simde_mm256_fmadd_pd(cab[0], c0, r);
|
319
|
+
r = simde_mm256_fmadd_pd(cab[1], c1, r);
|
320
|
+
r = simde_mm256_fmadd_pd(cab[2], c2, r);
|
321
|
+
r = simde_mm256_fmadd_pd(cab[3], c3, r);
|
322
|
+
|
323
|
+
// Sum all r entries
|
324
|
+
r0 = simde_mm256_castpd256_pd128(r);
|
325
|
+
r1 = simde_mm256_extractf128_pd(r, 1);
|
326
|
+
r0 = simde_mm_add_pd(r0, r1);
|
327
|
+
J[2] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
|
328
|
+
|
329
|
+
}
|
330
|
+
|
331
|
+
if (ddc && H) {
|
332
|
+
stride1 = offset[1]-offset[0]-4-2;
|
333
|
+
stride2 = (offset[2]-offset[1]-4)*stride1;
|
334
|
+
for (int j=0;j<4;++j) {
|
335
|
+
for (int k=0;k<4;++k) {
|
336
|
+
C[j+4*k] = simde_mm256_loadu_pd(
|
337
|
+
ddc+(starts[1]+j)*stride1+(starts[2]+k)*stride2+starts[0]-2);
|
338
|
+
}
|
339
|
+
}
|
340
|
+
ddc += stride2*(offset[3]-offset[2]-4);
|
341
|
+
|
342
|
+
a = boor0_d2;
|
343
|
+
b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
344
|
+
b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
345
|
+
b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
346
|
+
b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
347
|
+
|
348
|
+
c0 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
349
|
+
c1 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
350
|
+
c2 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
351
|
+
c3 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
352
|
+
|
353
|
+
ab[0] = simde_mm256_mul_pd(a, b0);
|
354
|
+
ab[1] = simde_mm256_mul_pd(a, b1);
|
355
|
+
ab[2] = simde_mm256_mul_pd(a, b2);
|
356
|
+
ab[3] = simde_mm256_mul_pd(a, b3);
|
357
|
+
// Sum over b axis: sum_b C_abc * (A_a B_b)_b
|
358
|
+
// cab <- cab + ab[i]*C[i]
|
359
|
+
for (int i=0;i<4;++i) {
|
360
|
+
cab[i] = simde_mm256_set1_pd(0);
|
361
|
+
cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
|
362
|
+
cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
|
363
|
+
cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
|
364
|
+
cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
|
365
|
+
}
|
366
|
+
|
367
|
+
// Reduce over the c direction
|
368
|
+
r = simde_mm256_set1_pd(0);
|
369
|
+
r = simde_mm256_fmadd_pd(cab[0], c0, r);
|
370
|
+
r = simde_mm256_fmadd_pd(cab[1], c1, r);
|
371
|
+
r = simde_mm256_fmadd_pd(cab[2], c2, r);
|
372
|
+
r = simde_mm256_fmadd_pd(cab[3], c3, r);
|
373
|
+
|
374
|
+
// Sum all r entries
|
375
|
+
r0 = simde_mm256_castpd256_pd128(r);
|
376
|
+
r1 = simde_mm256_extractf128_pd(r, 1);
|
377
|
+
r0 = simde_mm_add_pd(r0, r1);
|
378
|
+
H[0] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
|
379
|
+
|
380
|
+
stride1 = offset[1]-offset[0]-4;
|
381
|
+
stride2 = (offset[2]-offset[1]-4-2)*stride1;
|
382
|
+
for (int j=0;j<4;++j) {
|
383
|
+
for (int k=0;k<4;++k) {
|
384
|
+
if (j<=1) {
|
385
|
+
C[j+4*k] = zero;
|
386
|
+
} else {
|
387
|
+
C[j+4*k] = simde_mm256_loadu_pd(
|
388
|
+
ddc+(starts[1]+j-2)*stride1+(starts[2]+k)*stride2+starts[0]);
|
389
|
+
}
|
390
|
+
}
|
391
|
+
}
|
392
|
+
ddc += stride2*(offset[3]-offset[2]-4);
|
393
|
+
|
394
|
+
a = boor0_d0;
|
395
|
+
b0 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
396
|
+
b1 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
397
|
+
b2 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
398
|
+
b3 = simde_mm256_permute4x64_pd(boor1_d2, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
399
|
+
|
400
|
+
c0 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
401
|
+
c1 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
402
|
+
c2 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
403
|
+
c3 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
404
|
+
|
405
|
+
ab[0] = simde_mm256_mul_pd(a, b0);
|
406
|
+
ab[1] = simde_mm256_mul_pd(a, b1);
|
407
|
+
ab[2] = simde_mm256_mul_pd(a, b2);
|
408
|
+
ab[3] = simde_mm256_mul_pd(a, b3);
|
409
|
+
// Sum over b axis: sum_b C_abc * (A_a B_b)_b
|
410
|
+
// cab <- cab + ab[i]*C[i]
|
411
|
+
for (int i=0;i<4;++i) {
|
412
|
+
cab[i] = simde_mm256_set1_pd(0);
|
413
|
+
cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
|
414
|
+
cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
|
415
|
+
cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
|
416
|
+
cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
|
417
|
+
}
|
418
|
+
|
419
|
+
// Reduce over the c direction
|
420
|
+
r = simde_mm256_set1_pd(0);
|
421
|
+
r = simde_mm256_fmadd_pd(cab[0], c0, r);
|
422
|
+
r = simde_mm256_fmadd_pd(cab[1], c1, r);
|
423
|
+
r = simde_mm256_fmadd_pd(cab[2], c2, r);
|
424
|
+
r = simde_mm256_fmadd_pd(cab[3], c3, r);
|
425
|
+
|
426
|
+
// Sum all r entries
|
427
|
+
r0 = simde_mm256_castpd256_pd128(r);
|
428
|
+
r1 = simde_mm256_extractf128_pd(r, 1);
|
429
|
+
r0 = simde_mm_add_pd(r0, r1);
|
430
|
+
H[4] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
|
431
|
+
|
432
|
+
stride1 = offset[1]-offset[0]-4;
|
433
|
+
stride2 = (offset[2]-offset[1]-4)*stride1;
|
434
|
+
for (int j=0;j<4;++j) {
|
435
|
+
for (int k=0;k<4;++k) {
|
436
|
+
if (k<=1) {
|
437
|
+
C[j+4*k] = zero;
|
438
|
+
} else {
|
439
|
+
C[j+4*k] = simde_mm256_loadu_pd(
|
440
|
+
ddc+(starts[1]+j)*stride1+(starts[2]+k-2)*stride2+starts[0]);
|
441
|
+
}
|
442
|
+
}
|
443
|
+
}
|
444
|
+
ddc += stride2*(offset[3]-offset[2]-4-2);
|
445
|
+
|
446
|
+
a = boor0_d0;
|
447
|
+
b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
448
|
+
b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
449
|
+
b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
450
|
+
b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
451
|
+
|
452
|
+
c0 = simde_mm256_permute4x64_pd(boor2_d2, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
453
|
+
c1 = simde_mm256_permute4x64_pd(boor2_d2, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
454
|
+
c2 = simde_mm256_permute4x64_pd(boor2_d2, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
455
|
+
c3 = simde_mm256_permute4x64_pd(boor2_d2, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
456
|
+
|
457
|
+
ab[0] = simde_mm256_mul_pd(a, b0);
|
458
|
+
ab[1] = simde_mm256_mul_pd(a, b1);
|
459
|
+
ab[2] = simde_mm256_mul_pd(a, b2);
|
460
|
+
ab[3] = simde_mm256_mul_pd(a, b3);
|
461
|
+
// Sum over b axis: sum_b C_abc * (A_a B_b)_b
|
462
|
+
// cab <- cab + ab[i]*C[i]
|
463
|
+
for (int i=0;i<4;++i) {
|
464
|
+
cab[i] = simde_mm256_set1_pd(0);
|
465
|
+
cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
|
466
|
+
cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
|
467
|
+
cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
|
468
|
+
cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
|
469
|
+
}
|
470
|
+
|
471
|
+
// Reduce over the c direction
|
472
|
+
r = simde_mm256_set1_pd(0);
|
473
|
+
r = simde_mm256_fmadd_pd(cab[0], c0, r);
|
474
|
+
r = simde_mm256_fmadd_pd(cab[1], c1, r);
|
475
|
+
r = simde_mm256_fmadd_pd(cab[2], c2, r);
|
476
|
+
r = simde_mm256_fmadd_pd(cab[3], c3, r);
|
477
|
+
|
478
|
+
// Sum all r entries
|
479
|
+
r0 = simde_mm256_castpd256_pd128(r);
|
480
|
+
r1 = simde_mm256_extractf128_pd(r, 1);
|
481
|
+
r0 = simde_mm_add_pd(r0, r1);
|
482
|
+
H[8] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
|
483
|
+
|
484
|
+
stride1 = offset[1]-offset[0]-5;
|
485
|
+
stride2 = (offset[2]-offset[1]-5)*stride1;
|
486
|
+
for (int j=0;j<4;++j) {
|
487
|
+
for (int k=0;k<4;++k) {
|
488
|
+
if (j==0) {
|
489
|
+
C[j+4*k] = zero;
|
490
|
+
} else {
|
491
|
+
C[j+4*k] = simde_mm256_loadu_pd(
|
492
|
+
ddc+(starts[1]+j-1)*stride1+(starts[2]+k)*stride2+starts[0]-1);
|
493
|
+
}
|
494
|
+
}
|
495
|
+
}
|
496
|
+
ddc += stride2*(offset[3]-offset[2]-4);
|
497
|
+
|
498
|
+
a = boor0_d1;
|
499
|
+
|
500
|
+
b0 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
501
|
+
b1 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
502
|
+
b2 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
503
|
+
b3 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
504
|
+
|
505
|
+
c0 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
506
|
+
c1 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
507
|
+
c2 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
508
|
+
c3 = simde_mm256_permute4x64_pd(boor2_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
509
|
+
|
510
|
+
ab[0] = simde_mm256_mul_pd(a, b0);
|
511
|
+
ab[1] = simde_mm256_mul_pd(a, b1);
|
512
|
+
ab[2] = simde_mm256_mul_pd(a, b2);
|
513
|
+
ab[3] = simde_mm256_mul_pd(a, b3);
|
514
|
+
// Sum over b axis: sum_b C_abc * (A_a B_b)_b
|
515
|
+
// cab <- cab + ab[i]*C[i]
|
516
|
+
for (int i=0;i<4;++i) {
|
517
|
+
cab[i] = simde_mm256_set1_pd(0);
|
518
|
+
cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
|
519
|
+
cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
|
520
|
+
cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
|
521
|
+
cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
|
522
|
+
}
|
523
|
+
|
524
|
+
// Reduce over the c direction
|
525
|
+
r = simde_mm256_set1_pd(0);
|
526
|
+
r = simde_mm256_fmadd_pd(cab[0], c0, r);
|
527
|
+
r = simde_mm256_fmadd_pd(cab[1], c1, r);
|
528
|
+
r = simde_mm256_fmadd_pd(cab[2], c2, r);
|
529
|
+
r = simde_mm256_fmadd_pd(cab[3], c3, r);
|
530
|
+
|
531
|
+
// Sum all r entries
|
532
|
+
r0 = simde_mm256_castpd256_pd128(r);
|
533
|
+
r1 = simde_mm256_extractf128_pd(r, 1);
|
534
|
+
r0 = simde_mm_add_pd(r0, r1);
|
535
|
+
H[1] = H[3] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
|
536
|
+
|
537
|
+
stride1 = offset[1]-offset[0]-4;
|
538
|
+
stride2 = (offset[2]-offset[1]-5)*stride1;
|
539
|
+
for (int j=0;j<4;++j) {
|
540
|
+
for (int k=0;k<4;++k) {
|
541
|
+
if (k==0) {
|
542
|
+
C[j+4*k] = zero;
|
543
|
+
} else {
|
544
|
+
C[j+4*k] = simde_mm256_loadu_pd(
|
545
|
+
ddc+(starts[1]+j-1)*stride1+(starts[2]+k-1)*stride2+starts[0]);
|
546
|
+
}
|
547
|
+
}
|
548
|
+
}
|
549
|
+
ddc += stride2*(offset[3]-offset[2]-5);
|
550
|
+
|
551
|
+
a = boor0_d0;
|
552
|
+
|
553
|
+
b0 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
554
|
+
b1 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
555
|
+
b2 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
556
|
+
b3 = simde_mm256_permute4x64_pd(boor1_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
557
|
+
|
558
|
+
c0 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
559
|
+
c1 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
560
|
+
c2 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
561
|
+
c3 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
562
|
+
|
563
|
+
ab[0] = simde_mm256_mul_pd(a, b0);
|
564
|
+
ab[1] = simde_mm256_mul_pd(a, b1);
|
565
|
+
ab[2] = simde_mm256_mul_pd(a, b2);
|
566
|
+
ab[3] = simde_mm256_mul_pd(a, b3);
|
567
|
+
// Sum over b axis: sum_b C_abc * (A_a B_b)_b
|
568
|
+
// cab <- cab + ab[i]*C[i]
|
569
|
+
for (int i=0;i<4;++i) {
|
570
|
+
cab[i] = simde_mm256_set1_pd(0);
|
571
|
+
cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
|
572
|
+
cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
|
573
|
+
cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
|
574
|
+
cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
|
575
|
+
}
|
576
|
+
|
577
|
+
// Reduce over the c direction
|
578
|
+
r = simde_mm256_set1_pd(0);
|
579
|
+
r = simde_mm256_fmadd_pd(cab[0], c0, r);
|
580
|
+
r = simde_mm256_fmadd_pd(cab[1], c1, r);
|
581
|
+
r = simde_mm256_fmadd_pd(cab[2], c2, r);
|
582
|
+
r = simde_mm256_fmadd_pd(cab[3], c3, r);
|
583
|
+
|
584
|
+
// Sum all r entries
|
585
|
+
r0 = simde_mm256_castpd256_pd128(r);
|
586
|
+
r1 = simde_mm256_extractf128_pd(r, 1);
|
587
|
+
r0 = simde_mm_add_pd(r0, r1);
|
588
|
+
H[5] = H[7] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
|
589
|
+
|
590
|
+
|
591
|
+
|
592
|
+
stride1 = offset[1]-offset[0]-5;
|
593
|
+
stride2 = (offset[2]-offset[1]-4)*stride1;
|
594
|
+
for (int j=0;j<4;++j) {
|
595
|
+
for (int k=0;k<4;++k) {
|
596
|
+
if (k==0) {
|
597
|
+
C[j+4*k] = zero;
|
598
|
+
} else {
|
599
|
+
C[j+4*k] = simde_mm256_loadu_pd(
|
600
|
+
ddc+(starts[1]+j)*stride1+(starts[2]+k-1)*stride2+starts[0]-1);
|
601
|
+
}
|
602
|
+
}
|
603
|
+
}
|
604
|
+
ddc += stride2*(offset[3]-offset[2]-5);
|
605
|
+
|
606
|
+
a = boor0_d1;
|
607
|
+
|
608
|
+
b0 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
609
|
+
b1 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
610
|
+
b2 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
611
|
+
b3 = simde_mm256_permute4x64_pd(boor1_d0, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
612
|
+
|
613
|
+
c0 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
|
614
|
+
c1 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
|
615
|
+
c2 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
|
616
|
+
c3 = simde_mm256_permute4x64_pd(boor2_d1, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
|
617
|
+
|
618
|
+
ab[0] = simde_mm256_mul_pd(a, b0);
|
619
|
+
ab[1] = simde_mm256_mul_pd(a, b1);
|
620
|
+
ab[2] = simde_mm256_mul_pd(a, b2);
|
621
|
+
ab[3] = simde_mm256_mul_pd(a, b3);
|
622
|
+
// Sum over b axis: sum_b C_abc * (A_a B_b)_b
|
623
|
+
// cab <- cab + ab[i]*C[i]
|
624
|
+
for (int i=0;i<4;++i) {
|
625
|
+
cab[i] = simde_mm256_set1_pd(0);
|
626
|
+
cab[i] = simde_mm256_fmadd_pd(ab[0], C[4*i+0], cab[i]);
|
627
|
+
cab[i] = simde_mm256_fmadd_pd(ab[1], C[4*i+1], cab[i]);
|
628
|
+
cab[i] = simde_mm256_fmadd_pd(ab[2], C[4*i+2], cab[i]);
|
629
|
+
cab[i] = simde_mm256_fmadd_pd(ab[3], C[4*i+3], cab[i]);
|
630
|
+
}
|
631
|
+
|
632
|
+
// Reduce over the c direction
|
633
|
+
r = simde_mm256_set1_pd(0);
|
634
|
+
r = simde_mm256_fmadd_pd(cab[0], c0, r);
|
635
|
+
r = simde_mm256_fmadd_pd(cab[1], c1, r);
|
636
|
+
r = simde_mm256_fmadd_pd(cab[2], c2, r);
|
637
|
+
r = simde_mm256_fmadd_pd(cab[3], c3, r);
|
638
|
+
|
639
|
+
// Sum all r entries
|
640
|
+
r0 = simde_mm256_castpd256_pd128(r);
|
641
|
+
r1 = simde_mm256_extractf128_pd(r, 1);
|
642
|
+
r0 = simde_mm_add_pd(r0, r1);
|
643
|
+
H[2] = H[6] = simde_mm_cvtsd_f64(simde_mm_add_sd(r0, simde_mm_unpackhi_pd(r0, r0)));
|
644
|
+
}
|
645
|
+
}
|