netgen-mesher 6.2.2506.post35.dev0__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- netgen/NgOCC.py +7 -0
- netgen/__init__.py +114 -0
- netgen/__init__.pyi +22 -0
- netgen/__main__.py +53 -0
- netgen/cmake/NetgenConfig.cmake +79 -0
- netgen/cmake/netgen-targets-release.cmake +69 -0
- netgen/cmake/netgen-targets.cmake +146 -0
- netgen/config/__init__.py +1 -0
- netgen/config/__init__.pyi +52 -0
- netgen/config/__main__.py +4 -0
- netgen/config/config.py +68 -0
- netgen/config/config.pyi +54 -0
- netgen/csg.py +25 -0
- netgen/geom2d.py +178 -0
- netgen/gui.py +82 -0
- netgen/include/core/archive.hpp +1256 -0
- netgen/include/core/array.hpp +1760 -0
- netgen/include/core/autodiff.hpp +1131 -0
- netgen/include/core/autodiffdiff.hpp +733 -0
- netgen/include/core/bitarray.hpp +240 -0
- netgen/include/core/concurrentqueue.h +3619 -0
- netgen/include/core/exception.hpp +145 -0
- netgen/include/core/flags.hpp +199 -0
- netgen/include/core/hashtable.hpp +1281 -0
- netgen/include/core/localheap.hpp +318 -0
- netgen/include/core/logging.hpp +117 -0
- netgen/include/core/memtracer.hpp +221 -0
- netgen/include/core/mpi4py_pycapi.h +245 -0
- netgen/include/core/mpi_wrapper.hpp +643 -0
- netgen/include/core/ng_mpi.hpp +94 -0
- netgen/include/core/ng_mpi_generated_declarations.hpp +155 -0
- netgen/include/core/ng_mpi_native.hpp +25 -0
- netgen/include/core/ngcore.hpp +32 -0
- netgen/include/core/ngcore_api.hpp +152 -0
- netgen/include/core/ngstream.hpp +115 -0
- netgen/include/core/paje_trace.hpp +279 -0
- netgen/include/core/profiler.hpp +382 -0
- netgen/include/core/python_ngcore.hpp +457 -0
- netgen/include/core/ranges.hpp +109 -0
- netgen/include/core/register_archive.hpp +100 -0
- netgen/include/core/signal.hpp +82 -0
- netgen/include/core/simd.hpp +160 -0
- netgen/include/core/simd_arm64.hpp +407 -0
- netgen/include/core/simd_avx.hpp +394 -0
- netgen/include/core/simd_avx512.hpp +285 -0
- netgen/include/core/simd_generic.hpp +1053 -0
- netgen/include/core/simd_math.hpp +178 -0
- netgen/include/core/simd_sse.hpp +289 -0
- netgen/include/core/statushandler.hpp +37 -0
- netgen/include/core/symboltable.hpp +153 -0
- netgen/include/core/table.hpp +810 -0
- netgen/include/core/taskmanager.hpp +1161 -0
- netgen/include/core/type_traits.hpp +65 -0
- netgen/include/core/utils.hpp +385 -0
- netgen/include/core/version.hpp +102 -0
- netgen/include/core/xbool.hpp +47 -0
- netgen/include/csg/algprim.hpp +563 -0
- netgen/include/csg/brick.hpp +150 -0
- netgen/include/csg/csg.hpp +43 -0
- netgen/include/csg/csgeom.hpp +389 -0
- netgen/include/csg/csgparser.hpp +101 -0
- netgen/include/csg/curve2d.hpp +67 -0
- netgen/include/csg/edgeflw.hpp +112 -0
- netgen/include/csg/explicitcurve2d.hpp +113 -0
- netgen/include/csg/extrusion.hpp +185 -0
- netgen/include/csg/gencyl.hpp +70 -0
- netgen/include/csg/geoml.hpp +16 -0
- netgen/include/csg/identify.hpp +213 -0
- netgen/include/csg/manifold.hpp +29 -0
- netgen/include/csg/meshsurf.hpp +46 -0
- netgen/include/csg/polyhedra.hpp +121 -0
- netgen/include/csg/revolution.hpp +180 -0
- netgen/include/csg/singularref.hpp +84 -0
- netgen/include/csg/solid.hpp +295 -0
- netgen/include/csg/specpoin.hpp +194 -0
- netgen/include/csg/spline3d.hpp +99 -0
- netgen/include/csg/splinesurface.hpp +85 -0
- netgen/include/csg/surface.hpp +394 -0
- netgen/include/csg/triapprox.hpp +63 -0
- netgen/include/csg/vscsg.hpp +34 -0
- netgen/include/general/autodiff.hpp +356 -0
- netgen/include/general/autoptr.hpp +39 -0
- netgen/include/general/gzstream.h +121 -0
- netgen/include/general/hashtabl.hpp +1692 -0
- netgen/include/general/myadt.hpp +48 -0
- netgen/include/general/mystring.hpp +226 -0
- netgen/include/general/netgenout.hpp +205 -0
- netgen/include/general/ngarray.hpp +797 -0
- netgen/include/general/ngbitarray.hpp +149 -0
- netgen/include/general/ngpython.hpp +74 -0
- netgen/include/general/optmem.hpp +44 -0
- netgen/include/general/parthreads.hpp +138 -0
- netgen/include/general/seti.hpp +50 -0
- netgen/include/general/sort.hpp +47 -0
- netgen/include/general/spbita2d.hpp +59 -0
- netgen/include/general/stack.hpp +114 -0
- netgen/include/general/table.hpp +280 -0
- netgen/include/general/template.hpp +509 -0
- netgen/include/geom2d/csg2d.hpp +750 -0
- netgen/include/geom2d/geometry2d.hpp +280 -0
- netgen/include/geom2d/spline2d.hpp +234 -0
- netgen/include/geom2d/vsgeom2d.hpp +28 -0
- netgen/include/gprim/adtree.hpp +1392 -0
- netgen/include/gprim/geom2d.hpp +858 -0
- netgen/include/gprim/geom3d.hpp +749 -0
- netgen/include/gprim/geomfuncs.hpp +212 -0
- netgen/include/gprim/geomobjects.hpp +544 -0
- netgen/include/gprim/geomops.hpp +404 -0
- netgen/include/gprim/geomtest3d.hpp +101 -0
- netgen/include/gprim/gprim.hpp +33 -0
- netgen/include/gprim/spline.hpp +778 -0
- netgen/include/gprim/splinegeometry.hpp +73 -0
- netgen/include/gprim/transform3d.hpp +216 -0
- netgen/include/include/acisgeom.hpp +3 -0
- netgen/include/include/csg.hpp +1 -0
- netgen/include/include/geometry2d.hpp +1 -0
- netgen/include/include/gprim.hpp +1 -0
- netgen/include/include/incopengl.hpp +62 -0
- netgen/include/include/inctcl.hpp +13 -0
- netgen/include/include/incvis.hpp +6 -0
- netgen/include/include/linalg.hpp +1 -0
- netgen/include/include/meshing.hpp +1 -0
- netgen/include/include/myadt.hpp +1 -0
- netgen/include/include/mydefs.hpp +70 -0
- netgen/include/include/mystdlib.h +59 -0
- netgen/include/include/netgen_config.hpp +27 -0
- netgen/include/include/netgen_version.hpp +9 -0
- netgen/include/include/nginterface_v2_impl.hpp +395 -0
- netgen/include/include/ngsimd.hpp +1 -0
- netgen/include/include/occgeom.hpp +1 -0
- netgen/include/include/opti.hpp +1 -0
- netgen/include/include/parallel.hpp +1 -0
- netgen/include/include/stlgeom.hpp +1 -0
- netgen/include/include/visual.hpp +1 -0
- netgen/include/interface/rw_medit.hpp +11 -0
- netgen/include/interface/writeuser.hpp +80 -0
- netgen/include/linalg/densemat.hpp +414 -0
- netgen/include/linalg/linalg.hpp +29 -0
- netgen/include/linalg/opti.hpp +142 -0
- netgen/include/linalg/polynomial.hpp +47 -0
- netgen/include/linalg/vector.hpp +217 -0
- netgen/include/meshing/adfront2.hpp +274 -0
- netgen/include/meshing/adfront3.hpp +332 -0
- netgen/include/meshing/basegeom.hpp +370 -0
- netgen/include/meshing/bcfunctions.hpp +53 -0
- netgen/include/meshing/bisect.hpp +72 -0
- netgen/include/meshing/boundarylayer.hpp +113 -0
- netgen/include/meshing/classifyhpel.hpp +1984 -0
- netgen/include/meshing/clusters.hpp +46 -0
- netgen/include/meshing/curvedelems.hpp +274 -0
- netgen/include/meshing/delaunay2d.hpp +73 -0
- netgen/include/meshing/fieldlines.hpp +103 -0
- netgen/include/meshing/findip.hpp +198 -0
- netgen/include/meshing/findip2.hpp +103 -0
- netgen/include/meshing/geomsearch.hpp +69 -0
- netgen/include/meshing/global.hpp +54 -0
- netgen/include/meshing/hpref_hex.hpp +330 -0
- netgen/include/meshing/hpref_prism.hpp +3405 -0
- netgen/include/meshing/hpref_pyramid.hpp +154 -0
- netgen/include/meshing/hpref_quad.hpp +2082 -0
- netgen/include/meshing/hpref_segm.hpp +122 -0
- netgen/include/meshing/hpref_tet.hpp +4230 -0
- netgen/include/meshing/hpref_trig.hpp +848 -0
- netgen/include/meshing/hprefinement.hpp +366 -0
- netgen/include/meshing/improve2.hpp +178 -0
- netgen/include/meshing/improve3.hpp +151 -0
- netgen/include/meshing/localh.hpp +223 -0
- netgen/include/meshing/meshclass.hpp +1076 -0
- netgen/include/meshing/meshfunc.hpp +47 -0
- netgen/include/meshing/meshing.hpp +63 -0
- netgen/include/meshing/meshing2.hpp +163 -0
- netgen/include/meshing/meshing3.hpp +123 -0
- netgen/include/meshing/meshtool.hpp +90 -0
- netgen/include/meshing/meshtype.hpp +1930 -0
- netgen/include/meshing/msghandler.hpp +62 -0
- netgen/include/meshing/paralleltop.hpp +172 -0
- netgen/include/meshing/python_mesh.hpp +206 -0
- netgen/include/meshing/ruler2.hpp +172 -0
- netgen/include/meshing/ruler3.hpp +211 -0
- netgen/include/meshing/soldata.hpp +141 -0
- netgen/include/meshing/specials.hpp +17 -0
- netgen/include/meshing/surfacegeom.hpp +73 -0
- netgen/include/meshing/topology.hpp +1003 -0
- netgen/include/meshing/validate.hpp +21 -0
- netgen/include/meshing/visual_interface.hpp +71 -0
- netgen/include/mydefs.hpp +70 -0
- netgen/include/nginterface.h +474 -0
- netgen/include/nginterface_v2.hpp +406 -0
- netgen/include/nglib.h +697 -0
- netgen/include/nglib_occ.h +50 -0
- netgen/include/occ/occ_edge.hpp +47 -0
- netgen/include/occ/occ_face.hpp +52 -0
- netgen/include/occ/occ_solid.hpp +23 -0
- netgen/include/occ/occ_utils.hpp +376 -0
- netgen/include/occ/occ_vertex.hpp +30 -0
- netgen/include/occ/occgeom.hpp +659 -0
- netgen/include/occ/occmeshsurf.hpp +168 -0
- netgen/include/occ/vsocc.hpp +33 -0
- netgen/include/pybind11/LICENSE +29 -0
- netgen/include/pybind11/attr.h +722 -0
- netgen/include/pybind11/buffer_info.h +208 -0
- netgen/include/pybind11/cast.h +2361 -0
- netgen/include/pybind11/chrono.h +228 -0
- netgen/include/pybind11/common.h +2 -0
- netgen/include/pybind11/complex.h +74 -0
- netgen/include/pybind11/conduit/README.txt +15 -0
- netgen/include/pybind11/conduit/pybind11_conduit_v1.h +116 -0
- netgen/include/pybind11/conduit/pybind11_platform_abi_id.h +87 -0
- netgen/include/pybind11/conduit/wrap_include_python_h.h +72 -0
- netgen/include/pybind11/critical_section.h +56 -0
- netgen/include/pybind11/detail/class.h +823 -0
- netgen/include/pybind11/detail/common.h +1348 -0
- netgen/include/pybind11/detail/cpp_conduit.h +75 -0
- netgen/include/pybind11/detail/descr.h +226 -0
- netgen/include/pybind11/detail/dynamic_raw_ptr_cast_if_possible.h +39 -0
- netgen/include/pybind11/detail/exception_translation.h +71 -0
- netgen/include/pybind11/detail/function_record_pyobject.h +191 -0
- netgen/include/pybind11/detail/init.h +538 -0
- netgen/include/pybind11/detail/internals.h +799 -0
- netgen/include/pybind11/detail/native_enum_data.h +209 -0
- netgen/include/pybind11/detail/pybind11_namespace_macros.h +82 -0
- netgen/include/pybind11/detail/struct_smart_holder.h +378 -0
- netgen/include/pybind11/detail/type_caster_base.h +1591 -0
- netgen/include/pybind11/detail/typeid.h +65 -0
- netgen/include/pybind11/detail/using_smart_holder.h +22 -0
- netgen/include/pybind11/detail/value_and_holder.h +90 -0
- netgen/include/pybind11/eigen/common.h +9 -0
- netgen/include/pybind11/eigen/matrix.h +723 -0
- netgen/include/pybind11/eigen/tensor.h +521 -0
- netgen/include/pybind11/eigen.h +12 -0
- netgen/include/pybind11/embed.h +320 -0
- netgen/include/pybind11/eval.h +161 -0
- netgen/include/pybind11/functional.h +147 -0
- netgen/include/pybind11/gil.h +199 -0
- netgen/include/pybind11/gil_safe_call_once.h +102 -0
- netgen/include/pybind11/gil_simple.h +37 -0
- netgen/include/pybind11/iostream.h +265 -0
- netgen/include/pybind11/native_enum.h +67 -0
- netgen/include/pybind11/numpy.h +2312 -0
- netgen/include/pybind11/operators.h +202 -0
- netgen/include/pybind11/options.h +92 -0
- netgen/include/pybind11/pybind11.h +3645 -0
- netgen/include/pybind11/pytypes.h +2680 -0
- netgen/include/pybind11/stl/filesystem.h +114 -0
- netgen/include/pybind11/stl.h +666 -0
- netgen/include/pybind11/stl_bind.h +858 -0
- netgen/include/pybind11/subinterpreter.h +299 -0
- netgen/include/pybind11/trampoline_self_life_support.h +65 -0
- netgen/include/pybind11/type_caster_pyobject_ptr.h +61 -0
- netgen/include/pybind11/typing.h +298 -0
- netgen/include/pybind11/warnings.h +75 -0
- netgen/include/stlgeom/meshstlsurface.hpp +67 -0
- netgen/include/stlgeom/stlgeom.hpp +491 -0
- netgen/include/stlgeom/stlline.hpp +193 -0
- netgen/include/stlgeom/stltool.hpp +331 -0
- netgen/include/stlgeom/stltopology.hpp +419 -0
- netgen/include/stlgeom/vsstl.hpp +58 -0
- netgen/include/visualization/meshdoc.hpp +42 -0
- netgen/include/visualization/mvdraw.hpp +325 -0
- netgen/include/visualization/vispar.hpp +128 -0
- netgen/include/visualization/visual.hpp +28 -0
- netgen/include/visualization/visual_api.hpp +10 -0
- netgen/include/visualization/vssolution.hpp +399 -0
- netgen/lib/libnggui.lib +0 -0
- netgen/lib/ngcore.lib +0 -0
- netgen/lib/nglib.lib +0 -0
- netgen/lib/togl.lib +0 -0
- netgen/libnggui.dll +0 -0
- netgen/libngguipy.lib +0 -0
- netgen/libngguipy.pyd +0 -0
- netgen/libngpy/_NgOCC.pyi +1545 -0
- netgen/libngpy/__init__.pyi +7 -0
- netgen/libngpy/_csg.pyi +259 -0
- netgen/libngpy/_geom2d.pyi +323 -0
- netgen/libngpy/_meshing.pyi +1111 -0
- netgen/libngpy/_stl.pyi +131 -0
- netgen/libngpy.lib +0 -0
- netgen/libngpy.pyd +0 -0
- netgen/meshing.py +65 -0
- netgen/ngcore.dll +0 -0
- netgen/nglib.dll +0 -0
- netgen/occ.py +52 -0
- netgen/read_gmsh.py +259 -0
- netgen/read_meshio.py +22 -0
- netgen/stl.py +2 -0
- netgen/togl.dll +0 -0
- netgen/version.py +2 -0
- netgen/webgui.py +529 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/boundarycondition.geo +16 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/boxcyl.geo +32 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/circle_on_cube.geo +27 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cone.geo +13 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cube.geo +16 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cubeandring.geo +55 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cubeandspheres.geo +21 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cubemcyl.geo +18 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cubemsphere.geo +19 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cylinder.geo +12 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cylsphere.geo +12 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/doc/ng4.pdf +0 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/ellipsoid.geo +8 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/ellipticcyl.geo +10 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/extrusion.geo +99 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/fichera.geo +24 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/frame.step +11683 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/hinge.stl +8486 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/lshape3d.geo +26 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/manyholes.geo +26 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/manyholes2.geo +26 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/matrix.geo +27 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/ortho.geo +11 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/part1.stl +2662 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/period.geo +33 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/py_tutorials/exportNeutral.py +26 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/py_tutorials/mesh.py +19 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/py_tutorials/shaft.geo +65 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/revolution.geo +18 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/screw.step +1694 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/sculpture.geo +13 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/shaft.geo +65 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/shell.geo +10 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/sphere.geo +8 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/sphereincube.geo +17 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/square.in2d +35 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/squarecircle.in2d +48 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/squarehole.in2d +47 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/torus.geo +8 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/trafo.geo +57 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/twobricks.geo +15 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/twocubes.geo +18 -0
- netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/twocyl.geo +16 -0
- netgen_mesher-6.2.2506.post35.dev0.dist-info/METADATA +15 -0
- netgen_mesher-6.2.2506.post35.dev0.dist-info/RECORD +340 -0
- netgen_mesher-6.2.2506.post35.dev0.dist-info/WHEEL +5 -0
- netgen_mesher-6.2.2506.post35.dev0.dist-info/entry_points.txt +2 -0
- netgen_mesher-6.2.2506.post35.dev0.dist-info/licenses/AUTHORS +1 -0
- netgen_mesher-6.2.2506.post35.dev0.dist-info/licenses/LICENSE +504 -0
- netgen_mesher-6.2.2506.post35.dev0.dist-info/top_level.txt +2 -0
- pyngcore/__init__.py +1 -0
- pyngcore/pyngcore.cp314-win_amd64.pyd +0 -0
|
@@ -0,0 +1,1053 @@
|
|
|
1
|
+
#ifndef NETGEN_CORE_SIMD_GENERIC_HPP
|
|
2
|
+
#define NETGEN_CORE_SIMD_GENERIC_HPP
|
|
3
|
+
|
|
4
|
+
/**************************************************************************/
|
|
5
|
+
/* File: simd_generic.hpp */
|
|
6
|
+
/* Author: Joachim Schoeberl, Matthias Hochsteger */
|
|
7
|
+
/* Date: 25. Mar. 16 */
|
|
8
|
+
/**************************************************************************/
|
|
9
|
+
|
|
10
|
+
#include <type_traits>
|
|
11
|
+
#include <functional>
|
|
12
|
+
#include <tuple>
|
|
13
|
+
#include <cmath>
|
|
14
|
+
|
|
15
|
+
#include "array.hpp"
|
|
16
|
+
|
|
17
|
+
namespace ngcore
|
|
18
|
+
{
|
|
19
|
+
// Default SIMD width, chosen at compile time from the instruction-set macros
// the compiler defines: 8 with AVX-512F, 4 with AVX, 2 otherwise.
#if defined(__AVX512F__)
#  define NETGEN_DEFAULT_SIMD_SIZE 8
#elif defined(__AVX__)
#  define NETGEN_DEFAULT_SIMD_SIZE 4
#else
#  define NETGEN_DEFAULT_SIMD_SIZE 2
#endif

// Constant-expression accessor for NETGEN_DEFAULT_SIMD_SIZE.
constexpr int GetDefaultSIMDSize() { return NETGEN_DEFAULT_SIMD_SIZE; }
|
|
30
|
+
|
|
31
|
+
// True when n is a SIMD width treated as native for this build:
// 1 and 2 always, 4 when __AVX__ is defined, 8 when __AVX512F__ is defined.
constexpr bool IsNativeSIMDSize(int n) {
  switch (n)
    {
    case 1:
    case 2:
#if defined __AVX__
    case 4:
#endif
#if defined __AVX512F__
    case 8:
#endif
      return true;
    default:
      return false;
    }
}
|
|
42
|
+
|
|
43
|
+
// split n = k+l such that k is the largest natively supported simd size < n
|
|
44
|
+
constexpr int GetLargestNativeSIMDPart(int n) {
|
|
45
|
+
int k = n-1;
|
|
46
|
+
while(!IsNativeSIMDSize(k))
|
|
47
|
+
k--;
|
|
48
|
+
return k;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
// Returns the largest power of two that is <= x.
// For x == 0 (where no such power exists) the result is 1.
constexpr size_t LargestPowerOfTwo (size_t x)
{
  size_t y = 1;
  // Test y <= x/2 rather than 2*y <= x: the two are equivalent for integers,
  // but 2*y wraps around to 0 once y reaches the highest bit, which made the
  // loop spin forever for x >= 2^(bits-1).
  while (y <= x/2) y *= 2;
  return y;
}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
template <typename T, int N=GetDefaultSIMDSize()> class SIMD;
|
|
60
|
+
|
|
61
|
+
class mask64;
|
|
62
|
+
|
|
63
|
+
////////////////////////////////////////////////////////////////////////////
|
|
64
|
+
namespace detail {

  // Worker for array_range: copies source[first + k] for every k in the index
  // pack into a new std::array with one element per pack entry.
  // No bounds checking is performed.
  template <typename T, size_t N, size_t... Offsets>
  std::array<T, sizeof...(Offsets)>
  array_range_impl(std::array<T, N> const& source,
                   size_t first,
                   std::index_sequence<Offsets...>)
  {
    return {source[first + Offsets]...};
  }

  // Returns the S consecutive elements of source starting at index first,
  // i.e. source[first] ... source[first+S-1], as a std::array<T, S>.
  template <size_t S, typename T, size_t N>
  auto array_range(std::array<T, N> const& source, size_t first)
  {
    using Indices = std::make_index_sequence<S>;
    return array_range_impl(source, first, Indices{});
  }

} // namespace detail
|
|
79
|
+
|
|
80
|
+
////////////////////////////////////////////////////////////////////////////
|
|
81
|
+
// mask
|
|
82
|
+
|
|
83
|
+
template <>
|
|
84
|
+
class SIMD<mask64,1>
|
|
85
|
+
{
|
|
86
|
+
int64_t mask;
|
|
87
|
+
public:
|
|
88
|
+
SIMD (int64_t i)
|
|
89
|
+
: mask(i > 0 ? -1 : 0) { ; }
|
|
90
|
+
bool Data() const { return mask; }
|
|
91
|
+
static constexpr int Size() { return 1; }
|
|
92
|
+
auto operator[] (int /* i */) const { return mask; }
|
|
93
|
+
};
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
template <int N>
|
|
97
|
+
class alignas(GetLargestNativeSIMDPart(N)*sizeof(int64_t)) SIMD<mask64,N>
|
|
98
|
+
{
|
|
99
|
+
// static constexpr int N1 = GetLargestNativeSIMDPart(N);
|
|
100
|
+
static constexpr size_t N1 = LargestPowerOfTwo(N-1);
|
|
101
|
+
static constexpr int N2 = N-N1;
|
|
102
|
+
|
|
103
|
+
SIMD<mask64,N1> lo;
|
|
104
|
+
SIMD<mask64,N2> hi;
|
|
105
|
+
public:
|
|
106
|
+
|
|
107
|
+
SIMD (int64_t i) : lo(i), hi(i-N1 ) { ; }
|
|
108
|
+
SIMD (SIMD<mask64,N1> lo_, SIMD<mask64,N2> hi_) : lo(lo_), hi(hi_) { ; }
|
|
109
|
+
SIMD<mask64,N1> Lo() const { return lo; }
|
|
110
|
+
SIMD<mask64,N2> Hi() const { return hi; }
|
|
111
|
+
static constexpr int Size() { return N; }
|
|
112
|
+
};
|
|
113
|
+
|
|
114
|
+
template<int N>
|
|
115
|
+
NETGEN_INLINE SIMD<mask64,N> operator&& (SIMD<mask64,N> a, SIMD<mask64,N> b)
|
|
116
|
+
{
|
|
117
|
+
if constexpr(N==1) return a.Data() && b.Data();
|
|
118
|
+
else return { a.Lo() && b.Lo(), a.Hi() && b.Hi() };
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
////////////////////////////////////////////////////////////////////////////
|
|
123
|
+
// int32
|
|
124
|
+
|
|
125
|
+
template<>
|
|
126
|
+
class SIMD<int32_t,1>
|
|
127
|
+
{
|
|
128
|
+
int32_t data;
|
|
129
|
+
|
|
130
|
+
public:
|
|
131
|
+
static constexpr int Size() { return 1; }
|
|
132
|
+
SIMD () {}
|
|
133
|
+
SIMD (const SIMD &) = default;
|
|
134
|
+
SIMD & operator= (const SIMD &) = default;
|
|
135
|
+
// SIMD (int val) : data{val} {}
|
|
136
|
+
SIMD (int32_t val) : data{val} {}
|
|
137
|
+
SIMD (size_t val) : data(val) {}
|
|
138
|
+
explicit SIMD (std::array<int32_t, 1> arr) : data{arr[0]} {}
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
int32_t operator[] (int i) const { return ((int32_t*)(&data))[i]; }
|
|
143
|
+
auto Data() const { return data; }
|
|
144
|
+
static SIMD FirstInt(int32_t n0=0) { return {n0}; }
|
|
145
|
+
template <int I>
|
|
146
|
+
int32_t Get()
|
|
147
|
+
{
|
|
148
|
+
static_assert(I==0);
|
|
149
|
+
return data;
|
|
150
|
+
}
|
|
151
|
+
};
|
|
152
|
+
|
|
153
|
+
template<int N>
|
|
154
|
+
class alignas(GetLargestNativeSIMDPart(N)*sizeof(int64_t)) SIMD<int32_t,N>
|
|
155
|
+
{
|
|
156
|
+
// static constexpr int N1 = GetLargestNativeSIMDPart(N);
|
|
157
|
+
static constexpr size_t N1 = LargestPowerOfTwo(N-1);
|
|
158
|
+
static constexpr int N2 = N-N1;
|
|
159
|
+
|
|
160
|
+
SIMD<int32_t,N1> lo;
|
|
161
|
+
SIMD<int32_t,N2> high;
|
|
162
|
+
|
|
163
|
+
public:
|
|
164
|
+
static constexpr int Size() { return N; }
|
|
165
|
+
|
|
166
|
+
SIMD () {}
|
|
167
|
+
SIMD (const SIMD &) = default;
|
|
168
|
+
SIMD & operator= (const SIMD &) = default;
|
|
169
|
+
|
|
170
|
+
// SIMD (int val) : lo{val}, high{val} { ; }
|
|
171
|
+
SIMD (int32_t val) : lo{val}, high{val} { ; }
|
|
172
|
+
SIMD (size_t val) : lo{val}, high{val} { ; }
|
|
173
|
+
SIMD (int32_t * p) : lo{p}, high{p+N1} { ; }
|
|
174
|
+
|
|
175
|
+
SIMD (SIMD<int32_t,N1> lo_, SIMD<int32_t,N2> high_) : lo(lo_), high(high_) { ; }
|
|
176
|
+
|
|
177
|
+
explicit SIMD( std::array<int32_t, N> arr )
|
|
178
|
+
: lo(detail::array_range<N1>(arr, 0)),
|
|
179
|
+
high(detail::array_range<N2>(arr, N1))
|
|
180
|
+
{}
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
template<typename ...T>
|
|
184
|
+
explicit SIMD(const T... vals)
|
|
185
|
+
: lo(detail::array_range<N1>(std::array<int32_t, N>{vals...}, 0)),
|
|
186
|
+
high(detail::array_range<N2>(std::array<int32_t, N>{vals...}, N1))
|
|
187
|
+
{
|
|
188
|
+
static_assert(sizeof...(vals)==N, "wrong number of arguments");
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<int32_t(int)>>::value, int>::type = 0>
|
|
193
|
+
SIMD (const T & func)
|
|
194
|
+
{
|
|
195
|
+
for(auto i : IntRange(N1))
|
|
196
|
+
lo[i] = func(i);
|
|
197
|
+
for(auto i : IntRange(N2))
|
|
198
|
+
high[i] = func(N1+i);
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
auto Lo() const { return lo; }
|
|
202
|
+
auto Hi() const { return high; }
|
|
203
|
+
|
|
204
|
+
int32_t operator[] (int i) const { return ((int32_t*)(&lo))[i]; }
|
|
205
|
+
|
|
206
|
+
void Store (int32_t * p) { lo.Store(p); high.Store(p+N1); }
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
/*
|
|
210
|
+
operator tuple<int32_t&,int32_t&,int32_t&,int32_t&> ()
|
|
211
|
+
{ return tuple<int32_t&,int32_t&,int32_t&,int32_t&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
|
|
212
|
+
*/
|
|
213
|
+
|
|
214
|
+
/*
|
|
215
|
+
static SIMD FirstInt() { return { 0, 1, 2, 3 }; }
|
|
216
|
+
*/
|
|
217
|
+
static SIMD FirstInt(int32_t n0=0) { return {SIMD<int32_t,N1>::FirstInt(n0), SIMD<int32_t,N2>::FirstInt(n0+N1)}; }
|
|
218
|
+
template <int I>
|
|
219
|
+
int32_t Get()
|
|
220
|
+
{
|
|
221
|
+
static_assert(I>=0 && I<N, "Index out of range");
|
|
222
|
+
if constexpr(I<N1) return lo.template Get<I>();
|
|
223
|
+
else return high.template Get<I-N1>();
|
|
224
|
+
}
|
|
225
|
+
};
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
////////////////////////////////////////////////////////////////////////////
|
|
230
|
+
// int64
|
|
231
|
+
|
|
232
|
+
template<>
|
|
233
|
+
class SIMD<int64_t,1>
|
|
234
|
+
{
|
|
235
|
+
int64_t data;
|
|
236
|
+
|
|
237
|
+
public:
|
|
238
|
+
static constexpr int Size() { return 1; }
|
|
239
|
+
SIMD () {}
|
|
240
|
+
SIMD (const SIMD &) = default;
|
|
241
|
+
SIMD & operator= (const SIMD &) = default;
|
|
242
|
+
SIMD (int val) : data{val} {}
|
|
243
|
+
SIMD (int64_t val) : data{val} {}
|
|
244
|
+
SIMD (size_t val) : data(val) {}
|
|
245
|
+
explicit SIMD (std::array<int64_t, 1> arr)
|
|
246
|
+
: data{arr[0]}
|
|
247
|
+
{}
|
|
248
|
+
|
|
249
|
+
int64_t operator[] (int i) const { return ((int64_t*)(&data))[i]; }
|
|
250
|
+
auto Data() const { return data; }
|
|
251
|
+
static SIMD FirstInt(int64_t n0=0) { return {n0}; }
|
|
252
|
+
template <int I>
|
|
253
|
+
int64_t Get()
|
|
254
|
+
{
|
|
255
|
+
static_assert(I==0);
|
|
256
|
+
return data;
|
|
257
|
+
}
|
|
258
|
+
};
|
|
259
|
+
|
|
260
|
+
template<int N>
|
|
261
|
+
class alignas(GetLargestNativeSIMDPart(N)*sizeof(int64_t)) SIMD<int64_t,N>
|
|
262
|
+
{
|
|
263
|
+
// static constexpr int N1 = GetLargestNativeSIMDPart(N);
|
|
264
|
+
static constexpr size_t N1 = LargestPowerOfTwo(N-1);
|
|
265
|
+
static constexpr int N2 = N-N1;
|
|
266
|
+
|
|
267
|
+
SIMD<int64_t,N1> lo;
|
|
268
|
+
SIMD<int64_t,N2> high;
|
|
269
|
+
|
|
270
|
+
public:
|
|
271
|
+
static constexpr int Size() { return N; }
|
|
272
|
+
|
|
273
|
+
SIMD () {}
|
|
274
|
+
SIMD (const SIMD &) = default;
|
|
275
|
+
SIMD & operator= (const SIMD &) = default;
|
|
276
|
+
|
|
277
|
+
SIMD (int val) : lo{val}, high{val} { ; }
|
|
278
|
+
SIMD (int64_t val) : lo{val}, high{val} { ; }
|
|
279
|
+
SIMD (size_t val) : lo{val}, high{val} { ; }
|
|
280
|
+
SIMD (SIMD<int64_t,N1> lo_, SIMD<int64_t,N2> high_) : lo(lo_), high(high_) { ; }
|
|
281
|
+
|
|
282
|
+
explicit SIMD( std::array<int64_t, N> arr )
|
|
283
|
+
: lo(detail::array_range<N1>(arr, 0)),
|
|
284
|
+
high(detail::array_range<N2>(arr, N1))
|
|
285
|
+
{}
|
|
286
|
+
|
|
287
|
+
template<typename ...T>
|
|
288
|
+
explicit SIMD(const T... vals)
|
|
289
|
+
: lo(detail::array_range<N1>(std::array<int64_t, N>{vals...}, 0)),
|
|
290
|
+
high(detail::array_range<N2>(std::array<int64_t, N>{vals...}, N1))
|
|
291
|
+
{
|
|
292
|
+
static_assert(sizeof...(vals)==N, "wrong number of arguments");
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
template<typename T, typename std::enable_if<std::is_convertible<T, std::function<int64_t(int)>>::value, int>::type = 0>
|
|
297
|
+
SIMD (const T & func)
|
|
298
|
+
{
|
|
299
|
+
for(auto i : IntRange(N1))
|
|
300
|
+
lo[i] = func(i);
|
|
301
|
+
for(auto i : IntRange(N2))
|
|
302
|
+
high[i] = func(N1+i);
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
auto Lo() const { return lo; }
|
|
306
|
+
auto Hi() const { return high; }
|
|
307
|
+
|
|
308
|
+
int64_t operator[] (int i) const { return ((int64_t*)(&lo))[i]; }
|
|
309
|
+
|
|
310
|
+
/*
|
|
311
|
+
operator tuple<int64_t&,int64_t&,int64_t&,int64_t&> ()
|
|
312
|
+
{ return tuple<int64_t&,int64_t&,int64_t&,int64_t&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
|
|
313
|
+
*/
|
|
314
|
+
|
|
315
|
+
/*
|
|
316
|
+
static SIMD FirstInt() { return { 0, 1, 2, 3 }; }
|
|
317
|
+
*/
|
|
318
|
+
static SIMD FirstInt(int64_t n0=0) { return {SIMD<int64_t,N1>::FirstInt(n0), SIMD<int64_t,N2>::FirstInt(n0+N1)}; }
|
|
319
|
+
template <int I>
|
|
320
|
+
int64_t Get()
|
|
321
|
+
{
|
|
322
|
+
static_assert(I>=0 && I<N, "Index out of range");
|
|
323
|
+
if constexpr(I<N1) return lo.template Get<I>();
|
|
324
|
+
else return high.template Get<I-N1>();
|
|
325
|
+
}
|
|
326
|
+
};
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
////////////////////////////////////////////////////////////////////////////
|
|
331
|
+
// double
|
|
332
|
+
|
|
333
|
+
template<>
|
|
334
|
+
class SIMD<double,1>
|
|
335
|
+
{
|
|
336
|
+
double data;
|
|
337
|
+
|
|
338
|
+
public:
|
|
339
|
+
static constexpr int Size() { return 1; }
|
|
340
|
+
SIMD () {}
|
|
341
|
+
SIMD (const SIMD &) = default;
|
|
342
|
+
SIMD & operator= (const SIMD &) = default;
|
|
343
|
+
SIMD (double val) { data = val; }
|
|
344
|
+
SIMD (int val) { data = val; }
|
|
345
|
+
SIMD (size_t val) { data = val; }
|
|
346
|
+
SIMD (double const * p) { data = *p; }
|
|
347
|
+
SIMD (double const * p, SIMD<mask64,1> mask) { data = mask.Data() ? *p : 0.0; }
|
|
348
|
+
explicit SIMD (std::array<double, 1> arr)
|
|
349
|
+
: data{arr[0]}
|
|
350
|
+
{}
|
|
351
|
+
|
|
352
|
+
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
|
|
353
|
+
SIMD (const T & func)
|
|
354
|
+
{
|
|
355
|
+
data = func(0);
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
|
|
359
|
+
SIMD & operator= (const T & func)
|
|
360
|
+
{
|
|
361
|
+
data = func(0);
|
|
362
|
+
return *this;
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
void Store (double * p) { *p = data; }
|
|
366
|
+
void Store (double * p, SIMD<mask64,1> mask) { if (mask.Data()) *p = data; }
|
|
367
|
+
|
|
368
|
+
double operator[] (int i) const { return ((double*)(&data))[i]; }
|
|
369
|
+
double Data() const { return data; }
|
|
370
|
+
template <int I>
|
|
371
|
+
double Get()
|
|
372
|
+
{
|
|
373
|
+
static_assert(I==0);
|
|
374
|
+
return data;
|
|
375
|
+
}
|
|
376
|
+
};
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
template<int N>
|
|
380
|
+
class alignas(GetLargestNativeSIMDPart(N)*sizeof(double)) SIMD<double, N>
|
|
381
|
+
{
|
|
382
|
+
// static constexpr int N1 = GetLargestNativeSIMDPart(N);
|
|
383
|
+
static constexpr size_t N1 = LargestPowerOfTwo(N-1);
|
|
384
|
+
static constexpr int N2 = N-N1;
|
|
385
|
+
|
|
386
|
+
SIMD<double, N1> lo;
|
|
387
|
+
SIMD<double, N2> high;
|
|
388
|
+
|
|
389
|
+
public:
|
|
390
|
+
static constexpr int Size() { return N; }
|
|
391
|
+
SIMD () {}
|
|
392
|
+
SIMD (const SIMD &) = default;
|
|
393
|
+
SIMD (SIMD<double,N1> lo_, SIMD<double,N2> hi_) : lo(lo_), high(hi_) { ; }
|
|
394
|
+
|
|
395
|
+
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
|
|
396
|
+
SIMD (const T & func)
|
|
397
|
+
{
|
|
398
|
+
double *p = (double*)this;
|
|
399
|
+
for(auto i : IntRange(N))
|
|
400
|
+
p[i] = func(i);
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
template <typename T, typename std::enable_if<std::is_convertible<T,std::function<double(int)>>::value,int>::type = 0>
|
|
404
|
+
SIMD & operator= (const T & func)
|
|
405
|
+
{
|
|
406
|
+
double *p = (double*)this;
|
|
407
|
+
for(auto i : IntRange(N))
|
|
408
|
+
p[i] = func(i);
|
|
409
|
+
return *this;
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
SIMD & operator= (const SIMD &) = default;
|
|
414
|
+
|
|
415
|
+
SIMD (double val) : lo{val}, high{val} { ; }
|
|
416
|
+
SIMD (int val) : lo{val}, high{val} { ; }
|
|
417
|
+
SIMD (size_t val) : lo{val}, high{val} { ; }
|
|
418
|
+
|
|
419
|
+
SIMD (double const * p) : lo{p}, high{p+N1} { ; }
|
|
420
|
+
SIMD (double const * p, SIMD<mask64,N> mask)
|
|
421
|
+
: lo{p, mask.Lo()}, high{p+N1, mask.Hi()}
|
|
422
|
+
{ }
|
|
423
|
+
SIMD (double * p) : lo{p}, high{p+N1} { ; }
|
|
424
|
+
SIMD (double * p, SIMD<mask64,N> mask)
|
|
425
|
+
: lo{p, mask.Lo()}, high{p+N1, mask.Hi()}
|
|
426
|
+
{ }
|
|
427
|
+
|
|
428
|
+
explicit SIMD( std::array<double, N> arr )
|
|
429
|
+
: lo(detail::array_range<N1>(arr, 0)),
|
|
430
|
+
high(detail::array_range<N2>(arr, N1))
|
|
431
|
+
{}
|
|
432
|
+
|
|
433
|
+
template<typename ...T>
|
|
434
|
+
explicit SIMD(const T... vals)
|
|
435
|
+
: lo(detail::array_range<N1>(std::array<double, N>{vals...}, 0)),
|
|
436
|
+
high(detail::array_range<N2>(std::array<double, N>{vals...}, N1))
|
|
437
|
+
{
|
|
438
|
+
static_assert(sizeof...(vals)==N, "wrong number of arguments");
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
void Store (double * p) { lo.Store(p); high.Store(p+N1); }
|
|
442
|
+
void Store (double * p, SIMD<mask64,N> mask)
|
|
443
|
+
{
|
|
444
|
+
lo.Store(p, mask.Lo());
|
|
445
|
+
high.Store(p+N1, mask.Hi());
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
auto Lo() const { return lo; }
|
|
449
|
+
auto Hi() const { return high; }
|
|
450
|
+
|
|
451
|
+
double operator[] (int i) const { return ((double*)(&lo))[i]; }
|
|
452
|
+
|
|
453
|
+
template<typename=std::enable_if<N==2>>
|
|
454
|
+
operator std::tuple<double&,double&> ()
|
|
455
|
+
{
|
|
456
|
+
double *p = (double*)this;
|
|
457
|
+
return std::tuple<double&,double&>(p[0], p[1]);
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
template<typename=std::enable_if<N==4>>
|
|
461
|
+
operator std::tuple<double&,double&,double&,double&> ()
|
|
462
|
+
{ return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }
|
|
463
|
+
|
|
464
|
+
template <int I>
|
|
465
|
+
double Get()
|
|
466
|
+
{
|
|
467
|
+
static_assert(I>=0 && I<N, "Index out of range");
|
|
468
|
+
if constexpr(I<N1) return lo.template Get<I>();
|
|
469
|
+
else return high.template Get<I-N1>();
|
|
470
|
+
}
|
|
471
|
+
auto Data() const { return *this; }
|
|
472
|
+
};
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
// Generic operators for any arithmetic type/simd width
|
|
476
|
+
template <typename T, int N>
|
|
477
|
+
NETGEN_INLINE SIMD<T,N> operator+ (SIMD<T,N> a, SIMD<T,N> b) {
|
|
478
|
+
if constexpr(N==1) return a.Data()+b.Data();
|
|
479
|
+
else return { a.Lo()+b.Lo(), a.Hi()+b.Hi() };
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
template <typename T, int N>
|
|
483
|
+
NETGEN_INLINE SIMD<T,N> operator- (SIMD<T,N> a, SIMD<T,N> b) {
|
|
484
|
+
if constexpr(N==1) return a.Data()-b.Data();
|
|
485
|
+
else return { a.Lo()-b.Lo(), a.Hi()-b.Hi() };
|
|
486
|
+
}
|
|
487
|
+
template <typename T, int N>
|
|
488
|
+
NETGEN_INLINE SIMD<T,N> operator- (SIMD<T,N> a) {
|
|
489
|
+
if constexpr(N==1) return -a.Data();
|
|
490
|
+
else return { -a.Lo(), -a.Hi() };
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
template <typename T, int N>
|
|
494
|
+
NETGEN_INLINE SIMD<T,N> operator* (SIMD<T,N> a, SIMD<T,N> b) {
|
|
495
|
+
if constexpr(N==1) return a.Data()*b.Data();
|
|
496
|
+
else return { a.Lo()*b.Lo(), a.Hi()*b.Hi() };
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
template <typename T, int N>
|
|
500
|
+
NETGEN_INLINE SIMD<T,N> operator/ (SIMD<T,N> a, SIMD<T,N> b) {
|
|
501
|
+
if constexpr(N==1) return a.Data()/b.Data();
|
|
502
|
+
else return { a.Lo()/b.Lo(), a.Hi()/b.Hi() };
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
template <typename T, int N>
|
|
506
|
+
NETGEN_INLINE SIMD<mask64,N> operator< (SIMD<T,N> a, SIMD<T,N> b)
|
|
507
|
+
{
|
|
508
|
+
if constexpr(N==1) return a.Data() < b.Data();
|
|
509
|
+
else return { a.Lo()<b.Lo(), a.Hi()<b.Hi() };
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
template <typename T, int N>
|
|
513
|
+
NETGEN_INLINE SIMD<mask64,N> operator<= (SIMD<T,N> a, SIMD<T,N> b)
|
|
514
|
+
{
|
|
515
|
+
if constexpr(N==1) return a.Data() <= b.Data();
|
|
516
|
+
else return { a.Lo()<=b.Lo(), a.Hi()<=b.Hi() };
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
template <typename T, int N>
|
|
520
|
+
NETGEN_INLINE SIMD<mask64,N> operator> (SIMD<T,N> a, SIMD<T,N> b)
|
|
521
|
+
{
|
|
522
|
+
if constexpr(N==1) return a.Data() > b.Data();
|
|
523
|
+
else return { a.Lo()>b.Lo(), a.Hi()>b.Hi() };
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
template <typename T, int N>
|
|
527
|
+
NETGEN_INLINE SIMD<mask64,N> operator>= (SIMD<T,N> a, SIMD<T,N> b)
|
|
528
|
+
{
|
|
529
|
+
if constexpr(N==1) return a.Data() >= b.Data();
|
|
530
|
+
else return { a.Lo()>=b.Lo(), a.Hi()>=b.Hi() };
|
|
531
|
+
}
|
|
532
|
+
|
|
533
|
+
template <typename T, int N>
|
|
534
|
+
NETGEN_INLINE SIMD<mask64,N> operator== (SIMD<T,N> a, SIMD<T,N> b)
|
|
535
|
+
{
|
|
536
|
+
if constexpr(N==1) return a.Data() == b.Data();
|
|
537
|
+
else return { a.Lo()==b.Lo(), a.Hi()==b.Hi() };
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
template <typename T, int N>
|
|
541
|
+
NETGEN_INLINE SIMD<mask64,N> operator!= (SIMD<T,N> a, SIMD<T,N> b)
|
|
542
|
+
{
|
|
543
|
+
if constexpr(N==1) return a.Data() != b.Data();
|
|
544
|
+
else return { a.Lo()!=b.Lo(), a.Hi()!=b.Hi() };
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
template <int N>
|
|
548
|
+
NETGEN_INLINE SIMD<int64_t,N> operator& (SIMD<int64_t,N> a, SIMD<int64_t,N> b)
|
|
549
|
+
{
|
|
550
|
+
if constexpr(N==1) return a.Data() & b.Data();
|
|
551
|
+
else return { (a.Lo()&b.Lo()), (a.Hi()&b.Hi()) };
|
|
552
|
+
}
|
|
553
|
+
template <int N>
|
|
554
|
+
NETGEN_INLINE SIMD<int64_t,N> operator| (SIMD<int64_t,N> a, SIMD<int64_t,N> b)
|
|
555
|
+
{
|
|
556
|
+
if constexpr(N==1) return a.Data() & b.Data();
|
|
557
|
+
else return { (a.Lo()|b.Lo()), (a.Hi()|b.Hi()) };
|
|
558
|
+
}
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
// int64_t operators with scalar operand (implement overloads to allow implicit casts for second operand)
|
|
562
|
+
template <int N>
|
|
563
|
+
NETGEN_INLINE SIMD<int64_t,N> operator+ (SIMD<int64_t,N> a, int64_t b) { return a+SIMD<int64_t,N>(b); }
|
|
564
|
+
template <int N>
|
|
565
|
+
NETGEN_INLINE SIMD<int64_t,N> operator+ (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)+b; }
|
|
566
|
+
template <int N>
|
|
567
|
+
NETGEN_INLINE SIMD<int64_t,N> operator- (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)-b; }
|
|
568
|
+
template <int N>
|
|
569
|
+
NETGEN_INLINE SIMD<int64_t,N> operator- (SIMD<int64_t,N> a, int64_t b) { return a-SIMD<int64_t,N>(b); }
|
|
570
|
+
template <int N>
|
|
571
|
+
NETGEN_INLINE SIMD<int64_t,N> operator* (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)*b; }
|
|
572
|
+
template <int N>
|
|
573
|
+
NETGEN_INLINE SIMD<int64_t,N> operator* (SIMD<int64_t,N> b, int64_t a) { return SIMD<int64_t,N>(a)*b; }
|
|
574
|
+
template <int N>
|
|
575
|
+
NETGEN_INLINE SIMD<int64_t,N> operator/ (SIMD<int64_t,N> a, int64_t b) { return a/SIMD<int64_t,N>(b); }
|
|
576
|
+
template <int N>
|
|
577
|
+
NETGEN_INLINE SIMD<int64_t,N> operator/ (int64_t a, SIMD<int64_t,N> b) { return SIMD<int64_t,N>(a)/b; }
|
|
578
|
+
template <int N>
|
|
579
|
+
NETGEN_INLINE SIMD<int64_t,N> & operator+= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a=a+b; return a; }
|
|
580
|
+
template <int N>
|
|
581
|
+
NETGEN_INLINE SIMD<int64_t,N> & operator+= (SIMD<int64_t,N> & a, int64_t b) { a+=SIMD<int64_t,N>(b); return a; }
|
|
582
|
+
template <int N>
|
|
583
|
+
NETGEN_INLINE SIMD<int64_t,N> & operator-= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a = a-b; return a; }
|
|
584
|
+
template <int N>
|
|
585
|
+
NETGEN_INLINE SIMD<int64_t,N> & operator-= (SIMD<int64_t,N> & a, int64_t b) { a-=SIMD<int64_t,N>(b); return a; }
|
|
586
|
+
template <int N>
|
|
587
|
+
NETGEN_INLINE SIMD<int64_t,N> & operator*= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a=a*b; return a; }
|
|
588
|
+
template <int N>
|
|
589
|
+
NETGEN_INLINE SIMD<int64_t,N> & operator*= (SIMD<int64_t,N> & a, int64_t b) { a*=SIMD<int64_t,N>(b); return a; }
|
|
590
|
+
template <int N>
|
|
591
|
+
NETGEN_INLINE SIMD<int64_t,N> & operator/= (SIMD<int64_t,N> & a, SIMD<int64_t,N> b) { a = a/b; return a; }
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
// double operators with scalar operand (implement overloads to allow implicit casts for second operand)
|
|
595
|
+
template <int N>
|
|
596
|
+
NETGEN_INLINE SIMD<double,N> operator+ (SIMD<double,N> a, double b) { return a+SIMD<double,N>(b); }
|
|
597
|
+
template <int N>
|
|
598
|
+
NETGEN_INLINE SIMD<double,N> operator+ (double a, SIMD<double,N> b) { return SIMD<double,N>(a)+b; }
|
|
599
|
+
template <int N>
|
|
600
|
+
NETGEN_INLINE SIMD<double,N> operator- (double a, SIMD<double,N> b) { return SIMD<double,N>(a)-b; }
|
|
601
|
+
template <int N>
|
|
602
|
+
NETGEN_INLINE SIMD<double,N> operator- (SIMD<double,N> a, double b) { return a-SIMD<double,N>(b); }
|
|
603
|
+
template <int N>
|
|
604
|
+
NETGEN_INLINE SIMD<double,N> operator* (double a, SIMD<double,N> b) { return SIMD<double,N>(a)*b; }
|
|
605
|
+
template <int N>
|
|
606
|
+
NETGEN_INLINE SIMD<double,N> operator* (SIMD<double,N> b, double a) { return SIMD<double,N>(a)*b; }
|
|
607
|
+
template <int N>
|
|
608
|
+
NETGEN_INLINE SIMD<double,N> operator/ (SIMD<double,N> a, double b) { return a/SIMD<double,N>(b); }
|
|
609
|
+
template <int N>
|
|
610
|
+
NETGEN_INLINE SIMD<double,N> operator/ (double a, SIMD<double,N> b) { return SIMD<double,N>(a)/b; }
|
|
611
|
+
template <int N>
|
|
612
|
+
NETGEN_INLINE SIMD<double,N> & operator+= (SIMD<double,N> & a, SIMD<double,N> b) { a=a+b; return a; }
|
|
613
|
+
template <int N>
|
|
614
|
+
NETGEN_INLINE SIMD<double,N> & operator+= (SIMD<double,N> & a, double b) { a+=SIMD<double,N>(b); return a; }
|
|
615
|
+
template <int N>
|
|
616
|
+
NETGEN_INLINE SIMD<double,N> & operator-= (SIMD<double,N> & a, SIMD<double,N> b) { a = a-b; return a; }
|
|
617
|
+
template <int N>
|
|
618
|
+
NETGEN_INLINE SIMD<double,N> & operator-= (SIMD<double,N> & a, double b) { a-=SIMD<double,N>(b); return a; }
|
|
619
|
+
template <int N>
|
|
620
|
+
NETGEN_INLINE SIMD<double,N> & operator*= (SIMD<double,N> & a, SIMD<double,N> b) { a=a*b; return a; }
|
|
621
|
+
template <int N>
|
|
622
|
+
NETGEN_INLINE SIMD<double,N> & operator*= (SIMD<double,N> & a, double b) { a*=SIMD<double,N>(b); return a; }
|
|
623
|
+
template <int N>
|
|
624
|
+
NETGEN_INLINE SIMD<double,N> & operator/= (SIMD<double,N> & a, SIMD<double,N> b) { a = a/b; return a; }
|
|
625
|
+
|
|
626
|
+
template <int N>
|
|
627
|
+
NETGEN_INLINE auto operator> (SIMD<double,N> & a, double b) { return a > SIMD<double,N>(b); }
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
// double functions
|
|
631
|
+
|
|
632
|
+
template <int N>
|
|
633
|
+
NETGEN_INLINE SIMD<double,N> L2Norm2 (SIMD<double,N> a) { return a*a; }
|
|
634
|
+
template <int N>
|
|
635
|
+
NETGEN_INLINE SIMD<double,N> Trans (SIMD<double,N> a) { return a; }
|
|
636
|
+
|
|
637
|
+
template <int N>
|
|
638
|
+
NETGEN_INLINE double HSum (SIMD<double,N> a)
|
|
639
|
+
{
|
|
640
|
+
if constexpr(N==1)
|
|
641
|
+
return a.Data();
|
|
642
|
+
else
|
|
643
|
+
return HSum(a.Lo()) + HSum(a.Hi());
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
template<typename T, int N>
|
|
648
|
+
NETGEN_INLINE SIMD<T,N> IfPos (SIMD<T,N> a, SIMD<T,N> b, SIMD<T,N> c)
|
|
649
|
+
{
|
|
650
|
+
if constexpr(N==1) return a.Data()>0.0 ? b : c;
|
|
651
|
+
else return { IfPos(a.Lo(), b.Lo(), c.Lo()), IfPos(a.Hi(), b.Hi(), c.Hi())};
|
|
652
|
+
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
template<typename T, int N>
|
|
656
|
+
NETGEN_INLINE SIMD<T,N> IfZero (SIMD<T,N> a, SIMD<T,N> b, SIMD<T,N> c)
|
|
657
|
+
{
|
|
658
|
+
if constexpr(N==1) return a.Data()==0.0 ? b : c;
|
|
659
|
+
else return { IfZero(a.Lo(), b.Lo(), c.Lo()), IfZero(a.Hi(), b.Hi(), c.Hi())};
|
|
660
|
+
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
template<typename T, int N>
|
|
664
|
+
NETGEN_INLINE SIMD<T,N> If (SIMD<mask64,N> a, SIMD<T,N> b, SIMD<T,N> c)
|
|
665
|
+
{
|
|
666
|
+
if constexpr(N==1) return a.Data() ? b : c;
|
|
667
|
+
else return { If(a.Lo(), b.Lo(), c.Lo()), If(a.Hi(), b.Hi(), c.Hi())};
|
|
668
|
+
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
// a*b+c
|
|
672
|
+
template <typename T1, typename T2, typename T3>
|
|
673
|
+
NETGEN_INLINE auto FMA(T1 a, T2 b, T3 c)
|
|
674
|
+
{
|
|
675
|
+
return c+a*b;
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
template <typename T1, typename T2, typename T3>
|
|
679
|
+
NETGEN_INLINE auto FNMA(T1 a, T2 b, T3 c)
|
|
680
|
+
{
|
|
681
|
+
return c-a*b;
|
|
682
|
+
}
|
|
683
|
+
|
|
684
|
+
// update form of fma
|
|
685
|
+
template <int N>
|
|
686
|
+
void FMAasm (SIMD<double,N> a, SIMD<double,N> b, SIMD<double,N> & sum)
|
|
687
|
+
{
|
|
688
|
+
sum = FMA(a,b,sum);
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
// update form of fms
|
|
692
|
+
template <int N>
|
|
693
|
+
void FNMAasm (SIMD<double,N> a, SIMD<double,N> b, SIMD<double,N> & sum)
|
|
694
|
+
{
|
|
695
|
+
// sum -= a*b;
|
|
696
|
+
sum = FNMA(a,b,sum);
|
|
697
|
+
}
|
|
698
|
+
|
|
699
|
+
// c += a*b (a0re, a0im, a1re, a1im, ...),
|
|
700
|
+
template <int N>
|
|
701
|
+
void FMAComplex (SIMD<double,N> a, SIMD<double,N> b, SIMD<double,N> & c)
|
|
702
|
+
{
|
|
703
|
+
auto [are, aim] = Unpack(a, a);
|
|
704
|
+
SIMD<double,N> bswap = SwapPairs(b);
|
|
705
|
+
SIMD<double,N> aim_bswap = aim*bswap;
|
|
706
|
+
c += FMAddSub (are, b, aim_bswap);
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
template <int i, typename T, int N>
|
|
710
|
+
T get(SIMD<T,N> a) { return a.template Get<i>(); }
|
|
711
|
+
|
|
712
|
+
template <int NUM, typename FUNC>
|
|
713
|
+
NETGEN_INLINE void Iterate2 (FUNC f)
|
|
714
|
+
{
|
|
715
|
+
if constexpr (NUM > 1) Iterate2<NUM-1> (f);
|
|
716
|
+
if constexpr (NUM >= 1) f(std::integral_constant<int,NUM-1>());
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
|
|
720
|
+
// Copies the object representation of a into a value of type T2.
// Both types must have the same size (checked at compile time).
template<typename T2, typename T1>
T2 BitCast(T1 a)
{
  static_assert(sizeof(T1) == sizeof(T2), "BitCast requires same size");
  T2 converted;
  memcpy(&converted, &a, sizeof(T1));
  return converted;
}
|
|
728
|
+
|
|
729
|
+
template <typename T, typename T1, int N>
|
|
730
|
+
SIMD<T, N> Reinterpret (SIMD<T1,N> a)
|
|
731
|
+
{
|
|
732
|
+
if constexpr (N == 1)
|
|
733
|
+
return SIMD<T,N> ( * (T*)(void*) & a.Data());
|
|
734
|
+
else if constexpr (N == 2)
|
|
735
|
+
return SIMD<T,N> { BitCast<T> (a.Lo()),
|
|
736
|
+
BitCast<T> (a.Hi()) };
|
|
737
|
+
else
|
|
738
|
+
return SIMD<T,N> (Reinterpret<T> (a.Lo()), Reinterpret<T> (a.Hi()));
|
|
739
|
+
}
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
using std::round;
|
|
743
|
+
template <int N>
|
|
744
|
+
SIMD<double,N> round (SIMD<double,N> x)
|
|
745
|
+
{
|
|
746
|
+
if constexpr (N == 1) return round(x);
|
|
747
|
+
else return { round(x.Lo()), round(x.Hi()) };
|
|
748
|
+
}
|
|
749
|
+
|
|
750
|
+
// NETGEN_INLINE int64_t RoundI (double x) { return lround(x); }
|
|
751
|
+
using std::lround;
|
|
752
|
+
template <int N>
|
|
753
|
+
SIMD<int64_t,N> lround (SIMD<double,N> x)
|
|
754
|
+
{
|
|
755
|
+
if constexpr (N == 1) return SIMD<int64_t,1> (lround(x));
|
|
756
|
+
else return { lround(x.Lo()), lround(x.Hi()) };
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
/*
|
|
760
|
+
reciprocal square root
|
|
761
|
+
Quake III algorithm, or intrinsics
|
|
762
|
+
*/
|
|
763
|
+
// Scalar reciprocal square root, computed as 1/sqrt(x).
NETGEN_INLINE double rsqrt (double x)
{
  const double root = sqrt(x);
  return 1.0/root;
}
|
|
764
|
+
template <int N>
|
|
765
|
+
SIMD<double,N> rsqrt (SIMD<double,N> x)
|
|
766
|
+
{
|
|
767
|
+
if constexpr (N == 1) return 1.0/sqrt(x.Data());
|
|
768
|
+
else return { rsqrt(x.Lo()), rsqrt(x.Hi()) };
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
template <int N>
|
|
772
|
+
int64_t operator<< (int64_t a, IC<N> n) { return a << n.value; }
|
|
773
|
+
|
|
774
|
+
template <int S, int N>
|
|
775
|
+
SIMD<int64_t,S> operator<< (SIMD<int64_t,S> a, IC<N> n)
|
|
776
|
+
{
|
|
777
|
+
if constexpr (S == 1) return SIMD<int64_t,1> (a.Data() << n);
|
|
778
|
+
else return SIMD<int64_t,S> (a.Lo() << n, a.Hi() << n);
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
template <typename T, int N>
|
|
785
|
+
auto Min (SIMD<T,N> a, SIMD<T,N> b)
|
|
786
|
+
{
|
|
787
|
+
if constexpr (N==1)
|
|
788
|
+
return SIMD<T,1> (std::min(a[0], b[0]));
|
|
789
|
+
else
|
|
790
|
+
return SIMD<T,N> (Min(a.Lo(), b.Lo()), Min(a.Hi(), b.Hi()));
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
template <typename T, int N>
|
|
794
|
+
auto Max (SIMD<T,N> a, SIMD<T,N> b)
|
|
795
|
+
{
|
|
796
|
+
if constexpr (N==1)
|
|
797
|
+
return SIMD<T,1> (std::max(a[0], b[0]));
|
|
798
|
+
else
|
|
799
|
+
return SIMD<T,N> (Max(a.Lo(), b.Lo()), Max(a.Hi(), b.Hi()));
|
|
800
|
+
}
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
template <typename T, int N>
|
|
807
|
+
ostream & operator<< (ostream & ost, SIMD<T,N> simd)
|
|
808
|
+
{
|
|
809
|
+
/*
|
|
810
|
+
ost << simd[0];
|
|
811
|
+
for (int i = 1; i < simd.Size(); i++)
|
|
812
|
+
ost << " " << simd[i];
|
|
813
|
+
*/
|
|
814
|
+
Iterate2<simd.Size()> ([&] (auto I) {
|
|
815
|
+
if (I.value != 0) ost << " ";
|
|
816
|
+
ost << get<I.value>(simd);
|
|
817
|
+
});
|
|
818
|
+
return ost;
|
|
819
|
+
}
|
|
820
|
+
|
|
821
|
+
using std::sqrt;
|
|
822
|
+
template <int N>
|
|
823
|
+
NETGEN_INLINE ngcore::SIMD<double,N> sqrt (ngcore::SIMD<double,N> a)
|
|
824
|
+
{
|
|
825
|
+
if constexpr (N == 1) return sqrt(a.Data());
|
|
826
|
+
else return { sqrt(a.Lo()), sqrt(a.Hi()) };
|
|
827
|
+
// return ngcore::SIMD<double,N>([a](int i)->double { return sqrt(a[i]); } );
|
|
828
|
+
}
|
|
829
|
+
|
|
830
|
+
using std::fabs;
|
|
831
|
+
template <int N>
|
|
832
|
+
NETGEN_INLINE ngcore::SIMD<double,N> fabs (ngcore::SIMD<double,N> a) {
|
|
833
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return fabs(a[i]); } );
|
|
834
|
+
}
|
|
835
|
+
|
|
836
|
+
using std::floor;
|
|
837
|
+
template <int N>
|
|
838
|
+
NETGEN_INLINE ngcore::SIMD<double,N> floor (ngcore::SIMD<double,N> a) {
|
|
839
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return floor(a[i]); } );
|
|
840
|
+
}
|
|
841
|
+
|
|
842
|
+
using std::ceil;
|
|
843
|
+
template <int N>
|
|
844
|
+
NETGEN_INLINE ngcore::SIMD<double,N> ceil (ngcore::SIMD<double,N> a) {
|
|
845
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return ceil(a[i]); } );
|
|
846
|
+
}
|
|
847
|
+
|
|
848
|
+
using std::exp;
|
|
849
|
+
template <int N>
|
|
850
|
+
NETGEN_INLINE ngcore::SIMD<double,N> exp (ngcore::SIMD<double,N> a) {
|
|
851
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return exp(a[i]); } );
|
|
852
|
+
}
|
|
853
|
+
|
|
854
|
+
using std::log;
|
|
855
|
+
template <int N>
|
|
856
|
+
NETGEN_INLINE ngcore::SIMD<double,N> log (ngcore::SIMD<double,N> a) {
|
|
857
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return log(a[i]); } );
|
|
858
|
+
}
|
|
859
|
+
|
|
860
|
+
using std::erf;
|
|
861
|
+
template <int N>
|
|
862
|
+
NETGEN_INLINE ngcore::SIMD<double,N> erf (ngcore::SIMD<double,N> a) {
|
|
863
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return erf(a[i]); } );
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
using std::pow;
|
|
867
|
+
template <int N>
|
|
868
|
+
NETGEN_INLINE ngcore::SIMD<double,N> pow (ngcore::SIMD<double,N> a, double x) {
|
|
869
|
+
return ngcore::SIMD<double,N>([a,x](int i)->double { return pow(a[i],x); } );
|
|
870
|
+
}
|
|
871
|
+
|
|
872
|
+
template <int N>
|
|
873
|
+
NETGEN_INLINE ngcore::SIMD<double,N> pow (ngcore::SIMD<double,N> a, ngcore::SIMD<double,N> b) {
|
|
874
|
+
return ngcore::SIMD<double,N>([a,b](int i)->double { return pow(a[i],b[i]); } );
|
|
875
|
+
}
|
|
876
|
+
|
|
877
|
+
using std::sin;
|
|
878
|
+
template <int N>
|
|
879
|
+
NETGEN_INLINE ngcore::SIMD<double,N> sin (ngcore::SIMD<double,N> a) {
|
|
880
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return sin(a[i]); } );
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
using std::cos;
|
|
884
|
+
template <int N>
|
|
885
|
+
NETGEN_INLINE ngcore::SIMD<double,N> cos (ngcore::SIMD<double,N> a) {
|
|
886
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return cos(a[i]); } );
|
|
887
|
+
}
|
|
888
|
+
|
|
889
|
+
using std::tan;
|
|
890
|
+
template <int N>
|
|
891
|
+
NETGEN_INLINE ngcore::SIMD<double,N> tan (ngcore::SIMD<double,N> a) {
|
|
892
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return tan(a[i]); } );
|
|
893
|
+
}
|
|
894
|
+
|
|
895
|
+
using std::atan;
|
|
896
|
+
template <int N>
|
|
897
|
+
NETGEN_INLINE ngcore::SIMD<double,N> atan (ngcore::SIMD<double,N> a) {
|
|
898
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return atan(a[i]); } );
|
|
899
|
+
}
|
|
900
|
+
|
|
901
|
+
using std::atan2;
|
|
902
|
+
template <int N>
|
|
903
|
+
NETGEN_INLINE ngcore::SIMD<double,N> atan2 (ngcore::SIMD<double,N> y, ngcore::SIMD<double,N> x) {
|
|
904
|
+
return ngcore::SIMD<double,N>([y,x](int i)->double { return atan2(y[i], x[i]); } );
|
|
905
|
+
}
|
|
906
|
+
|
|
907
|
+
using std::acos;
|
|
908
|
+
template <int N>
|
|
909
|
+
NETGEN_INLINE ngcore::SIMD<double,N> acos (ngcore::SIMD<double,N> a) {
|
|
910
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return acos(a[i]); } );
|
|
911
|
+
}
|
|
912
|
+
|
|
913
|
+
using std::asin;
|
|
914
|
+
template <int N>
|
|
915
|
+
NETGEN_INLINE ngcore::SIMD<double,N> asin (ngcore::SIMD<double,N> a) {
|
|
916
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return asin(a[i]); } );
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
using std::sinh;
|
|
920
|
+
template <int N>
|
|
921
|
+
NETGEN_INLINE ngcore::SIMD<double,N> sinh (ngcore::SIMD<double,N> a) {
|
|
922
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return sinh(a[i]); } );
|
|
923
|
+
}
|
|
924
|
+
|
|
925
|
+
using std::cosh;
|
|
926
|
+
template <int N>
|
|
927
|
+
NETGEN_INLINE ngcore::SIMD<double,N> cosh (ngcore::SIMD<double,N> a) {
|
|
928
|
+
return ngcore::SIMD<double,N>([a](int i)->double { return cosh(a[i]); } );
|
|
929
|
+
}
|
|
930
|
+
|
|
931
|
+
template<int N, typename T>
|
|
932
|
+
using MultiSIMD = SIMD<T, N*GetDefaultSIMDSize()>;
|
|
933
|
+
|
|
934
|
+
template<int N>
|
|
935
|
+
NETGEN_INLINE auto Unpack (SIMD<double,N> a, SIMD<double,N> b)
|
|
936
|
+
{
|
|
937
|
+
if constexpr(N==1)
|
|
938
|
+
{
|
|
939
|
+
return std::make_tuple(SIMD<double,N>{a.Data()}, SIMD<double,N>{b.Data()} );
|
|
940
|
+
}
|
|
941
|
+
else if constexpr(N==2)
|
|
942
|
+
{
|
|
943
|
+
return std::make_tuple(SIMD<double,N>{ a.Lo(), b.Lo() },
|
|
944
|
+
SIMD<double,N>{ a.Hi(), b.Hi() });
|
|
945
|
+
}
|
|
946
|
+
else
|
|
947
|
+
{
|
|
948
|
+
auto [a1,b1] = Unpack(a.Lo(), b.Lo());
|
|
949
|
+
auto [a2,b2] = Unpack(a.Hi(), b.Hi());
|
|
950
|
+
return std::make_tuple(SIMD<double,N>{ a1, a2 },
|
|
951
|
+
SIMD<double,N>{ b1, b2 });
|
|
952
|
+
}
|
|
953
|
+
}
|
|
954
|
+
|
|
955
|
+
// TODO: specialize for AVX, ...
|
|
956
|
+
template<int N>
|
|
957
|
+
NETGEN_INLINE auto SwapPairs (SIMD<double,N> a)
|
|
958
|
+
{
|
|
959
|
+
if constexpr(N==1) {
|
|
960
|
+
// static_assert(false);
|
|
961
|
+
return a;
|
|
962
|
+
}
|
|
963
|
+
else if constexpr(N==2) {
|
|
964
|
+
return SIMD<double,N> (a.Hi(), a.Lo());
|
|
965
|
+
}
|
|
966
|
+
else {
|
|
967
|
+
return SIMD<double,N> (SwapPairs(a.Lo()), SwapPairs(a.Hi()));
|
|
968
|
+
}
|
|
969
|
+
}
|
|
970
|
+
|
|
971
|
+
|
|
972
|
+
template<int N>
|
|
973
|
+
NETGEN_INLINE auto HSum128 (SIMD<double,N> a)
|
|
974
|
+
{
|
|
975
|
+
if constexpr(N==1) {
|
|
976
|
+
// static_assert(false);
|
|
977
|
+
return a;
|
|
978
|
+
}
|
|
979
|
+
else if constexpr(N==2) {
|
|
980
|
+
return a;
|
|
981
|
+
}
|
|
982
|
+
else {
|
|
983
|
+
return HSum128(a.Lo()) + HSum128(a.Hi());
|
|
984
|
+
}
|
|
985
|
+
}
|
|
986
|
+
|
|
987
|
+
|
|
988
|
+
// TODO: specialize for AVX, ...
|
|
989
|
+
// a*b+-c (even: -, odd: +)
|
|
990
|
+
template<int N>
|
|
991
|
+
NETGEN_INLINE auto FMAddSub (SIMD<double,N> a, SIMD<double,N> b, SIMD<double,N> c)
|
|
992
|
+
{
|
|
993
|
+
if constexpr(N==1) {
|
|
994
|
+
// static_assert(false);
|
|
995
|
+
return a*b-c;
|
|
996
|
+
}
|
|
997
|
+
else if constexpr(N==2) {
|
|
998
|
+
return SIMD<double,N> (a.Lo()*b.Lo()-c.Lo(),
|
|
999
|
+
a.Hi()*b.Hi()+c.Hi());
|
|
1000
|
+
}
|
|
1001
|
+
else {
|
|
1002
|
+
return SIMD<double,N> (FMAddSub(a.Lo(), b.Lo(), c.Lo()),
|
|
1003
|
+
FMAddSub(a.Hi(), b.Hi(), c.Hi()));
|
|
1004
|
+
}
|
|
1005
|
+
}
|
|
1006
|
+
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
|
|
1010
|
+
// Copies the elements at positions BASE+Is... of tup into a new tuple
// (created with std::make_tuple).
template <int BASE, typename Tuple, std::size_t ... Is>
auto subtuple (const Tuple& tup, std::index_sequence<Is...>)
{
  return std::make_tuple(std::get<static_cast<std::size_t>(BASE) + Is>(tup)...);
}
|
|
1015
|
+
|
|
1016
|
+
template <typename ...Args, typename T, int M>
|
|
1017
|
+
auto Concat (std::tuple<SIMD<T,M>, Args...> tup)
|
|
1018
|
+
{
|
|
1019
|
+
constexpr size_t N = std::tuple_size<std::tuple<SIMD<T,M>, Args...>>();
|
|
1020
|
+
|
|
1021
|
+
if constexpr (N == 1)
|
|
1022
|
+
return get<0>(tup);
|
|
1023
|
+
else
|
|
1024
|
+
{
|
|
1025
|
+
static constexpr size_t N1 = LargestPowerOfTwo(N-1);
|
|
1026
|
+
static constexpr int N2 = N-N1;
|
|
1027
|
+
|
|
1028
|
+
auto SEQ1 = std::make_index_sequence<N1>();
|
|
1029
|
+
auto sub1 = subtuple<0>(tup, SEQ1);
|
|
1030
|
+
|
|
1031
|
+
auto SEQ2 = std::make_index_sequence<N2>();
|
|
1032
|
+
auto sub2 = subtuple<N1>(tup, SEQ2);
|
|
1033
|
+
|
|
1034
|
+
auto S1 = Concat(sub1);
|
|
1035
|
+
auto S2 = Concat(sub2);
|
|
1036
|
+
return SIMD<T,S1.Size()+S2.Size()>(S1, S2);
|
|
1037
|
+
}
|
|
1038
|
+
}
|
|
1039
|
+
|
|
1040
|
+
|
|
1041
|
+
|
|
1042
|
+
}
|
|
1043
|
+
|
|
1044
|
+
|
|
1045
|
+
namespace std
|
|
1046
|
+
{
|
|
1047
|
+
// structured binding support
|
|
1048
|
+
template <typename T, int N >
|
|
1049
|
+
struct tuple_size<ngcore::SIMD<T,N>> : std::integral_constant<std::size_t, N> {};
|
|
1050
|
+
template<size_t N, typename T, int M> struct tuple_element<N,ngcore::SIMD<T,M>> { using type = T; };
|
|
1051
|
+
}
|
|
1052
|
+
|
|
1053
|
+
#endif // NETGEN_CORE_SIMD_GENERIC_HPP
|