netgen-mesher 6.2.2506.post35.dev0__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (340)
  1. netgen/NgOCC.py +7 -0
  2. netgen/__init__.py +114 -0
  3. netgen/__init__.pyi +22 -0
  4. netgen/__main__.py +53 -0
  5. netgen/cmake/NetgenConfig.cmake +79 -0
  6. netgen/cmake/netgen-targets-release.cmake +69 -0
  7. netgen/cmake/netgen-targets.cmake +146 -0
  8. netgen/config/__init__.py +1 -0
  9. netgen/config/__init__.pyi +52 -0
  10. netgen/config/__main__.py +4 -0
  11. netgen/config/config.py +68 -0
  12. netgen/config/config.pyi +54 -0
  13. netgen/csg.py +25 -0
  14. netgen/geom2d.py +178 -0
  15. netgen/gui.py +82 -0
  16. netgen/include/core/archive.hpp +1256 -0
  17. netgen/include/core/array.hpp +1760 -0
  18. netgen/include/core/autodiff.hpp +1131 -0
  19. netgen/include/core/autodiffdiff.hpp +733 -0
  20. netgen/include/core/bitarray.hpp +240 -0
  21. netgen/include/core/concurrentqueue.h +3619 -0
  22. netgen/include/core/exception.hpp +145 -0
  23. netgen/include/core/flags.hpp +199 -0
  24. netgen/include/core/hashtable.hpp +1281 -0
  25. netgen/include/core/localheap.hpp +318 -0
  26. netgen/include/core/logging.hpp +117 -0
  27. netgen/include/core/memtracer.hpp +221 -0
  28. netgen/include/core/mpi4py_pycapi.h +245 -0
  29. netgen/include/core/mpi_wrapper.hpp +643 -0
  30. netgen/include/core/ng_mpi.hpp +94 -0
  31. netgen/include/core/ng_mpi_generated_declarations.hpp +155 -0
  32. netgen/include/core/ng_mpi_native.hpp +25 -0
  33. netgen/include/core/ngcore.hpp +32 -0
  34. netgen/include/core/ngcore_api.hpp +152 -0
  35. netgen/include/core/ngstream.hpp +115 -0
  36. netgen/include/core/paje_trace.hpp +279 -0
  37. netgen/include/core/profiler.hpp +382 -0
  38. netgen/include/core/python_ngcore.hpp +457 -0
  39. netgen/include/core/ranges.hpp +109 -0
  40. netgen/include/core/register_archive.hpp +100 -0
  41. netgen/include/core/signal.hpp +82 -0
  42. netgen/include/core/simd.hpp +160 -0
  43. netgen/include/core/simd_arm64.hpp +407 -0
  44. netgen/include/core/simd_avx.hpp +394 -0
  45. netgen/include/core/simd_avx512.hpp +285 -0
  46. netgen/include/core/simd_generic.hpp +1053 -0
  47. netgen/include/core/simd_math.hpp +178 -0
  48. netgen/include/core/simd_sse.hpp +289 -0
  49. netgen/include/core/statushandler.hpp +37 -0
  50. netgen/include/core/symboltable.hpp +153 -0
  51. netgen/include/core/table.hpp +810 -0
  52. netgen/include/core/taskmanager.hpp +1161 -0
  53. netgen/include/core/type_traits.hpp +65 -0
  54. netgen/include/core/utils.hpp +385 -0
  55. netgen/include/core/version.hpp +102 -0
  56. netgen/include/core/xbool.hpp +47 -0
  57. netgen/include/csg/algprim.hpp +563 -0
  58. netgen/include/csg/brick.hpp +150 -0
  59. netgen/include/csg/csg.hpp +43 -0
  60. netgen/include/csg/csgeom.hpp +389 -0
  61. netgen/include/csg/csgparser.hpp +101 -0
  62. netgen/include/csg/curve2d.hpp +67 -0
  63. netgen/include/csg/edgeflw.hpp +112 -0
  64. netgen/include/csg/explicitcurve2d.hpp +113 -0
  65. netgen/include/csg/extrusion.hpp +185 -0
  66. netgen/include/csg/gencyl.hpp +70 -0
  67. netgen/include/csg/geoml.hpp +16 -0
  68. netgen/include/csg/identify.hpp +213 -0
  69. netgen/include/csg/manifold.hpp +29 -0
  70. netgen/include/csg/meshsurf.hpp +46 -0
  71. netgen/include/csg/polyhedra.hpp +121 -0
  72. netgen/include/csg/revolution.hpp +180 -0
  73. netgen/include/csg/singularref.hpp +84 -0
  74. netgen/include/csg/solid.hpp +295 -0
  75. netgen/include/csg/specpoin.hpp +194 -0
  76. netgen/include/csg/spline3d.hpp +99 -0
  77. netgen/include/csg/splinesurface.hpp +85 -0
  78. netgen/include/csg/surface.hpp +394 -0
  79. netgen/include/csg/triapprox.hpp +63 -0
  80. netgen/include/csg/vscsg.hpp +34 -0
  81. netgen/include/general/autodiff.hpp +356 -0
  82. netgen/include/general/autoptr.hpp +39 -0
  83. netgen/include/general/gzstream.h +121 -0
  84. netgen/include/general/hashtabl.hpp +1692 -0
  85. netgen/include/general/myadt.hpp +48 -0
  86. netgen/include/general/mystring.hpp +226 -0
  87. netgen/include/general/netgenout.hpp +205 -0
  88. netgen/include/general/ngarray.hpp +797 -0
  89. netgen/include/general/ngbitarray.hpp +149 -0
  90. netgen/include/general/ngpython.hpp +74 -0
  91. netgen/include/general/optmem.hpp +44 -0
  92. netgen/include/general/parthreads.hpp +138 -0
  93. netgen/include/general/seti.hpp +50 -0
  94. netgen/include/general/sort.hpp +47 -0
  95. netgen/include/general/spbita2d.hpp +59 -0
  96. netgen/include/general/stack.hpp +114 -0
  97. netgen/include/general/table.hpp +280 -0
  98. netgen/include/general/template.hpp +509 -0
  99. netgen/include/geom2d/csg2d.hpp +750 -0
  100. netgen/include/geom2d/geometry2d.hpp +280 -0
  101. netgen/include/geom2d/spline2d.hpp +234 -0
  102. netgen/include/geom2d/vsgeom2d.hpp +28 -0
  103. netgen/include/gprim/adtree.hpp +1392 -0
  104. netgen/include/gprim/geom2d.hpp +858 -0
  105. netgen/include/gprim/geom3d.hpp +749 -0
  106. netgen/include/gprim/geomfuncs.hpp +212 -0
  107. netgen/include/gprim/geomobjects.hpp +544 -0
  108. netgen/include/gprim/geomops.hpp +404 -0
  109. netgen/include/gprim/geomtest3d.hpp +101 -0
  110. netgen/include/gprim/gprim.hpp +33 -0
  111. netgen/include/gprim/spline.hpp +778 -0
  112. netgen/include/gprim/splinegeometry.hpp +73 -0
  113. netgen/include/gprim/transform3d.hpp +216 -0
  114. netgen/include/include/acisgeom.hpp +3 -0
  115. netgen/include/include/csg.hpp +1 -0
  116. netgen/include/include/geometry2d.hpp +1 -0
  117. netgen/include/include/gprim.hpp +1 -0
  118. netgen/include/include/incopengl.hpp +62 -0
  119. netgen/include/include/inctcl.hpp +13 -0
  120. netgen/include/include/incvis.hpp +6 -0
  121. netgen/include/include/linalg.hpp +1 -0
  122. netgen/include/include/meshing.hpp +1 -0
  123. netgen/include/include/myadt.hpp +1 -0
  124. netgen/include/include/mydefs.hpp +70 -0
  125. netgen/include/include/mystdlib.h +59 -0
  126. netgen/include/include/netgen_config.hpp +27 -0
  127. netgen/include/include/netgen_version.hpp +9 -0
  128. netgen/include/include/nginterface_v2_impl.hpp +395 -0
  129. netgen/include/include/ngsimd.hpp +1 -0
  130. netgen/include/include/occgeom.hpp +1 -0
  131. netgen/include/include/opti.hpp +1 -0
  132. netgen/include/include/parallel.hpp +1 -0
  133. netgen/include/include/stlgeom.hpp +1 -0
  134. netgen/include/include/visual.hpp +1 -0
  135. netgen/include/interface/rw_medit.hpp +11 -0
  136. netgen/include/interface/writeuser.hpp +80 -0
  137. netgen/include/linalg/densemat.hpp +414 -0
  138. netgen/include/linalg/linalg.hpp +29 -0
  139. netgen/include/linalg/opti.hpp +142 -0
  140. netgen/include/linalg/polynomial.hpp +47 -0
  141. netgen/include/linalg/vector.hpp +217 -0
  142. netgen/include/meshing/adfront2.hpp +274 -0
  143. netgen/include/meshing/adfront3.hpp +332 -0
  144. netgen/include/meshing/basegeom.hpp +370 -0
  145. netgen/include/meshing/bcfunctions.hpp +53 -0
  146. netgen/include/meshing/bisect.hpp +72 -0
  147. netgen/include/meshing/boundarylayer.hpp +113 -0
  148. netgen/include/meshing/classifyhpel.hpp +1984 -0
  149. netgen/include/meshing/clusters.hpp +46 -0
  150. netgen/include/meshing/curvedelems.hpp +274 -0
  151. netgen/include/meshing/delaunay2d.hpp +73 -0
  152. netgen/include/meshing/fieldlines.hpp +103 -0
  153. netgen/include/meshing/findip.hpp +198 -0
  154. netgen/include/meshing/findip2.hpp +103 -0
  155. netgen/include/meshing/geomsearch.hpp +69 -0
  156. netgen/include/meshing/global.hpp +54 -0
  157. netgen/include/meshing/hpref_hex.hpp +330 -0
  158. netgen/include/meshing/hpref_prism.hpp +3405 -0
  159. netgen/include/meshing/hpref_pyramid.hpp +154 -0
  160. netgen/include/meshing/hpref_quad.hpp +2082 -0
  161. netgen/include/meshing/hpref_segm.hpp +122 -0
  162. netgen/include/meshing/hpref_tet.hpp +4230 -0
  163. netgen/include/meshing/hpref_trig.hpp +848 -0
  164. netgen/include/meshing/hprefinement.hpp +366 -0
  165. netgen/include/meshing/improve2.hpp +178 -0
  166. netgen/include/meshing/improve3.hpp +151 -0
  167. netgen/include/meshing/localh.hpp +223 -0
  168. netgen/include/meshing/meshclass.hpp +1076 -0
  169. netgen/include/meshing/meshfunc.hpp +47 -0
  170. netgen/include/meshing/meshing.hpp +63 -0
  171. netgen/include/meshing/meshing2.hpp +163 -0
  172. netgen/include/meshing/meshing3.hpp +123 -0
  173. netgen/include/meshing/meshtool.hpp +90 -0
  174. netgen/include/meshing/meshtype.hpp +1930 -0
  175. netgen/include/meshing/msghandler.hpp +62 -0
  176. netgen/include/meshing/paralleltop.hpp +172 -0
  177. netgen/include/meshing/python_mesh.hpp +206 -0
  178. netgen/include/meshing/ruler2.hpp +172 -0
  179. netgen/include/meshing/ruler3.hpp +211 -0
  180. netgen/include/meshing/soldata.hpp +141 -0
  181. netgen/include/meshing/specials.hpp +17 -0
  182. netgen/include/meshing/surfacegeom.hpp +73 -0
  183. netgen/include/meshing/topology.hpp +1003 -0
  184. netgen/include/meshing/validate.hpp +21 -0
  185. netgen/include/meshing/visual_interface.hpp +71 -0
  186. netgen/include/mydefs.hpp +70 -0
  187. netgen/include/nginterface.h +474 -0
  188. netgen/include/nginterface_v2.hpp +406 -0
  189. netgen/include/nglib.h +697 -0
  190. netgen/include/nglib_occ.h +50 -0
  191. netgen/include/occ/occ_edge.hpp +47 -0
  192. netgen/include/occ/occ_face.hpp +52 -0
  193. netgen/include/occ/occ_solid.hpp +23 -0
  194. netgen/include/occ/occ_utils.hpp +376 -0
  195. netgen/include/occ/occ_vertex.hpp +30 -0
  196. netgen/include/occ/occgeom.hpp +659 -0
  197. netgen/include/occ/occmeshsurf.hpp +168 -0
  198. netgen/include/occ/vsocc.hpp +33 -0
  199. netgen/include/pybind11/LICENSE +29 -0
  200. netgen/include/pybind11/attr.h +722 -0
  201. netgen/include/pybind11/buffer_info.h +208 -0
  202. netgen/include/pybind11/cast.h +2361 -0
  203. netgen/include/pybind11/chrono.h +228 -0
  204. netgen/include/pybind11/common.h +2 -0
  205. netgen/include/pybind11/complex.h +74 -0
  206. netgen/include/pybind11/conduit/README.txt +15 -0
  207. netgen/include/pybind11/conduit/pybind11_conduit_v1.h +116 -0
  208. netgen/include/pybind11/conduit/pybind11_platform_abi_id.h +87 -0
  209. netgen/include/pybind11/conduit/wrap_include_python_h.h +72 -0
  210. netgen/include/pybind11/critical_section.h +56 -0
  211. netgen/include/pybind11/detail/class.h +823 -0
  212. netgen/include/pybind11/detail/common.h +1348 -0
  213. netgen/include/pybind11/detail/cpp_conduit.h +75 -0
  214. netgen/include/pybind11/detail/descr.h +226 -0
  215. netgen/include/pybind11/detail/dynamic_raw_ptr_cast_if_possible.h +39 -0
  216. netgen/include/pybind11/detail/exception_translation.h +71 -0
  217. netgen/include/pybind11/detail/function_record_pyobject.h +191 -0
  218. netgen/include/pybind11/detail/init.h +538 -0
  219. netgen/include/pybind11/detail/internals.h +799 -0
  220. netgen/include/pybind11/detail/native_enum_data.h +209 -0
  221. netgen/include/pybind11/detail/pybind11_namespace_macros.h +82 -0
  222. netgen/include/pybind11/detail/struct_smart_holder.h +378 -0
  223. netgen/include/pybind11/detail/type_caster_base.h +1591 -0
  224. netgen/include/pybind11/detail/typeid.h +65 -0
  225. netgen/include/pybind11/detail/using_smart_holder.h +22 -0
  226. netgen/include/pybind11/detail/value_and_holder.h +90 -0
  227. netgen/include/pybind11/eigen/common.h +9 -0
  228. netgen/include/pybind11/eigen/matrix.h +723 -0
  229. netgen/include/pybind11/eigen/tensor.h +521 -0
  230. netgen/include/pybind11/eigen.h +12 -0
  231. netgen/include/pybind11/embed.h +320 -0
  232. netgen/include/pybind11/eval.h +161 -0
  233. netgen/include/pybind11/functional.h +147 -0
  234. netgen/include/pybind11/gil.h +199 -0
  235. netgen/include/pybind11/gil_safe_call_once.h +102 -0
  236. netgen/include/pybind11/gil_simple.h +37 -0
  237. netgen/include/pybind11/iostream.h +265 -0
  238. netgen/include/pybind11/native_enum.h +67 -0
  239. netgen/include/pybind11/numpy.h +2312 -0
  240. netgen/include/pybind11/operators.h +202 -0
  241. netgen/include/pybind11/options.h +92 -0
  242. netgen/include/pybind11/pybind11.h +3645 -0
  243. netgen/include/pybind11/pytypes.h +2680 -0
  244. netgen/include/pybind11/stl/filesystem.h +114 -0
  245. netgen/include/pybind11/stl.h +666 -0
  246. netgen/include/pybind11/stl_bind.h +858 -0
  247. netgen/include/pybind11/subinterpreter.h +299 -0
  248. netgen/include/pybind11/trampoline_self_life_support.h +65 -0
  249. netgen/include/pybind11/type_caster_pyobject_ptr.h +61 -0
  250. netgen/include/pybind11/typing.h +298 -0
  251. netgen/include/pybind11/warnings.h +75 -0
  252. netgen/include/stlgeom/meshstlsurface.hpp +67 -0
  253. netgen/include/stlgeom/stlgeom.hpp +491 -0
  254. netgen/include/stlgeom/stlline.hpp +193 -0
  255. netgen/include/stlgeom/stltool.hpp +331 -0
  256. netgen/include/stlgeom/stltopology.hpp +419 -0
  257. netgen/include/stlgeom/vsstl.hpp +58 -0
  258. netgen/include/visualization/meshdoc.hpp +42 -0
  259. netgen/include/visualization/mvdraw.hpp +325 -0
  260. netgen/include/visualization/vispar.hpp +128 -0
  261. netgen/include/visualization/visual.hpp +28 -0
  262. netgen/include/visualization/visual_api.hpp +10 -0
  263. netgen/include/visualization/vssolution.hpp +399 -0
  264. netgen/lib/libnggui.lib +0 -0
  265. netgen/lib/ngcore.lib +0 -0
  266. netgen/lib/nglib.lib +0 -0
  267. netgen/lib/togl.lib +0 -0
  268. netgen/libnggui.dll +0 -0
  269. netgen/libngguipy.lib +0 -0
  270. netgen/libngguipy.pyd +0 -0
  271. netgen/libngpy/_NgOCC.pyi +1545 -0
  272. netgen/libngpy/__init__.pyi +7 -0
  273. netgen/libngpy/_csg.pyi +259 -0
  274. netgen/libngpy/_geom2d.pyi +323 -0
  275. netgen/libngpy/_meshing.pyi +1111 -0
  276. netgen/libngpy/_stl.pyi +131 -0
  277. netgen/libngpy.lib +0 -0
  278. netgen/libngpy.pyd +0 -0
  279. netgen/meshing.py +65 -0
  280. netgen/ngcore.dll +0 -0
  281. netgen/nglib.dll +0 -0
  282. netgen/occ.py +52 -0
  283. netgen/read_gmsh.py +259 -0
  284. netgen/read_meshio.py +22 -0
  285. netgen/stl.py +2 -0
  286. netgen/togl.dll +0 -0
  287. netgen/version.py +2 -0
  288. netgen/webgui.py +529 -0
  289. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/boundarycondition.geo +16 -0
  290. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/boxcyl.geo +32 -0
  291. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/circle_on_cube.geo +27 -0
  292. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cone.geo +13 -0
  293. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cube.geo +16 -0
  294. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cubeandring.geo +55 -0
  295. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cubeandspheres.geo +21 -0
  296. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cubemcyl.geo +18 -0
  297. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cubemsphere.geo +19 -0
  298. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cylinder.geo +12 -0
  299. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/cylsphere.geo +12 -0
  300. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/doc/ng4.pdf +0 -0
  301. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/ellipsoid.geo +8 -0
  302. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/ellipticcyl.geo +10 -0
  303. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/extrusion.geo +99 -0
  304. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/fichera.geo +24 -0
  305. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/frame.step +11683 -0
  306. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/hinge.stl +8486 -0
  307. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/lshape3d.geo +26 -0
  308. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/manyholes.geo +26 -0
  309. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/manyholes2.geo +26 -0
  310. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/matrix.geo +27 -0
  311. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/ortho.geo +11 -0
  312. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/part1.stl +2662 -0
  313. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/period.geo +33 -0
  314. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/py_tutorials/exportNeutral.py +26 -0
  315. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/py_tutorials/mesh.py +19 -0
  316. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/py_tutorials/shaft.geo +65 -0
  317. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/revolution.geo +18 -0
  318. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/screw.step +1694 -0
  319. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/sculpture.geo +13 -0
  320. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/shaft.geo +65 -0
  321. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/shell.geo +10 -0
  322. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/sphere.geo +8 -0
  323. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/sphereincube.geo +17 -0
  324. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/square.in2d +35 -0
  325. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/squarecircle.in2d +48 -0
  326. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/squarehole.in2d +47 -0
  327. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/torus.geo +8 -0
  328. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/trafo.geo +57 -0
  329. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/twobricks.geo +15 -0
  330. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/twocubes.geo +18 -0
  331. netgen_mesher-6.2.2506.post35.dev0.data/data/share/netgen/twocyl.geo +16 -0
  332. netgen_mesher-6.2.2506.post35.dev0.dist-info/METADATA +15 -0
  333. netgen_mesher-6.2.2506.post35.dev0.dist-info/RECORD +340 -0
  334. netgen_mesher-6.2.2506.post35.dev0.dist-info/WHEEL +5 -0
  335. netgen_mesher-6.2.2506.post35.dev0.dist-info/entry_points.txt +2 -0
  336. netgen_mesher-6.2.2506.post35.dev0.dist-info/licenses/AUTHORS +1 -0
  337. netgen_mesher-6.2.2506.post35.dev0.dist-info/licenses/LICENSE +504 -0
  338. netgen_mesher-6.2.2506.post35.dev0.dist-info/top_level.txt +2 -0
  339. pyngcore/__init__.py +1 -0
  340. pyngcore/pyngcore.cp314-win_amd64.pyd +0 -0
@@ -0,0 +1,394 @@
1
+ #ifndef NETGEN_CORE_SIMD_AVX_HPP
2
+ #define NETGEN_CORE_SIMD_AVX_HPP
3
+
4
+ /**************************************************************************/
5
+ /* File: simd_avx.hpp */
6
+ /* Author: Joachim Schoeberl, Matthias Hochsteger */
7
+ /* Date: 25. Mar. 16 */
8
+ /**************************************************************************/
9
+
10
+ #include <immintrin.h>
11
+
12
+ namespace ngcore
13
+ {
14
+
15
#if defined(__GNUC__) && (__GNUC__ == 7)
  // GCC 7 ships without the _mm256_set_m128i intrinsic, see
  // https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values
  // Emulate it: start from v1 in the low 128 bits, then insert v0 as the high half.
  NETGEN_INLINE auto _mm256_set_m128i(__m128i v0, __m128i v1) {
    auto low_half = _mm256_castsi128_si256(v1);
    return _mm256_insertf128_si256(low_half, v0, 1);
  }
#endif // defined(__GNUC__) && (__GNUC__ == 7)
22
+
23
+ #if defined(__AVX2__)
24
+ NETGEN_INLINE __m256i my_mm256_cmpeq_epi64 (__m256i a, __m256i b)
25
+ {
26
+ return _mm256_cmpeq_epi64 (a,b);
27
+ }
28
+
29
+ NETGEN_INLINE __m256i my_mm256_cmpgt_epi64 (__m256i a, __m256i b)
30
+ {
31
+ return _mm256_cmpgt_epi64 (a,b);
32
+ }
33
+
34
+ NETGEN_INLINE __m256i my_mm256_cvtepi32_epi64 (__m128i a)
35
+ {
36
+ return _mm256_cvtepi32_epi64 (a);
37
+ }
38
+
39
+ #else
40
+ NETGEN_INLINE __m256i my_mm256_cmpeq_epi64 (__m256i a, __m256i b)
41
+ {
42
+ __m128i rlo = _mm_cmpeq_epi64(_mm256_extractf128_si256(a, 0),
43
+ _mm256_extractf128_si256(b, 0));
44
+ __m128i rhi = _mm_cmpeq_epi64(_mm256_extractf128_si256(a, 1),
45
+ _mm256_extractf128_si256(b, 1));
46
+ return _mm256_insertf128_si256 (_mm256_castsi128_si256(rlo), rhi, 1);
47
+ }
48
+
49
+ NETGEN_INLINE __m256i my_mm256_cmpgt_epi64 (__m256i a, __m256i b)
50
+ {
51
+ __m128i rlo = _mm_cmpgt_epi64(_mm256_extractf128_si256(a, 0),
52
+ _mm256_extractf128_si256(b, 0));
53
+ __m128i rhi = _mm_cmpgt_epi64(_mm256_extractf128_si256(a, 1),
54
+ _mm256_extractf128_si256(b, 1));
55
+ return _mm256_insertf128_si256 (_mm256_castsi128_si256(rlo), rhi, 1);
56
+ }
57
+
58
+ NETGEN_INLINE __m256i my_mm256_cvtepi32_epi64 (__m128i a)
59
+ {
60
+ __m128i rlo = _mm_cvtepi32_epi64(a); // First two 32-bit integers
61
+ __m128i rhi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 2, 3, 2))); // Next two 32-bit integers
62
+ return _mm256_insertf128_si256 (_mm256_castsi128_si256(rlo), rhi, 1);
63
+ }
64
+ #endif
65
+
66
+
67
  // Mask for SIMD<*,4>: one 64-bit all-ones/all-zeros field per lane.
  template <>
  class SIMD<mask64,4>
  {
    __m256i mask;
  public:
    // "first i lanes" mask: lane k is active iff k < i
    // (_mm256_set_epi64x lists lanes high-to-low, so lane k holds k)
    SIMD (int64_t i)
      : mask(my_mm256_cmpgt_epi64(_mm256_set1_epi64x(i),
                                  _mm256_set_epi64x(3, 2, 1, 0)))
    { ; }
    SIMD (__m256i _mask) : mask(_mask) { ; }
    // reinterpret a float-domain comparison result as an integer mask
    SIMD (__m256d _mask) : mask(_mm256_castpd_si256(_mask)) { ; }
    __m256i Data() const { return mask; }
    static constexpr int Size() { return 4; }
    // mask from a bit pattern: lane k is active iff bit k of i is set
    static SIMD<mask64, 4> GetMaskFromBits (unsigned int i);
  };
82
+
83
  // Lookup table: entry i has lane k all-ones iff bit k of i is set
  // (_mm256_set_epi64x arguments run from lane 3 down to lane 0).
  static SIMD<mask64, 4> masks_from_4bits[16] = {
    _mm256_set_epi64x (0,0,0,0), _mm256_set_epi64x (0,0,0,-1),
    _mm256_set_epi64x (0,0,-1,0), _mm256_set_epi64x (0,0,-1,-1),
    _mm256_set_epi64x (0,-1,0,0), _mm256_set_epi64x (0,-1,0,-1),
    _mm256_set_epi64x (0,-1,-1,0), _mm256_set_epi64x (0,-1,-1,-1),
    _mm256_set_epi64x (-1,0,0,0), _mm256_set_epi64x (-1,0,0,-1),
    _mm256_set_epi64x (-1,0,-1,0), _mm256_set_epi64x (-1,0,-1,-1),
    _mm256_set_epi64x (-1,-1,0,0), _mm256_set_epi64x (-1,-1,0,-1),
    _mm256_set_epi64x (-1,-1,-1,0), _mm256_set_epi64x (-1,-1,-1,-1)
  };

  // Translate the low 4 bits of i into a per-lane mask via the table above;
  // higher bits of i are ignored.
  NETGEN_INLINE SIMD<mask64, 4> SIMD<mask64, 4> :: GetMaskFromBits (unsigned int i)
  {
    return masks_from_4bits[i & 15];
  }
98
+
99
+ template<>
100
+ class alignas(32) SIMD<int64_t,4>
101
+ {
102
+ __m256i data;
103
+
104
+ public:
105
+ static constexpr int Size() { return 4; }
106
+ SIMD () {}
107
+ SIMD (const SIMD &) = default;
108
+ SIMD & operator= (const SIMD &) = default;
109
+
110
+ SIMD (int64_t val) { data = _mm256_set1_epi64x(val); }
111
+ SIMD (int64_t v0, int64_t v1, int64_t v2, int64_t v3) { data = _mm256_set_epi64x(v3,v2,v1,v0); }
112
+ SIMD (std::array<int64_t,4> a)
113
+ : data{_mm256_set_epi64x(a[3],a[2],a[1],a[0])}
114
+ {}
115
+ SIMD (SIMD<int64_t,2> v0, SIMD<int64_t,2> v1)
116
+ : data(_mm256_set_m128i(v1.Data(),v0.Data()))
117
+ {}
118
+ SIMD (__m256i _data) { data = _data; }
119
+
120
+ NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; }
121
+ NETGEN_INLINE __m256i Data() const { return data; }
122
+ NETGEN_INLINE __m256i & Data() { return data; }
123
+
124
+ SIMD<int64_t,2> Lo() const { return _mm256_extractf128_si256(data, 0); }
125
+ SIMD<int64_t,2> Hi() const { return _mm256_extractf128_si256(data, 1); }
126
+ static SIMD FirstInt(int n0=0) { return { n0+0, n0+1, n0+2, n0+3 }; }
127
+
128
+ template <int I>
129
+ double Get() const
130
+ {
131
+ static_assert(I>=0 && I<4, "Index out of range");
132
+ return (*this)[I];
133
+ }
134
+ };
135
+
136
+
137
+ NETGEN_INLINE SIMD<int64_t,4> operator-(SIMD<int64_t,4> a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a.Data()); }
138
+
139
+ #ifdef __AVX2__
140
+ NETGEN_INLINE SIMD<int64_t,4> operator+ (SIMD<int64_t,4> a, SIMD<int64_t,4> b) { return _mm256_add_epi64(a.Data(),b.Data()); }
141
+ NETGEN_INLINE SIMD<int64_t,4> operator- (SIMD<int64_t,4> a, SIMD<int64_t,4> b) { return _mm256_sub_epi64(a.Data(),b.Data()); }
142
+ NETGEN_INLINE SIMD<int64_t,4> operator& (SIMD<int64_t,4> a, SIMD<int64_t,4> b)
143
+ { return _mm256_castpd_si256(_mm256_and_pd (_mm256_castsi256_pd(a.Data()),_mm256_castsi256_pd( b.Data()))); }
144
+
145
+ template <int N>
146
+ SIMD<int64_t,4> operator<< (SIMD<int64_t,4> a, IC<N> n) { return _mm256_sll_epi64(a.Data(),_mm_set_epi32(0,0,0,N)); }
147
+ #endif // __AVX2__
148
+
149
  // SIMD vector of four doubles in one AVX register (__m256d).
  template<>
  class alignas(32) SIMD<double,4>
  {
    __m256d data;

  public:
    static constexpr int Size() { return 4; }
    SIMD () {}  // intentionally uninitialized
    SIMD (const SIMD &) = default;
    SIMD & operator= (const SIMD &) = default;

    // broadcast a scalar to all four lanes
    SIMD (double val) { data = _mm256_set1_pd(val); }
    SIMD (int val) { data = _mm256_set1_pd(val); }
    SIMD (size_t val) { data = _mm256_set1_pd(val); }
    // note: _mm256_set_pd lists lanes from most- to least-significant,
    // hence the reversed argument order
    SIMD (double v0, double v1, double v2, double v3) { data = _mm256_set_pd(v3,v2,v1,v0); }
    // concatenate two 2-lane vectors (v0 -> lanes 0,1; v1 -> lanes 2,3)
    SIMD (SIMD<double,2> v0, SIMD<double,2> v1) : SIMD(v0[0], v0[1], v1[0], v1[1]) { ; }
    // unaligned load of four doubles from p
    SIMD (double const * p) { data = _mm256_loadu_pd(p); }
    // masked load: lanes whose mask is clear are zeroed, memory not touched
    SIMD (double const * p, SIMD<mask64,4> mask) { data = _mm256_maskload_pd(p, mask.Data()); }
    SIMD (__m256d _data) { data = _data; }
    SIMD (std::array<double,4> a)
      : data{_mm256_set_pd(a[3],a[2],a[1],a[0])}
    {}

    // unaligned store
    void Store (double * p) { _mm256_storeu_pd(p, data); }
    // masked store: lanes whose mask is clear are left untouched in memory
    void Store (double * p, SIMD<mask64,4> mask) { _mm256_maskstore_pd(p, mask.Data(), data); }

    // generator constructor: lane i is initialized with func(i)
    template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
    SIMD (const T & func)
    {
      data = _mm256_set_pd(func(3), func(2), func(1), func(0));
    }

    NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
    NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
    // [[deprecated("don't write to individual elements of SIMD")]]
    // NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
    NETGEN_INLINE __m256d Data() const { return data; }
    NETGEN_INLINE __m256d & Data() { return data; }

    // lower / upper 128-bit halves
    SIMD<double,2> Lo() const { return _mm256_extractf128_pd(data, 0); }
    SIMD<double,2> Hi() const { return _mm256_extractf128_pd(data, 1); }

    // allows  std::tie(a, b, c, d) = simd;  style unpacking
    operator std::tuple<double&,double&,double&,double&> ()
    { return std::tuple<double&,double&,double&,double&>((*this)[0], (*this)[1], (*this)[2], (*this)[3]); }

    // compile-time indexed element access
    template <int I>
    double Get() const
    {
      static_assert(I>=0 && I<4, "Index out of range");
      return (*this)[I];
    }
  };
201
+
202
+ NETGEN_INLINE auto Unpack (SIMD<double,4> a, SIMD<double,4> b)
203
+ {
204
+ return std::make_tuple(SIMD<double,4>(_mm256_unpacklo_pd(a.Data(),b.Data())),
205
+ SIMD<double,4>(_mm256_unpackhi_pd(a.Data(),b.Data())));
206
+ }
207
+
208
+ NETGEN_INLINE SIMD<double,4> operator- (SIMD<double,4> a) { return _mm256_xor_pd(a.Data(), _mm256_set1_pd(-0.0)); }
209
+ NETGEN_INLINE SIMD<double,4> operator+ (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_add_pd(a.Data(),b.Data()); }
210
+ NETGEN_INLINE SIMD<double,4> operator- (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_sub_pd(a.Data(),b.Data()); }
211
+ NETGEN_INLINE SIMD<double,4> operator* (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_mul_pd(a.Data(),b.Data()); }
212
+ NETGEN_INLINE SIMD<double,4> operator/ (SIMD<double,4> a, SIMD<double,4> b) { return _mm256_div_pd(a.Data(),b.Data()); }
213
+ NETGEN_INLINE SIMD<double,4> operator* (double a, SIMD<double,4> b) { return _mm256_set1_pd(a)*b.Data(); }
214
+ NETGEN_INLINE SIMD<double,4> operator* (SIMD<double,4> b, double a) { return _mm256_set1_pd(a)*b.Data(); }
215
+
216
  NETGEN_INLINE SIMD<double,4> sqrt (SIMD<double,4> a) { return _mm256_sqrt_pd(a.Data()); }
  NETGEN_INLINE SIMD<double,4> floor (SIMD<double,4> a) { return _mm256_floor_pd(a.Data()); }
  NETGEN_INLINE SIMD<double,4> ceil (SIMD<double,4> a) { return _mm256_ceil_pd(a.Data()); }
  // |a| computed as max(a, -a), avoiding a separate sign-mask constant
  NETGEN_INLINE SIMD<double,4> fabs (SIMD<double,4> a) { return _mm256_max_pd(a.Data(), (-a).Data()); }
  // round to nearest, without raising precision exceptions
  NETGEN_INLINE SIMD<double,4> round(SIMD<double,4> a) { return _mm256_round_pd(a.Data(), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); }
  // round to nearest integer, widened to int64 lanes.
  // NOTE(review): the conversion goes through _mm256_cvtpd_epi32, so results
  // are only valid for values representable in int32 range — confirm callers
  // never exceed it.
  NETGEN_INLINE SIMD<int64_t,4> lround (SIMD<double,4> a)
  {
    return my_mm256_cvtepi32_epi64(_mm256_cvtpd_epi32(_mm256_round_pd(a.Data(), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)));
  }
225
+
226
#ifdef __FMA__
  // fused multiply-add: a*b + c, computed with a single rounding
  NETGEN_INLINE SIMD<double,4> FMA (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
  {
    return _mm256_fmadd_pd (a.Data(), b.Data(), c.Data());
  }
  // scalar-first overload: broadcast a, then fma
  NETGEN_INLINE SIMD<double,4> FMA (const double & a, SIMD<double,4> b, SIMD<double,4> c)
  {
    return _mm256_fmadd_pd (_mm256_set1_pd(a), b.Data(), c.Data());
  }
  // fused negated multiply-add: -(a*b) + c
  NETGEN_INLINE SIMD<double,4> FNMA (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
  {
    return _mm256_fnmadd_pd (a.Data(), b.Data(), c.Data());
  }
  NETGEN_INLINE SIMD<double,4> FNMA (const double & a, SIMD<double,4> b, SIMD<double,4> c)
  {
    return _mm256_fnmadd_pd (_mm256_set1_pd(a), b.Data(), c.Data());
  }
#endif
244
+
245
#if defined(__FMA__) && !defined(__AVX512F__)
  // make sure to use the update-version of fma
  // important in matrix kernels using 12 sum-registers, 3 a-values and updated b-value
  // avx512 has enough registers, and gcc seems to use only the first 16 z-regs

  // sum += a*b, forced via inline asm to the register-updating
  // vfmadd231pd form so the accumulator stays in its register
  NETGEN_INLINE void FMAasm (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> & sum)
  {
    asm ("vfmadd231pd %[a], %[b], %[sum]"
         : [sum] "+x" (sum.Data())
         : [a] "x" (a.Data()), [b] "x" (b.Data())
         );
  }

  // sum -= a*b (vfnmadd231pd); same register-pressure rationale as FMAasm
  NETGEN_INLINE void FNMAasm (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> & sum)
  {
    asm ("vfnmadd231pd %[a], %[b], %[sum]"
         : [sum] "+x" (sum.Data())
         : [a] "x" (a.Data()), [b] "x" (b.Data())
         );
  }
#endif
265
+
266
#if defined(__FMA__)
  // lane-wise fused multiply-add/subtract: even lanes get a*b - c,
  // odd lanes get a*b + c (the _mm256_fmaddsub_pd pattern)
  NETGEN_INLINE SIMD<double,4> FMAddSub (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
  {
    return _mm256_fmaddsub_pd(a.Data(), b.Data(), c.Data());
  }
#endif
272
+
273
  // swap the two doubles inside each 128-bit half:
  // {a0,a1,a2,a3} -> {a1,a0,a3,a2}
  NETGEN_INLINE SIMD<double,4> SwapPairs (SIMD<double,4> a)
  {
    return _mm256_shuffle_pd (a.Data(), a.Data(), 0b0101);
  }
277
+
278
+
279
  // Lane-wise comparisons, producing an all-ones/all-zeros mask per lane.
  // All predicates are "ordered, quiet" (_OQ): a lane containing NaN
  // compares false. NOTE(review): this also applies to operator!=, which
  // differs from scalar != semantics with NaN — confirm this is intended.
  NETGEN_INLINE SIMD<mask64,4> operator<= (SIMD<double,4> a , SIMD<double,4> b)
  { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_LE_OQ); }
  NETGEN_INLINE SIMD<mask64,4> operator< (SIMD<double,4> a , SIMD<double,4> b)
  { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_LT_OQ); }
  NETGEN_INLINE SIMD<mask64,4> operator>= (SIMD<double,4> a , SIMD<double,4> b)
  { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_GE_OQ); }
  NETGEN_INLINE SIMD<mask64,4> operator> (SIMD<double,4> a , SIMD<double,4> b)
  { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_GT_OQ); }
  NETGEN_INLINE SIMD<mask64,4> operator== (SIMD<double,4> a , SIMD<double,4> b)
  { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_EQ_OQ); }
  NETGEN_INLINE SIMD<mask64,4> operator!= (SIMD<double,4> a , SIMD<double,4> b)
  { return _mm256_cmp_pd (a.Data(), b.Data(), _CMP_NEQ_OQ); }
291
+
292
  // Lane-wise comparisons of packed int64.  AVX has no 64-bit integer
  // compare, so <=, >=, != are built as NOT(opposite): xor with all-ones
  // (_mm256_set1_epi32(-1)) inverts every bit of the compare mask.
  // NOTE(review): <= and >= call _mm256_cmpgt_epi64 / _mm256_xor_si256
  // directly (AVX2 instructions), while <, >, == go through the
  // my_mm256_* wrappers defined elsewhere — confirm this header is only
  // compiled with AVX2 available, or route all six through the wrappers.
  NETGEN_INLINE SIMD<mask64,4> operator<= (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
  { return _mm256_xor_si256(_mm256_cmpgt_epi64(a.Data(),b.Data()),_mm256_set1_epi32(-1)); }
  NETGEN_INLINE SIMD<mask64,4> operator< (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
  { return my_mm256_cmpgt_epi64(b.Data(),a.Data()); }
  NETGEN_INLINE SIMD<mask64,4> operator>= (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
  { return _mm256_xor_si256(_mm256_cmpgt_epi64(b.Data(),a.Data()),_mm256_set1_epi32(-1)); }
  NETGEN_INLINE SIMD<mask64,4> operator> (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
  { return my_mm256_cmpgt_epi64(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<mask64,4> operator== (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
  { return my_mm256_cmpeq_epi64(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<mask64,4> operator!= (SIMD<int64_t,4> a , SIMD<int64_t,4> b)
  { return _mm256_xor_si256(my_mm256_cmpeq_epi64(a.Data(),b.Data()),_mm256_set1_epi32(-1)); }
304
+
305
  // Bitwise logical combination of lane masks.  With AVX2 the integer
  // and/or/xor forms are used; NOT is implemented as xor with all-ones
  // (cmpeq of a register with itself sets every bit).
#ifdef __AVX2__
  NETGEN_INLINE SIMD<mask64,4> operator&& (SIMD<mask64,4> a, SIMD<mask64,4> b)
  { return _mm256_and_si256 (a.Data(), b.Data()); }
  NETGEN_INLINE SIMD<mask64,4> operator|| (SIMD<mask64,4> a, SIMD<mask64,4> b)
  { return _mm256_or_si256 (a.Data(), b.Data()); }
  NETGEN_INLINE SIMD<mask64,4> operator! (SIMD<mask64,4> a)
  { return _mm256_xor_si256 (a.Data(), _mm256_cmpeq_epi64(a.Data(),a.Data())); }
#else //AVX2 is a superset of AVX. Without it, it is necessary to reinterpret the types
  // plain AVX only has floating-point bitwise ops on 256-bit registers,
  // so the masks are bit-cast to __m256d, combined, and cast back
  NETGEN_INLINE SIMD<mask64,4> operator&& (SIMD<mask64,4> a, SIMD<mask64,4> b)
  { return _mm256_castpd_si256(_mm256_and_pd (_mm256_castsi256_pd(a.Data()),_mm256_castsi256_pd( b.Data()))); }
  NETGEN_INLINE SIMD<mask64,4> operator|| (SIMD<mask64,4> a, SIMD<mask64,4> b)
  { return _mm256_castpd_si256(_mm256_or_pd (_mm256_castsi256_pd(a.Data()), _mm256_castsi256_pd(b.Data()))); }
  NETGEN_INLINE SIMD<mask64,4> operator! (SIMD<mask64,4> a)
  { return _mm256_castpd_si256(_mm256_xor_pd (_mm256_castsi256_pd(a.Data()),_mm256_castsi256_pd( _mm256_cmpeq_epi64(a.Data(),a.Data())))); }
#endif
320
+
321
  // Bit-cast the 4 int64 lanes to 4 double lanes (no value conversion,
  // register contents are reinterpreted unchanged).
  template <>
  NETGEN_INLINE SIMD<double,4> Reinterpret (SIMD<int64_t,4> a)
  {
    return _mm256_castsi256_pd (a.Data());
  }
326
+
327
+
328
+
329
  // Lane-wise select: b where the mask lane is set, c otherwise
  // (blendv uses the sign bit of each mask lane as the selector).
  NETGEN_INLINE SIMD<double,4> If (SIMD<mask64,4> a, SIMD<double,4> b, SIMD<double,4> c)
  { return _mm256_blendv_pd(c.Data(), b.Data(), _mm256_castsi256_pd(a.Data())); }
331
+
332
  // Lane-wise: b where a > 0, c otherwise.
  NETGEN_INLINE SIMD<double,4> IfPos (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
  {
    // _CMP_GT_OS: ordered, signaling greater-than against zero
    auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_GT_OS);
    return _mm256_blendv_pd(c.Data(), b.Data(), cp);
  }
337
+
338
  // Lane-wise: b where a == 0, c otherwise.
  NETGEN_INLINE SIMD<double,4> IfZero (SIMD<double,4> a, SIMD<double,4> b, SIMD<double,4> c)
  {
    // _CMP_EQ_OS: ordered, signaling equality against zero
    auto cp = _mm256_cmp_pd (a.Data(), _mm256_setzero_pd(), _CMP_EQ_OS);
    return _mm256_blendv_pd(c.Data(), b.Data(), cp);
  }
343
+
344
+ NETGEN_INLINE double HSum (SIMD<double,4> sd)
345
+ {
346
+ // __m128d hv = _mm_add_pd (_mm256_extractf128_pd(sd.Data(),0), _mm256_extractf128_pd(sd.Data(),1));
347
+ __m128d hv = (sd.Lo()+sd.Hi()).Data();
348
+ return _mm_cvtsd_f64 (_mm_hadd_pd (hv, hv));
349
+ }
350
+
351
  // Horizontal sums of two vectors at once; returns
  // SIMD<double,2>( sum(sd1), sum(sd2) ).
  NETGEN_INLINE auto HSum (SIMD<double,4> sd1, SIMD<double,4> sd2)
  {
    // hadd interleaves: (sd1_0+sd1_1, sd2_0+sd2_1, sd1_2+sd1_3, sd2_2+sd2_3)
    __m256d hv = _mm256_hadd_pd(sd1.Data(), sd2.Data());
    // adding the two 128-bit halves completes both sums
    __m128d hv2 = _mm_add_pd (_mm256_extractf128_pd(hv,0), _mm256_extractf128_pd(hv,1));
    return SIMD<double,2>(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3)));
  }
357
+
358
  // Horizontal sums of four vectors at once; returns
  // SIMD<double,4>( sum(v1), sum(v2), sum(v3), sum(v4) ).
  NETGEN_INLINE auto HSum (SIMD<double,4> v1, SIMD<double,4> v2, SIMD<double,4> v3, SIMD<double,4> v4)
  {
    // partial pair sums, interleaved per 128-bit lane
    __m256d hsum1 = _mm256_hadd_pd (v1.Data(), v2.Data());
    __m256d hsum2 = _mm256_hadd_pd (v3.Data(), v4.Data());
    // permute2f128(...,1+2*16) pairs hsum1's high lane with hsum2's low lane;
    // blend(...,12) keeps hsum1's low lane and hsum2's high lane — adding the
    // two puts the completed sums into lanes (v1,v2,v3,v4) order
    SIMD<double,4> hsum = _mm256_add_pd (_mm256_permute2f128_pd (hsum1, hsum2, 1+2*16),
                                         _mm256_blend_pd (hsum1, hsum2, 12));
    return hsum;
    // return make_tuple(hsum[0], hsum[1], hsum[2], hsum[3]);
  }
367
+
368
+
369
+ /*
370
+ // untested ...
371
+ NETGEN_INLINE SIMD<double,4> rsqrt (SIMD<double,4> x)
372
+ {
373
+ // return 1.0 / sqrt(x);
374
+ // SIMD<double,4> y = _mm256_rsqrt14_pd(x.Data()); // only avx512
375
+ SIMD<double,4> y = _mm256_cvtps_pd ( _mm_rsqrt_ps ( _mm256_cvtpd_ps (x.Data())));
376
+ auto x_half = 0.5*x;
377
+ y = y * (1.5 - (x_half * y * y));
378
+ y = y * (1.5 - (x_half * y * y));
379
+ return y;
380
+ }
381
+ */
382
+
383
+
384
+
385
  // Lane-wise select for int64: b where the mask lane is set, c otherwise.
  // AVX has no integer blendv, so the lanes are bit-cast to double,
  // blended, and cast back (no value conversion takes place).
  NETGEN_INLINE SIMD<int64_t,4> If (SIMD<mask64,4> a, SIMD<int64_t,4> b, SIMD<int64_t,4> c)
  { return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(c.Data()), _mm256_castsi256_pd(b.Data()),
                                                _mm256_castsi256_pd(a.Data()))); }
388
+
389
+
390
+
391
+ }
392
+
393
+ #endif // NETGEN_CORE_SIMD_AVX_HPP
394
+
@@ -0,0 +1,285 @@
1
+ #ifndef NETGEN_CORE_SIMD_AVX512_HPP
2
+ #define NETGEN_CORE_SIMD_AVX512_HPP
3
+
4
+ /**************************************************************************/
5
+ /* File: simd_avx512.hpp */
6
+ /* Author: Joachim Schoeberl, Matthias Hochsteger */
7
+ /* Date: 25. Mar. 16 */
8
+ /**************************************************************************/
9
+
10
+ #include <immintrin.h>
11
+
12
+ namespace ngcore
13
+ {
14
+
15
+ template <>
16
+ class SIMD<mask64,8>
17
+ {
18
+ __mmask8 mask;
19
+ public:
20
+ SIMD (size_t i)
21
+ : mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i),
22
+ _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0)))
23
+ { ; }
24
+ SIMD (int i)
25
+ : mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i),
26
+ _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0)))
27
+ { ; }
28
+ SIMD (int64_t i)
29
+ : mask(_mm512_cmpgt_epi64_mask(_mm512_set1_epi64(i),
30
+ _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0)))
31
+ { ; }
32
+ SIMD (__mmask8 _mask) : mask(_mask) { ; }
33
+ __mmask8 Data() const { return mask; }
34
+ static constexpr int Size() { return 8; }
35
+ static NETGEN_INLINE SIMD<mask64, 8> GetMaskFromBits (unsigned int i)
36
+ {
37
+ return SIMD<mask64, 8>(__mmask8(i));
38
+ }
39
+ };
40
+
41
  // 512-bit pack of 8 int64 lanes (AVX-512).
  template<>
  class alignas(64) SIMD<int64_t,8>
  {
    __m512i data;

  public:
    static constexpr int Size() { return 8; }
    SIMD () {}  // intentionally uninitialized, like a plain int64_t
    SIMD (const SIMD &) = default;
    SIMD & operator= (const SIMD &) = default;

    // broadcast / per-lane construction (set_epi64 takes args high..low)
    SIMD (int64_t val) { data = _mm512_set1_epi64(val); }
    SIMD (int64_t v0, int64_t v1, int64_t v2, int64_t v3, int64_t v4, int64_t v5, int64_t v6, int64_t v7) { data = _mm512_set_epi64(v7,v6,v5,v4,v3,v2,v1,v0); }
    SIMD (__m512i _data) { data = _data; }

    // generator constructor: lane i = func(i)
    template<typename T, typename std::enable_if<std::is_convertible<T, std::function<int64_t(int)>>::value, int>::type = 0>
    SIMD (const T & func)
    {
      data = _mm512_set_epi64(func(7), func(6), func(5), func(4), func(3), func(2), func(1), func(0));
    }

    // concatenate two 256-bit halves: v0 -> lanes 0..3, v1 -> lanes 4..7
    SIMD (SIMD<int64_t,4> v0, SIMD<int64_t,4> v1)
      : data(_mm512_castsi256_si512(v0.Data()))
    {
      data = _mm512_inserti64x4(data, v1.Data(), 1);
    }

    SIMD<int64_t,4> Lo() const { return _mm512_castsi512_si256(data); }
    SIMD<int64_t,4> Hi() const { return _mm512_extracti64x4_epi64(data, 1); }


    // lane access by reinterpreting the register through memory
    // NOTE(review): relies on pointer-casting the __m512i — works with the
    // intel intrinsic types in practice, but is not a guaranteed-aliasing path
    NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; }
    NETGEN_INLINE auto & operator[] (int i) { return ((int64_t*)(&data))[i]; }
    NETGEN_INLINE __m512i Data() const { return data; }
    NETGEN_INLINE __m512i & Data() { return data; }
    // the sequence (0,1,...,7), e.g. for index arithmetic
    static SIMD FirstInt() { return { 0, 1, 2, 3, 4, 5, 6, 7 }; }
  };
78
+
79
  // unary minus: 0 - a
  NETGEN_INLINE SIMD<int64_t,8> operator-(SIMD<int64_t,8> a) { return _mm512_sub_epi64(_mm512_setzero_si512(), a.Data()); }

  // lane-wise integer add / subtract
  NETGEN_INLINE SIMD<int64_t,8> operator+ (SIMD<int64_t,8> a, SIMD<int64_t,8> b) { return _mm512_add_epi64(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<int64_t,8> operator- (SIMD<int64_t,8> a, SIMD<int64_t,8> b) { return _mm512_sub_epi64(a.Data(),b.Data()); }

  // lane-wise select: b where the mask bit is set, c otherwise
  NETGEN_INLINE SIMD<int64_t,8> If (SIMD<mask64,8> a, SIMD<int64_t,8> b, SIMD<int64_t,8> c)
  { return _mm512_mask_blend_epi64(a.Data(), c.Data(), b.Data()); }
86
+
87
+
88
+ template<>
89
+ class alignas(64) SIMD<double,8>
90
+ {
91
+ __m512d data;
92
+ public:
93
+ static constexpr int Size() { return 8; }
94
+ SIMD () {}
95
+ SIMD (const SIMD &) = default;
96
+ SIMD & operator= (const SIMD &) = default;
97
+
98
+ SIMD (double val) { data = _mm512_set1_pd(val); }
99
+ SIMD (int val) { data = _mm512_set1_pd(val); }
100
+ SIMD (size_t val) { data = _mm512_set1_pd(val); }
101
+ SIMD (double const * p) { data = _mm512_loadu_pd(p); }
102
+ SIMD (double const * p, SIMD<mask64,8> mask)
103
+ { data = _mm512_mask_loadu_pd(_mm512_setzero_pd(), mask.Data(), p); }
104
+ SIMD (__m512d _data) { data = _data; }
105
+ SIMD (SIMD<double,4> v0, SIMD<double,4> v1)
106
+ : data(_mm512_set_pd(v1[3], v1[2], v1[1], v1[0], v0[3], v0[2], v0[1], v0[0]))
107
+ {}
108
+ SIMD (SIMD<double,6> v0, SIMD<double,2> v1)
109
+ : data(_mm512_set_pd(v1[1], v1[0], v0[5], v0[4], v0[3], v0[2], v0[1], v0[0]))
110
+ {}
111
+
112
+ template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
113
+ SIMD (const T & func)
114
+ {
115
+ data = _mm512_set_pd(func(7), func(6), func(5), func(4), func(3), func(2), func(1), func(0));
116
+ }
117
+
118
+ void Store (double * p) { _mm512_storeu_pd(p, data); }
119
+ void Store (double * p, SIMD<mask64,8> mask) { _mm512_mask_storeu_pd(p, mask.Data(), data); }
120
+
121
+ template <typename Function>
122
+ void SIMD_function (const Function & func, std::true_type)
123
+ {
124
+ data = (__m512d){ func(7), func(6), func(5), func(4),
125
+ func(3), func(2), func(1), func(0) };
126
+ }
127
+
128
+ // not a function
129
+ void SIMD_function (double const * p, std::false_type)
130
+ {
131
+ data = _mm512_loadu_pd(p);
132
+ }
133
+
134
+ void SIMD_function (double val, std::false_type)
135
+ {
136
+ data = _mm512_set1_pd(val);
137
+ }
138
+
139
+ void SIMD_function (__m512d _data, std::false_type)
140
+ {
141
+ data = _data;
142
+ }
143
+
144
+ NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
145
+ NETGEN_INLINE double & operator[] (int i) { return ((double*)(&data))[i]; }
146
+ NETGEN_INLINE __m512d Data() const { return data; }
147
+ NETGEN_INLINE __m512d & Data() { return data; }
148
+
149
+ SIMD<double,4> Lo() const { return _mm512_extractf64x4_pd(data, 0); }
150
+ SIMD<double,4> Hi() const { return _mm512_extractf64x4_pd(data, 1); }
151
+
152
+ template <int I>
153
+ double Get() const
154
+ {
155
+ static_assert(I>=0 && I<8, "Index out of range");
156
+ return (*this)[I];
157
+ }
158
+ };
159
+
160
+ NETGEN_INLINE SIMD<double,8> operator- (SIMD<double,8> a) { return _mm512_xor_pd(a.Data(), _mm512_set1_pd(-0.0)); } //{ return -a.Data(); }
161
+ NETGEN_INLINE SIMD<double,8> operator+ (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_add_pd(a.Data(),b.Data()); }
162
+ NETGEN_INLINE SIMD<double,8> operator- (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_sub_pd(a.Data(),b.Data()); }
163
+ NETGEN_INLINE SIMD<double,8> operator* (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_mul_pd(a.Data(),b.Data()); }
164
+ NETGEN_INLINE SIMD<double,8> operator/ (SIMD<double,8> a, SIMD<double,8> b) { return _mm512_div_pd(a.Data(),b.Data()); }
165
+ NETGEN_INLINE SIMD<double,8> operator* (double a, SIMD<double,8> b) { return _mm512_set1_pd(a)*b.Data(); }
166
+ NETGEN_INLINE SIMD<double,8> operator* (SIMD<double,8> b, double a) { return _mm512_set1_pd(a)*b.Data(); }
167
+
168
  // Lane-wise elementary functions.
  NETGEN_INLINE SIMD<double,8> sqrt (SIMD<double,8> a) { return _mm512_sqrt_pd(a.Data()); }
  NETGEN_INLINE SIMD<double,8> floor (SIMD<double,8> a) { return _mm512_floor_pd(a.Data()); }
  NETGEN_INLINE SIMD<double,8> ceil (SIMD<double,8> a) { return _mm512_ceil_pd(a.Data()); }
  // |a| via max(a, -a)
  NETGEN_INLINE SIMD<double,8> fabs (SIMD<double,8> a) { return _mm512_max_pd(a.Data(), ( - a).Data()); }
172
+
173
  // Lane-wise comparisons of packed doubles; the result is an opmask with
  // one bit per lane.  All predicates are "OQ" (ordered, quiet): any
  // comparison involving NaN yields false.
  NETGEN_INLINE SIMD<mask64,8> operator<= (SIMD<double,8> a , SIMD<double,8> b)
  { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_LE_OQ); }
  NETGEN_INLINE SIMD<mask64,8> operator< (SIMD<double,8> a , SIMD<double,8> b)
  { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_LT_OQ); }
  NETGEN_INLINE SIMD<mask64,8> operator>= (SIMD<double,8> a , SIMD<double,8> b)
  { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_GE_OQ); }
  NETGEN_INLINE SIMD<mask64,8> operator> (SIMD<double,8> a , SIMD<double,8> b)
  { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_GT_OQ); }
  NETGEN_INLINE SIMD<mask64,8> operator== (SIMD<double,8> a , SIMD<double,8> b)
  { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_EQ_OQ); }
  NETGEN_INLINE SIMD<mask64,8> operator!= (SIMD<double,8> a , SIMD<double,8> b)
  { return _mm512_cmp_pd_mask (a.Data(), b.Data(), _CMP_NEQ_OQ); }
185
+
186
  // Lane-wise signed comparisons of packed int64; AVX-512 provides all
  // predicates directly (NLT = "not less than" = >=, NLE = >).
  NETGEN_INLINE SIMD<mask64,8> operator<= (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
  { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_LE); }
  NETGEN_INLINE SIMD<mask64,8> operator< (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
  { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_LT); }
  NETGEN_INLINE SIMD<mask64,8> operator>= (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
  { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NLT); }
  NETGEN_INLINE SIMD<mask64,8> operator> (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
  { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NLE); }
  NETGEN_INLINE SIMD<mask64,8> operator== (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
  { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_EQ); }
  NETGEN_INLINE SIMD<mask64,8> operator!= (SIMD<int64_t,8> a , SIMD<int64_t,8> b)
  { return _mm512_cmp_epi64_mask (a.Data(), b.Data(), _MM_CMPINT_NE); }
198
+
199
  // Logical combination of opmasks: plain scalar bit operations on the
  // 8-bit mask value (one bit per lane).
  NETGEN_INLINE SIMD<mask64,8> operator&& (SIMD<mask64,8> a, SIMD<mask64,8> b)
  { return (__mmask8)(a.Data() & b.Data()); }
  NETGEN_INLINE SIMD<mask64,8> operator|| (SIMD<mask64,8> a, SIMD<mask64,8> b)
  { return (__mmask8)(a.Data() | b.Data()); }
  NETGEN_INLINE SIMD<mask64,8> operator! (SIMD<mask64,8> a)
  { return (__mmask8)(~a.Data()); }
205
+
206
  // Lane-wise select: b where the mask bit is set, c otherwise.
  NETGEN_INLINE SIMD<double,8> If (SIMD<mask64,8> a, SIMD<double,8> b, SIMD<double,8> c)
  { return _mm512_mask_blend_pd(a.Data(), c.Data(), b.Data()); }
208
+
209
+ NETGEN_INLINE SIMD<double,8> IfPos (SIMD<double,8> a, SIMD<double> b, SIMD<double> c)
210
+ {
211
+ auto k = _mm512_cmp_pd_mask(a.Data(),_mm512_setzero_pd(), _CMP_GT_OS);
212
+ return _mm512_mask_blend_pd(k,c.Data(),b.Data());
213
+ }
214
+ NETGEN_INLINE SIMD<double,8> IfZero (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
215
+ {
216
+ auto k = _mm512_cmp_pd_mask(a.Data(),_mm512_setzero_pd(), _CMP_EQ_OS);
217
+ return _mm512_mask_blend_pd(k,c.Data(),b.Data());
218
+ }
219
+
220
+
221
  // Interleave even/odd lanes of a and b within each 128-bit pair:
  // first tuple element holds the even lanes (a0,b0,a2,b2,...),
  // the second the odd lanes (a1,b1,a3,b3,...).
  NETGEN_INLINE auto Unpack (SIMD<double,8> a, SIMD<double,8> b)
  {
    return std::make_tuple(SIMD<double,8>(_mm512_unpacklo_pd(a.Data(),b.Data())),
                           SIMD<double,8>(_mm512_unpackhi_pd(a.Data(),b.Data())));
  }
226
+
227
+
228
+ NETGEN_INLINE double HSum (SIMD<double,8> sd)
229
+ {
230
+ SIMD<double,4> low = _mm512_extractf64x4_pd(sd.Data(),0);
231
+ SIMD<double,4> high = _mm512_extractf64x4_pd(sd.Data(),1);
232
+ return HSum(low)+HSum(high);
233
+ }
234
+
235
  // Horizontal sums of two vectors; returns SIMD<double,2>( sum(sd1), sum(sd2) ).
  NETGEN_INLINE auto HSum (SIMD<double,8> sd1, SIMD<double,8> sd2)
  {
    return SIMD<double,2>(HSum(sd1), HSum(sd2));
  }
239
+
240
  // Horizontal sums of four vectors; returns
  // SIMD<double,4>( sum(v1), sum(v2), sum(v3), sum(v4) ).
  NETGEN_INLINE SIMD<double,4> HSum (SIMD<double,8> v1, SIMD<double,8> v2, SIMD<double,8> v3, SIMD<double,8> v4)
  {
    SIMD<double> lo,hi;
    // unpack interleaves v1/v2 lanes; adding lo+hi yields the pair sums
    std::tie(lo,hi) = Unpack(v1, v2);
    SIMD<double> sum01 = lo+hi;
    std::tie(lo,hi) = Unpack(v3, v4);
    SIMD<double> sum23 = lo+hi;
    // sum01  b a b a b a b a
    // sum23  d c d c d c d c
    // __m512 perm = _mm512_permutex2var_pd (sum01.Data(), _mm512_set_epi64(1,2,3,4,5,6,7,8), sum23.Data());
    // collapse each 512-bit register to its 256-bit half-sum (b a b a / d c d c)
    SIMD<double,4> ab =  _mm512_extractf64x4_pd(sum01.Data(),0) + _mm512_extractf64x4_pd(sum01.Data(),1);
    SIMD<double,4> cd =  _mm512_extractf64x4_pd(sum23.Data(),0) + _mm512_extractf64x4_pd(sum23.Data(),1);
    // cross-lane shuffle + blend finishes the reduction into (a,b,c,d) order
    return _mm256_add_pd (_mm256_permute2f128_pd (ab.Data(), cd.Data(), 1 + 2 * 16), _mm256_blend_pd(ab.Data(), cd.Data(), 12));
  }
254
+
255
  // fused multiply-add: a*b + c in a single rounding step
  NETGEN_INLINE SIMD<double,8> FMA (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
  {
    return _mm512_fmadd_pd (a.Data(), b.Data(), c.Data());
  }
  NETGEN_INLINE SIMD<double,8> FMA (const double & a, SIMD<double,8> b, SIMD<double,8> c)
  {
    return _mm512_fmadd_pd (_mm512_set1_pd(a), b.Data(), c.Data());
  }

  // fused negated multiply-add: -(a*b) + c
  NETGEN_INLINE SIMD<double,8> FNMA (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
  {
    return _mm512_fnmadd_pd (a.Data(), b.Data(), c.Data());
  }
  NETGEN_INLINE SIMD<double,8> FNMA (const double & a, SIMD<double,8> b, SIMD<double,8> c)
  {
    return _mm512_fnmadd_pd (_mm512_set1_pd(a), b.Data(), c.Data());
  }

  // fused multiply with alternating add/sub: odd lanes a*b+c, even lanes a*b-c
  NETGEN_INLINE SIMD<double,8> FMAddSub (SIMD<double,8> a, SIMD<double,8> b, SIMD<double,8> c)
  {
    return _mm512_fmaddsub_pd(a.Data(), b.Data(), c.Data());
  }
277
+
278
+ NETGEN_INLINE SIMD<double,8> SwapPairs (SIMD<double,8> a)
279
+ {
280
+ return _mm512_shuffle_pd (a.Data(), a.Data(), 0b01010101);
281
+ }
282
+
283
+ }
284
+
285
+ #endif // NETGEN_CORE_SIMD_AVX512_HPP