PyPI - ngsolve - Versions diffs - 6.2.2506.post74.dev0__cp314-cp314-macosx_10_15_universal2.whl - Mend

ngsolve 6.2.2506.post74.dev0__cp314-cp314-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ngsolve might be problematic. Click here for more details.

Files changed (315) hide show

netgen/include/analytic_integrals.hpp +10 -0
netgen/include/arnoldi.hpp +55 -0
netgen/include/bandmatrix.hpp +334 -0
netgen/include/basematrix.hpp +957 -0
netgen/include/basevector.hpp +1268 -0
netgen/include/bdbequations.hpp +2805 -0
netgen/include/bdbintegrator.hpp +1660 -0
netgen/include/bem_diffops.hpp +475 -0
netgen/include/bessel.hpp +1064 -0
netgen/include/bilinearform.hpp +963 -0
netgen/include/bla.hpp +29 -0
netgen/include/blockalloc.hpp +95 -0
netgen/include/blockjacobi.hpp +328 -0
netgen/include/bspline.hpp +116 -0
netgen/include/calcinverse.hpp +141 -0
netgen/include/cg.hpp +368 -0
netgen/include/chebyshev.hpp +44 -0
netgen/include/cholesky.hpp +720 -0
netgen/include/clapack.h +7254 -0
netgen/include/code_generation.hpp +296 -0
netgen/include/coefficient.hpp +2033 -0
netgen/include/coefficient_impl.hpp +19 -0
netgen/include/coefficient_stdmath.hpp +167 -0
netgen/include/commutingAMG.hpp +106 -0
netgen/include/comp.hpp +79 -0
netgen/include/compatibility.hpp +41 -0
netgen/include/complex_wrapper.hpp +73 -0
netgen/include/compressedfespace.hpp +110 -0
netgen/include/contact.hpp +235 -0
netgen/include/diagonalmatrix.hpp +154 -0
netgen/include/differentialoperator.hpp +276 -0
netgen/include/diffop.hpp +1286 -0
netgen/include/diffop_impl.hpp +328 -0
netgen/include/diffopwithfactor.hpp +123 -0
netgen/include/discontinuous.hpp +84 -0
netgen/include/dump.hpp +949 -0
netgen/include/ectypes.hpp +121 -0
netgen/include/eigen.hpp +60 -0
netgen/include/eigensystem.hpp +18 -0
netgen/include/elasticity_equations.hpp +595 -0
netgen/include/elementbyelement.hpp +195 -0
netgen/include/elementtopology.hpp +1760 -0
netgen/include/elementtransformation.hpp +339 -0
netgen/include/evalfunc.hpp +405 -0
netgen/include/expr.hpp +1686 -0
netgen/include/facetfe.hpp +175 -0
netgen/include/facetfespace.hpp +180 -0
netgen/include/facethofe.hpp +111 -0
netgen/include/facetsurffespace.hpp +112 -0
netgen/include/fe_interfaces.hpp +32 -0
netgen/include/fem.hpp +87 -0
netgen/include/fesconvert.hpp +14 -0
netgen/include/fespace.hpp +1449 -0
netgen/include/finiteelement.hpp +286 -0
netgen/include/globalinterfacespace.hpp +77 -0
netgen/include/globalspace.hpp +115 -0
netgen/include/gridfunction.hpp +525 -0
netgen/include/h1amg.hpp +124 -0
netgen/include/h1hofe.hpp +188 -0
netgen/include/h1hofe_impl.hpp +1262 -0
netgen/include/h1hofefo.hpp +148 -0
netgen/include/h1hofefo_impl.hpp +185 -0
netgen/include/h1hofespace.hpp +167 -0
netgen/include/h1lofe.hpp +1240 -0
netgen/include/h1lumping.hpp +41 -0
netgen/include/hcurl_equations.hpp +1381 -0
netgen/include/hcurlcurlfe.hpp +2241 -0
netgen/include/hcurlcurlfespace.hpp +78 -0
netgen/include/hcurlfe.hpp +259 -0
netgen/include/hcurlfe_utils.hpp +107 -0
netgen/include/hcurlhdiv_dshape.hpp +857 -0
netgen/include/hcurlhdivfes.hpp +308 -0
netgen/include/hcurlhofe.hpp +175 -0
netgen/include/hcurlhofe_impl.hpp +1871 -0
netgen/include/hcurlhofespace.hpp +193 -0
netgen/include/hcurllofe.hpp +1146 -0
netgen/include/hdiv_equations.hpp +880 -0
netgen/include/hdivdivfe.hpp +2923 -0
netgen/include/hdivdivsurfacespace.hpp +76 -0
netgen/include/hdivfe.hpp +206 -0
netgen/include/hdivfe_utils.hpp +717 -0
netgen/include/hdivfes.hpp +75 -0
netgen/include/hdivhofe.hpp +447 -0
netgen/include/hdivhofe_impl.hpp +1107 -0
netgen/include/hdivhofefo.hpp +229 -0
netgen/include/hdivhofespace.hpp +177 -0
netgen/include/hdivhosurfacefespace.hpp +106 -0
netgen/include/hdivlofe.hpp +773 -0
netgen/include/hidden.hpp +74 -0
netgen/include/householder.hpp +181 -0
netgen/include/hypre_ams_precond.hpp +123 -0
netgen/include/hypre_precond.hpp +73 -0
netgen/include/integrator.hpp +2012 -0
netgen/include/integratorcf.hpp +253 -0
netgen/include/interpolate.hpp +49 -0
netgen/include/intrule.hpp +2542 -0
netgen/include/intrules_SauterSchwab.hpp +25 -0
netgen/include/irspace.hpp +49 -0
netgen/include/jacobi.hpp +153 -0
netgen/include/kernels.hpp +762 -0
netgen/include/l2hofe.hpp +194 -0
netgen/include/l2hofe_impl.hpp +564 -0
netgen/include/l2hofefo.hpp +542 -0
netgen/include/l2hofespace.hpp +344 -0
netgen/include/la.hpp +38 -0
netgen/include/linearform.hpp +266 -0
netgen/include/matrix.hpp +2140 -0
netgen/include/memusage.hpp +41 -0
netgen/include/meshaccess.hpp +1359 -0
netgen/include/mgpre.hpp +204 -0
netgen/include/mp_coefficient.hpp +145 -0
netgen/include/mptools.hpp +2281 -0
netgen/include/multigrid.hpp +42 -0
netgen/include/multivector.hpp +447 -0
netgen/include/mumpsinverse.hpp +187 -0
netgen/include/mycomplex.hpp +361 -0
netgen/include/ng_lapack.hpp +1661 -0
netgen/include/ngblas.hpp +1232 -0
netgen/include/ngs_defines.hpp +30 -0
netgen/include/ngs_stdcpp_include.hpp +106 -0
netgen/include/ngs_utils.hpp +121 -0
netgen/include/ngsobject.hpp +1019 -0
netgen/include/ngsstream.hpp +113 -0
netgen/include/ngstd.hpp +72 -0
netgen/include/nodalhofe.hpp +96 -0
netgen/include/nodalhofe_impl.hpp +141 -0
netgen/include/normalfacetfe.hpp +223 -0
netgen/include/normalfacetfespace.hpp +98 -0
netgen/include/normalfacetsurfacefespace.hpp +84 -0
netgen/include/order.hpp +251 -0
netgen/include/parallel_matrices.hpp +222 -0
netgen/include/paralleldofs.hpp +340 -0
netgen/include/parallelngs.hpp +23 -0
netgen/include/parallelvector.hpp +269 -0
netgen/include/pardisoinverse.hpp +200 -0
netgen/include/periodic.hpp +129 -0
netgen/include/plateaufespace.hpp +25 -0
netgen/include/pml.hpp +275 -0
netgen/include/pmltrafo.hpp +631 -0
netgen/include/postproc.hpp +142 -0
netgen/include/potentialtools.hpp +22 -0
netgen/include/precomp.hpp +60 -0
netgen/include/preconditioner.hpp +602 -0
netgen/include/prolongation.hpp +377 -0
netgen/include/python_comp.hpp +107 -0
netgen/include/python_fem.hpp +89 -0
netgen/include/python_linalg.hpp +58 -0
netgen/include/python_ngstd.hpp +386 -0
netgen/include/recursive_pol.hpp +4896 -0
netgen/include/recursive_pol_tet.hpp +395 -0
netgen/include/recursive_pol_trig.hpp +492 -0
netgen/include/reorderedfespace.hpp +81 -0
netgen/include/sample_sort.hpp +105 -0
netgen/include/scalarfe.hpp +335 -0
netgen/include/shapefunction_utils.hpp +113 -0
netgen/include/simd_complex.hpp +329 -0
netgen/include/smoother.hpp +253 -0
netgen/include/solve.hpp +89 -0
netgen/include/sparsecholesky.hpp +313 -0
netgen/include/sparsematrix.hpp +1038 -0
netgen/include/sparsematrix_dyn.hpp +90 -0
netgen/include/sparsematrix_impl.hpp +1013 -0
netgen/include/special_matrix.hpp +463 -0
netgen/include/specialelement.hpp +125 -0
netgen/include/statushandler.hpp +33 -0
netgen/include/stringops.hpp +12 -0
netgen/include/superluinverse.hpp +136 -0
netgen/include/symbolicintegrator.hpp +850 -0
netgen/include/symmetricmatrix.hpp +144 -0
netgen/include/tangentialfacetfe.hpp +224 -0
netgen/include/tangentialfacetfespace.hpp +91 -0
netgen/include/tensor.hpp +522 -0
netgen/include/tensorcoefficient.hpp +446 -0
netgen/include/tensorproductintegrator.hpp +113 -0
netgen/include/thcurlfe.hpp +128 -0
netgen/include/thcurlfe_impl.hpp +380 -0
netgen/include/thdivfe.hpp +80 -0
netgen/include/thdivfe_impl.hpp +492 -0
netgen/include/tpdiffop.hpp +461 -0
netgen/include/tpfes.hpp +133 -0
netgen/include/tpintrule.hpp +224 -0
netgen/include/triangular.hpp +465 -0
netgen/include/tscalarfe.hpp +245 -0
netgen/include/tscalarfe_impl.hpp +1029 -0
netgen/include/umfpackinverse.hpp +148 -0
netgen/include/vector.hpp +1273 -0
netgen/include/voxelcoefficientfunction.hpp +41 -0
netgen/include/vtkoutput.hpp +198 -0
netgen/include/vvector.hpp +208 -0
netgen/include/webgui.hpp +92 -0
netgen/libngbla.dylib +0 -0
netgen/libngcomp.dylib +0 -0
netgen/libngfem.dylib +0 -0
netgen/libngla.dylib +0 -0
netgen/libngsbem.dylib +0 -0
netgen/libngsolve.dylib +0 -0
netgen/libngstd.dylib +0 -0
ngsolve/TensorProductTools.py +210 -0
ngsolve/__console.py +94 -0
ngsolve/__expr.py +181 -0
ngsolve/__init__.py +148 -0
ngsolve/__init__.pyi +233 -0
ngsolve/_scikit_build_core_dependencies.py +30 -0
ngsolve/bla.pyi +1153 -0
ngsolve/bvp.py +78 -0
ngsolve/bvp.pyi +32 -0
ngsolve/cmake/NGSolveConfig.cmake +102 -0
ngsolve/cmake/ngsolve-targets-release.cmake +79 -0
ngsolve/cmake/ngsolve-targets.cmake +163 -0
ngsolve/comp/__init__.pyi +5449 -0
ngsolve/comp/pml.pyi +89 -0
ngsolve/config/__init__.py +1 -0
ngsolve/config/__init__.pyi +43 -0
ngsolve/config/__main__.py +4 -0
ngsolve/config/config.py +60 -0
ngsolve/config/config.pyi +45 -0
ngsolve/demos/TensorProduct/__init__.py +0 -0
ngsolve/demos/TensorProduct/tp_dg_1d_1d.py +80 -0
ngsolve/demos/TensorProduct/tp_dg_1d_2d.py +73 -0
ngsolve/demos/TensorProduct/tp_dg_2d_1d.py +72 -0
ngsolve/demos/TensorProduct/tp_dg_2d_2d.py +66 -0
ngsolve/demos/__init__.py +0 -0
ngsolve/demos/howto/__init__.py +0 -0
ngsolve/demos/howto/hhj.py +44 -0
ngsolve/demos/howto/hybrid_dg.py +53 -0
ngsolve/demos/howto/mixed.py +30 -0
ngsolve/demos/howto/nonlin.py +29 -0
ngsolve/demos/howto/pickling.py +26 -0
ngsolve/demos/howto/pml.py +31 -0
ngsolve/demos/howto/taskmanager.py +20 -0
ngsolve/demos/howto/tdnns.py +47 -0
ngsolve/demos/howto/timeDG-skeleton.py +45 -0
ngsolve/demos/howto/timeDG.py +38 -0
ngsolve/demos/howto/timeDGlap.py +42 -0
ngsolve/demos/howto/timeDGwave.py +61 -0
ngsolve/demos/intro/__init__.py +0 -0
ngsolve/demos/intro/adaptive.py +123 -0
ngsolve/demos/intro/cmagnet.py +59 -0
ngsolve/demos/intro/elasticity.py +76 -0
ngsolve/demos/intro/navierstokes.py +74 -0
ngsolve/demos/intro/poisson.ipynb +170 -0
ngsolve/demos/intro/poisson.py +41 -0
ngsolve/demos/mpi/__init__.py +0 -0
ngsolve/demos/mpi/mpi_cmagnet.py +87 -0
ngsolve/demos/mpi/mpi_navierstokes.py +117 -0
ngsolve/demos/mpi/mpi_poisson.py +89 -0
ngsolve/demos/mpi/mpi_timeDG.py +82 -0
ngsolve/directsolvers.py +26 -0
ngsolve/directsolvers.pyi +15 -0
ngsolve/eigenvalues.py +364 -0
ngsolve/eigenvalues.pyi +30 -0
ngsolve/fem.pyi +1647 -0
ngsolve/internal.py +89 -0
ngsolve/krylovspace.py +1013 -0
ngsolve/krylovspace.pyi +298 -0
ngsolve/la.pyi +1230 -0
ngsolve/meshes.py +748 -0
ngsolve/ngs2petsc.py +310 -0
ngsolve/ngscxx.py +42 -0
ngsolve/ngslib.so +0 -0
ngsolve/ngstd.pyi +59 -0
ngsolve/nonlinearsolvers.py +203 -0
ngsolve/nonlinearsolvers.pyi +95 -0
ngsolve/preconditioners.py +11 -0
ngsolve/preconditioners.pyi +7 -0
ngsolve/solve.pyi +109 -0
ngsolve/solve_implementation.py +168 -0
ngsolve/solve_implementation.pyi +42 -0
ngsolve/solvers.py +7 -0
ngsolve/solvers.pyi +14 -0
ngsolve/timestepping.py +185 -0
ngsolve/timestepping.pyi +28 -0
ngsolve/timing.py +108 -0
ngsolve/timing.pyi +54 -0
ngsolve/utils.py +167 -0
ngsolve/utils.pyi +273 -0
ngsolve/webgui.py +670 -0
ngsolve-6.2.2506.post74.dev0.data/data/Netgen.icns +0 -0
ngsolve-6.2.2506.post74.dev0.data/data/bin/ngscxx +17 -0
ngsolve-6.2.2506.post74.dev0.data/data/bin/ngsld +13 -0
ngsolve-6.2.2506.post74.dev0.data/data/bin/ngsolve.tcl +648 -0
ngsolve-6.2.2506.post74.dev0.data/data/bin/ngspy +2 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/beam.geo +17 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/beam.vol +240 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/chip.in2d +41 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/chip.vol +614 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/coil.geo +12 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/coil.vol +2560 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/coilshield.geo +24 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/coilshield.vol +3179 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/cube.geo +19 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/cube.vol +1832 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/d10_DGdoubleglazing.pde +50 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/d11_chip_nitsche.pde +40 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/d1_square.pde +43 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/d2_chip.pde +35 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/d3_helmholtz.pde +22 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/d4_cube.pde +46 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/d5_beam.pde +74 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/d6_shaft.pde +73 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/d7_coil.pde +50 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/d8_coilshield.pde +49 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/d9_hybridDG.pde +72 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/doubleglazing.in2d +27 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/doubleglazing.vol +737 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/piezo2d40round4.vol.gz +0 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/shaft.geo +73 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/shaft.vol +4291 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/square.in2d +17 -0
ngsolve-6.2.2506.post74.dev0.data/data/share/ngsolve/square.vol +149 -0
ngsolve-6.2.2506.post74.dev0.dist-info/METADATA +13 -0
ngsolve-6.2.2506.post74.dev0.dist-info/RECORD +315 -0
ngsolve-6.2.2506.post74.dev0.dist-info/WHEEL +5 -0
ngsolve-6.2.2506.post74.dev0.dist-info/licenses/LICENSE +504 -0
ngsolve-6.2.2506.post74.dev0.dist-info/top_level.txt +2 -0

netgen/include/tscalarfe_impl.hpp ADDED Viewed

@@ -0,0 +1,1029 @@
+#ifdef __CUDA_ARCH__
+#include "tscalarfe_impl_cuda.hpp"
+#endif
+#ifndef FILE_TSCALARFE_IMPL
+#define FILE_TSCALARFE_IMPL
+#include "tscalarfe.hpp"
+#include "recursive_pol.hpp"
+#include "shapefunction_utils.hpp"
+namespace ngfem
+{
+  // Shape functions at one integration point: ip is converted with
+  // GetTIP<DIM> and T_CalcShape receives `shape` as its output argument.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcShape (const IntegrationPoint & ip, BareSliceVector<> shape) const
+  {
+    auto tip = GetTIP<DIM>(ip);
+    T_CalcShape (tip, shape);
+  }
+  // Gradients of the shape functions at one integration point.
+  // T_CalcShape is run on the point produced by GetTIPGrad<DIM>; for every
+  // callback (i, value) the gradient part of value becomes row i of dshape.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcDShape (const IntegrationPoint & ip,
+              BareSliceMatrix<> dshape) const
+  {
+    auto store_gradient = [dshape] (int dof, auto value)
+      { dshape.Row(dof) = ngfem::GetGradient(value); };
+    T_CalcShape (GetTIPGrad<DIM> (ip), SBLambda (store_gradient));
+  }
+#ifndef FASTCOMPILE
+  // Shape functions for every point of an integration rule:
+  // point number k is evaluated into shape.Col(k).
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcShape (const IntegrationRule & ir, BareSliceMatrix<> shape) const
+  {
+    int ipnr = 0;
+    while (ipnr < ir.Size())
+      {
+        auto tip = GetTIP<DIM>(ir[ipnr]);
+        T_CalcShape (tip, shape.Col(ipnr));
+        ipnr++;
+      }
+  }
+  // SIMD variant: shape functions for every SIMD point of the rule,
+  // point number k is evaluated straight into shapes.Col(k).
+  // (An older variant stored entry by entry through a callback writing
+  // shapes(j,k); the column is now passed to T_CalcShape directly.)
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcShape (const SIMD_IntegrationRule & ir, BareSliceMatrix<SIMD<double>> shapes) const
+  {
+    size_t ipnr = 0;
+    while (ipnr < ir.Size())
+      {
+        auto tip = GetTIP<DIM>(ir[ipnr]);
+        T_CalcShape (tip, shapes.Col(ipnr));
+        ipnr++;
+      }
+  }
+  // Value of the finite element function at one integration point:
+  // returns the sum of x(i) * shape_i over all callbacks issued by T_CalcShape.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  double T_ScalarFiniteElement<FEL,ET,BASE> ::
+  Evaluate (const IntegrationPoint & ip, BareSliceVector<double> x) const
+  {
+    double result = 0;
+    auto accumulate = [x,&result] (size_t dof, double shapeval)
+      { result += x(dof)*shapeval; };
+    T_CalcShape (GetTIP<DIM>(ip), SBLambda (accumulate));
+    return result;
+  }
+  // Values of the finite element function on a whole integration rule:
+  // vals(k) = sum_j coefs(j) * shape_j(point k).
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  Evaluate (const IntegrationRule & ir, BareSliceVector<double> coefs, BareSliceVector<double> vals) const
+  {
+    size_t ipnr = 0;
+    while (ipnr < ir.GetNIP())
+      {
+        double acc = 0;
+        auto add_term = [coefs,&acc] (size_t dof, double shapeval)
+          { acc += coefs(dof)*shapeval; };
+        T_CalcShape (GetTIP<DIM>(ir[ipnr]), SBLambda (add_term));
+        vals(ipnr) = acc;
+        ipnr++;
+      }
+  }
+  // SIMD evaluation of one coefficient vector on a SIMD integration rule.
+  // Points are processed two at a time, packed into a MultiSIMD<2,double>;
+  // a possible last single point is handled separately afterwards.
+  // The coefficient vector is walked with a raw pointer advanced by
+  // coefs.Dist() on every shape callback.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  Evaluate (const SIMD_IntegrationRule & ir, BareSliceVector<> coefs, BareVector<SIMD<double>> values) const
+  {
+    FlatArray<SIMD<IntegrationPoint>> hir = ir;
+    size_t ipnr = 0;
+    while (ipnr+2 <= hir.Size())
+      {
+        MultiSIMD<2,double> acc = 0;
+        auto tip_lo = GetTIP<DIM>(hir[ipnr]);
+        auto tip_hi = GetTIP<DIM>(hir[ipnr+1]);
+        TIP<DIM,MultiSIMD<2,double>> tip_pair(tip_lo,tip_hi);
+        double * pc = coefs.Data();
+        size_t stride = coefs.Dist();
+        T_CalcShape (tip_pair,
+                     SBLambda ( [&](size_t dof, MultiSIMD<2,double> shape)
+                                {
+                                  // fused multiply-add: acc = coef*shape + acc
+                                  acc = FMA(MultiSIMD<2,double>(*pc), shape, acc);
+                                  pc += stride;
+                                } ));
+        // unpack the two halves into their respective points
+        values(ipnr) = acc.Lo();
+        values(ipnr+1) = acc.Hi();
+        ipnr += 2;
+      }
+    if (ipnr < hir.Size())
+      {
+        // odd number of SIMD points: one left over
+        SIMD<double> acc = 0;
+        double * pc = coefs.Data();
+        size_t stride = coefs.Dist();
+        T_CalcShape (GetTIP<DIM>(hir[ipnr]),
+                     SBLambda ( [&](int dof, SIMD<double> shape)
+                                { acc += (*pc)*shape; pc += stride; } ));
+        values(ipnr) = acc;
+      }
+  }
+  // SIMD evaluation of several coefficient columns at once.
+  //
+  // coefs  : one column per function to evaluate, one row per shape function
+  // values : values(c, k) receives  sum_r coefs(r, c) * shape_r(point k)
+  //
+  // Columns are processed in groups of four so that every shape value is
+  // reused for four accumulators; the remaining coefs.Width() % 4 columns
+  // are handled by the switch below.  The code relies on T_CalcShape issuing
+  // its callbacks in row order of coefs, since the coefficient pointer is
+  // simply advanced by coefs.Dist() per callback.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  Evaluate (const SIMD_IntegrationRule & ir,
+            SliceMatrix<> coefs,
+            BareSliceMatrix<SIMD<double>> values) const
+  {
+    FlatArray<SIMD<IntegrationPoint>> hir = ir;
+    size_t j = 0;
+    for ( ; j+4 <= coefs.Width(); j+=4)
+      {
+        for (size_t i = 0; i < hir.Size(); i++)
+          {
+            SIMD<double> sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0;
+            double * pcoefs = coefs.Addr(0,j);
+            size_t dist = coefs.Dist();
+            T_CalcShape (GetTIP<DIM>(hir[i]),
+                         SBLambda ( [&pcoefs, dist, &sum1, &sum2, &sum3, &sum4](int dof, SIMD<double> shape)
+                                    {
+                                      sum1 += pcoefs[0]*shape;
+                                      sum2 += pcoefs[1]*shape;
+                                      sum3 += pcoefs[2]*shape;
+                                      sum4 += pcoefs[3]*shape;
+                                      pcoefs += dist;
+                                    } ));
+            values(j,i) = sum1;
+            values(j+1,i) = sum2;
+            values(j+2,i) = sum3;
+            values(j+3,i) = sum4;
+          }
+      }
+    // leftover columns; j now indexes the first column not yet processed
+    switch (coefs.Width()&3)
+      {
+      case 0: break;
+      case 1: Evaluate (ir, coefs.Col(j), values.Row(j)); break;
+      case 2:
+        {
+          for (size_t i = 0; i < hir.Size(); i++)
+            {
+              SIMD<double> sum1 = 0, sum2 = 0;
+              double * pcoefs = coefs.Addr(0,j);
+              size_t dist = coefs.Dist();
+              T_CalcShape (GetTIP<DIM>(hir[i]),
+                           SBLambda ( [&pcoefs, dist, &sum1, &sum2](int dof, SIMD<double> shape)
+                                      {
+                                        sum1 += pcoefs[0]*shape;
+                                        sum2 += pcoefs[1]*shape;
+                                        pcoefs += dist;
+                                      } ));
+              values(j,i) = sum1;
+              values(j+1,i) = sum2;
+            }
+          break;
+        }
+      case 3:
+        {
+          for (size_t i = 0; i < hir.Size(); i++)
+            {
+              SIMD<double> sum1 = 0, sum2 = 0, sum3 = 0;
+              double * pcoefs = coefs.Addr(0,j);
+              size_t dist = coefs.Dist();
+              T_CalcShape (GetTIP<DIM>(hir[i]),
+                           SBLambda ( [&pcoefs, dist, &sum1, &sum2, &sum3](int dof, SIMD<double> shape)
+                                      {
+                                        sum1 += pcoefs[0]*shape;
+                                        sum2 += pcoefs[1]*shape;
+                                        sum3 += pcoefs[2]*shape;
+                                        pcoefs += dist;
+                                      } ));
+              values(j,i) = sum1;
+              values(j+1,i) = sum2;
+              values(j+2,i) = sum3;
+            }
+          break;
+        }
+      }
+  }
+  // Evaluation of several coefficient columns on a (non-SIMD) rule.
+  // For point k the first coefs.Width() entries of values.Row(k) are zeroed
+  // and then accumulate  shape_j * coefs.Row(j)  over all shape callbacks.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  Evaluate (const IntegrationRule & ir, SliceMatrix<> coefs, BareSliceMatrix<> values) const
+  {
+    size_t ipnr = 0;
+    while (ipnr < ir.GetNIP())
+      {
+        auto target = values.Row(ipnr).Range(coefs.Width());
+        target = 0.0;
+        auto add_row = [&] (size_t dof, double shapeval)
+          {
+            target += shapeval * coefs.Row(dof);
+          };
+        T_CalcShape (GetTIP<DIM>(ir[ipnr]), SBLambda (add_row));
+        ipnr++;
+      }
+  }
+  // Transposed evaluation: the first ndof entries of coefs are reset to zero,
+  // then every point k adds  vals(k) * shape_j(point k)  to coefs(j).
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  EvaluateTrans (const IntegrationRule & ir, BareSliceVector<> vals, BareSliceVector<double> coefs) const
+  {
+    coefs.Range(0,ndof) = 0.0;
+    size_t ipnr = 0;
+    while (ipnr < ir.GetNIP())
+      {
+        double weight = vals(ipnr);
+        auto scatter = [coefs, weight] (size_t dof, double shapeval)
+          { coefs(dof) += weight*shapeval; };
+        T_CalcShape (GetTIP<DIM>(ir[ipnr]), SBLambda (scatter));
+        ipnr++;
+      }
+  }
+  // Transposed SIMD evaluation, accumulating (coefs is NOT cleared first):
+  //   coefs(j) += HSum( values(k) * shape_j(point k) )   for all points k.
+  //
+  // Points are taken in pairs packed into MultiSIMD<2,double>; leftover
+  // points are processed one by one.  SBLambdaDuo is given two callbacks:
+  // one for a single shape value and one for two consecutive shape values,
+  // the latter using the two-argument HSum.
+  // Earlier variants (plain per-point SBLambda, and a 3-point MultiSIMD
+  // packing) were replaced by this implementation.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  AddTrans (const SIMD_IntegrationRule & ir, BareVector<SIMD<double>> values,
+            BareSliceVector<> coefs) const
+  {
+    FlatArray<SIMD<IntegrationPoint>> hir = ir;
+    size_t ipnr = 0;
+    while (ipnr+2 <= hir.Size())
+      {
+        TIP<DIM,SIMD<double>> tip_lo = hir[ipnr].TIp<DIM>();
+        TIP<DIM,SIMD<double>> tip_hi = hir[ipnr+1].TIp<DIM>();
+        TIP<DIM,MultiSIMD<2,double>> tip_pair(tip_lo,tip_hi);
+        MultiSIMD<2,double> weight (values(ipnr), values(ipnr+1));
+        double * pc = coefs.Data();
+        size_t stride = coefs.Dist();
+        T_CalcShape (tip_pair,
+                     SBLambdaDuo
+                     ([&](int dof, MultiSIMD<2,double> shape)
+                      {
+                        *pc += HSum(weight*shape);
+                        pc += stride;
+                      },
+                      [&](int dof, MultiSIMD<2,double> shape, int dof2, MultiSIMD<2,double> shape2)
+                      {
+                        auto both = HSum(weight*shape, weight*shape2);
+                        *pc += get<0>(both);
+                        pc += stride;
+                        *pc += get<1>(both);
+                        pc += stride;
+                      }
+                      ));
+        ipnr += 2;
+      }
+    while (ipnr < hir.Size())
+      {
+        TIP<DIM,SIMD<double>> tip = hir[ipnr].TIp<DIM>();
+        SIMD<double> weight (values(ipnr));
+        double * pc = coefs.Data();
+        size_t stride = coefs.Dist();
+        T_CalcShape (tip,
+                     SBLambdaDuo
+                     ([&](int dof, SIMD<double> shape)
+                      {
+                        *pc += HSum(weight*shape);
+                        pc += stride;
+                      },
+                      [&](int dof, SIMD<double> shape, int dof2, SIMD<double> shape2)
+                      {
+                        auto both = HSum(weight*shape, weight*shape2);
+                        *pc += get<0>(both);
+                        pc += stride;
+                        *pc += get<1>(both);
+                        pc += stride;
+                      }
+                      ));
+        ipnr++;
+      }
+  }
+  // #endif // FASTCOMPILE
+  // Transposed application of the dual shapes on a SIMD rule (accumulating):
+  //   coefs(j) += HSum( values(k) * dualshape_j(point k) ).
+  // The dual shapes come from FEL::T_CalcDualShape, reached by casting
+  // this element to the concrete element type FEL.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  AddDualTrans (const SIMD_IntegrationRule & ir, BareVector<SIMD<double>> values,
+            BareSliceVector<> coefs) const
+  {
+    FlatArray<SIMD<IntegrationPoint>> hir = ir;
+    const FEL * fel = static_cast<const FEL*> (this);
+    int ipnr = 0;
+    while (ipnr < hir.Size())
+      {
+        TIP<DIM,SIMD<double>> tip = hir[ipnr].TIp<DIM>();
+        SIMD<double> weight = values(ipnr);
+        fel->T_CalcDualShape (tip, SBLambda ( [&](int dof, SIMD<double> shape)
+                                              { coefs(dof) += HSum(weight*shape); } ));
+        ipnr++;
+      }
+  }
+  // Transposed application of the dual shapes on a scalar rule (accumulating):
+  //   coefs(j) += values(k) * dualshape_j(point k).
+  // The dual shapes come from FEL::T_CalcDualShape, reached by casting
+  // this element to the concrete element type FEL.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  AddDualTrans (const IntegrationRule & ir, BareSliceVector<double> values,
+            BareSliceVector<> coefs) const
+  {
+    FlatArray<IntegrationPoint> hir = ir;
+    const FEL * fel = static_cast<const FEL*> (this);
+    int ipnr = 0;
+    while (ipnr < hir.Size())
+      {
+        TIP<DIM,double> tip = hir[ipnr].TIp<DIM>();
+        double weight = values(ipnr);
+        fel->T_CalcDualShape (tip, SBLambda ( [&](int dof, double shape)
+                                              { coefs(dof) += weight*shape; } ));
+        ipnr++;
+      }
+  }
+  // #ifndef FASTCOMPILE
+  // Transposed SIMD evaluation for several columns at once (accumulating):
+  //   coefs(r, c) += HSum( values(c, k) * shape_r(point k) ).
+  //
+  // Columns are handled in groups of four: the four horizontal sums are
+  // produced together, added to four doubles loaded from the current
+  // coefficient row and stored back, after which the row pointer is advanced
+  // by coefs.Dist().  The leftover 1, 2 or 3 columns use, respectively, the
+  // vector overload of AddTrans, a SIMD<double,2> load/store, and a masked
+  // 4-wide load/store (mask built as SIMD<mask64,4>(3)).
+  // Earlier 2-column variants based on masked 4-wide loads were replaced by
+  // the SIMD<double,2> path.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  AddTrans (const SIMD_IntegrationRule & ir,
+            BareSliceMatrix<SIMD<double>> values,
+            SliceMatrix<> coefs) const
+  {
+    FlatArray<SIMD<IntegrationPoint>> hir = ir;
+    size_t j = 0;
+    while (j+4 <= coefs.Width())
+      {
+        for (size_t ipnr = 0; ipnr < hir.Size(); ipnr++)
+          {
+            TIP<DIM,SIMD<double>> tip = hir[ipnr].TIp<DIM>();
+            SIMD<double> w0 = values(j,ipnr);
+            SIMD<double> w1 = values(j+1,ipnr);
+            SIMD<double> w2 = values(j+2,ipnr);
+            SIMD<double> w3 = values(j+3,ipnr);
+            double * pc = &coefs(j);
+            size_t stride = coefs.Dist();
+            T_CalcShape (tip,
+                         SBLambda ( [&](int dof, SIMD<double> shape)
+                                    {
+                                      auto upd = HSum(shape*w0, shape*w1, shape*w2, shape*w3);
+                                      upd += SIMD<double,4> (pc);
+                                      upd.Store(pc);
+                                      pc += stride;
+                                    } ));
+          }
+        j += 4;
+      }
+    // leftover columns; j is the first column not yet processed
+    switch (coefs.Width()&3)
+      {
+      case 0:
+        break;
+      case 1:
+        AddTrans (ir, values.Row(j), coefs.Col(j));
+        break;
+      case 2:
+        {
+          for (size_t ipnr = 0; ipnr < hir.Size(); ipnr++)
+            {
+              TIP<DIM,SIMD<double>> tip = hir[ipnr].TIp<DIM>();
+              SIMD<double> w0 = values(j,ipnr);
+              SIMD<double> w1 = values(j+1,ipnr);
+              double * pc = &coefs(j);
+              size_t stride = coefs.Dist();
+              T_CalcShape (tip,
+                           SBLambda ( [w0,w1,&pc,stride](int dof, SIMD<double> shape)
+                                      {
+                                        auto upd = HSum(shape*w0, shape*w1);
+                                        upd += SIMD<double,2> (pc);
+                                        upd.Store(pc);
+                                        pc += stride;
+                                      } ));
+            }
+          break;
+        }
+      case 3:
+        {
+          SIMD<mask64,4> mask(3);
+          for (size_t ipnr = 0; ipnr < hir.Size(); ipnr++)
+            {
+              TIP<DIM,SIMD<double>> tip = hir[ipnr].TIp<DIM>();
+              SIMD<double> w0 = values(j,ipnr);
+              SIMD<double> w1 = values(j+1,ipnr);
+              SIMD<double> w2 = values(j+2,ipnr);
+              double * pc = &coefs(j);
+              size_t stride = coefs.Dist();
+              T_CalcShape (tip,
+                           SBLambda ( [w0,w1,w2,mask,stride,&pc](int dof, SIMD<double> shape)
+                                      {
+                                        // fourth lane duplicates w2; the masked
+                                        // load/store below is what limits the update
+                                        auto upd = HSum(shape*w0, shape*w1, shape*w2, shape*w2);
+                                        upd += SIMD<double,4> (pc, mask);
+                                        upd.Store(pc, mask);
+                                        pc += stride;
+                                      } ));
+            }
+          break;
+        }
+      }
+  }
+  // Returns the sum over all shape functions of coefs(k) * GetGradient(shape_k),
+  // where the shapes are produced by T_CalcShape for GetTIPGrad<DIM>(ip).
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  auto T_ScalarFiniteElement<FEL,ET,BASE> ::
+  EvaluateGrad (const IntegrationPoint & ip, BareSliceVector<double> coefs) const -> Vec<DIM>
+  {
+    Vec<DIM> grad = 0.0;
+    T_CalcShape (GetTIPGrad<DIM>(ip),
+                 SBLambda ([&grad, &coefs] (int k, auto shape)
+                           { grad += coefs(k) * ngfem::GetGradient(shape); }));
+    return grad;
+  }
+  // For every point of `ir`, accumulates coefs(dof) * GetGradient(shape_dof)
+  // over all shape functions and stores the result in the matching row of `vals`.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  EvaluateGrad (const IntegrationRule & ir, BareSliceVector<double> coefs,
+                BareSliceMatrix<> vals) const
+  {
+    for (int pnt = 0; pnt < ir.GetNIP(); pnt++)
+      {
+        Vec<DIM> grad = 0.0;
+        T_CalcShape (GetTIPGrad<DIM>(ir[pnt]),
+                     SBLambda ([coefs, &grad] (size_t dof, auto shape)
+                               {
+                                 grad += coefs(dof) * ngfem::GetGradient(shape);
+                               }));
+        vals.Row(pnt) = grad;
+      }
+  }
+  // SIMD evaluation of the gradient on a mapped integration rule.
+  // The space dimension is resolved at compile time from the codimension
+  // bmir.DimSpace()-DIM; for each point the first DIMSPACE entries of the
+  // matching column of `values` receive sum_k coefs_k * GetGradient(shape_k).
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  EvaluateGrad (const SIMD_BaseMappedIntegrationRule & bmir,
+                BareSliceVector<> coefs,
+                BareSliceMatrix<SIMD<double>> values) const
+  {
+    Switch<4-DIM>
+      (bmir.DimSpace()-DIM, [this,&bmir,coefs,values] (auto CODIM)
+       {
+         constexpr int DIMSPACE = DIM+CODIM.value;
+         auto & mir = static_cast<const SIMD_MappedIntegrationRule<DIM,DIMSPACE>&> (bmir);
+         for (size_t pnt = 0; pnt < mir.Size(); pnt++)
+           {
+             // walk through the coefficients with a raw pointer and stride
+             double * coef_ptr = &coefs(0);
+             const size_t stride = coefs.Dist();
+             Vec<DIMSPACE,SIMD<double>> grad(0.0);
+             this->T_CalcShape (GetTIP(mir[pnt]),
+                                SBLambda ([&coef_ptr,stride,&grad]
+                                          (size_t /* dof */, auto shape)
+                                          {
+                                            grad += *coef_ptr * ngfem::GetGradient(shape);
+                                            coef_ptr += stride;
+                                          }));
+             values.Col(pnt).Range(DIMSPACE) = grad;
+           }
+       });
+  }
+  // SIMD evaluation of the gradient on a (non-mapped) integration rule:
+  // for every SIMD point, the first DIM entries of the matching column of
+  // `values` receive sum_j coefs(j) * GetGradient(shape_j).
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  EvaluateGrad (const SIMD_IntegrationRule & ir,
+                BareSliceVector<> coefs,
+                BareSliceMatrix<SIMD<double>> values) const
+  {
+    // size_t index, matching the other loops over SIMD rules in this file
+    // (avoids comparing a signed index against ir.Size()).
+    for (size_t i = 0; i < ir.Size(); i++)
+      {
+        Vec<DIM,SIMD<double>> sum(0.0);
+        T_CalcShape (GetTIPGrad<DIM> (ir[i]),
+                     SBLambda ([&sum, coefs] (size_t j, auto shape)
+                               { sum += coefs(j) * ngfem::GetGradient(shape); }));
+        values.Col(i).Range(DIM) = sum;
+      }
+  }
+  // Transposed gradient evaluation: clears the first ndof entries of `coefs`,
+  // then for every point adds InnerProduct(vals.Row(point), GetGradient(shape_dof))
+  // to coefs(dof).
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  EvaluateGradTrans (const IntegrationRule & ir,
+                     BareSliceMatrix<> vals, BareSliceVector<double> coefs) const
+  {
+    coefs.Range(0,ndof) = 0.0;
+    for (int pnt = 0; pnt < ir.GetNIP(); pnt++)
+      {
+        Vec<DIM> rowvals = vals.Row(pnt);
+        T_CalcShape (GetTIPGrad<DIM>(ir[pnt]),
+                     SBLambda ([coefs, rowvals] (int dof, auto shape)
+                               {
+                                 coefs(dof) += InnerProduct (rowvals, ngfem::GetGradient(shape));
+                               }));
+      }
+  }
+  // Transposed gradient evaluation for several coefficient columns at once.
+  // `coefs` is zeroed first; for every point, row `pnt` of `values` is viewed
+  // as a (coefs.Width() x DIM) matrix and its product with the gradient of
+  // each shape function is added to the matching row of `coefs`.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  EvaluateGradTrans (const IntegrationRule & ir, SliceMatrix<> values, SliceMatrix<> coefs) const
+  {
+    int ncols = coefs.Width();
+    coefs = 0.0;
+    for (int pnt = 0; pnt < ir.GetNIP(); pnt++)
+      {
+        T_CalcShape (GetTIPGrad<DIM>(ir[pnt]),
+                     SBLambda ([&] (int dof, auto shape)
+                               {
+                                 FlatMatrixFixWidth<DIM> rowmat(ncols, &values(pnt,0));
+                                 coefs.Row(dof) += rowmat * ngfem::GetGradient(shape);
+                               }));
+      }
+  }
+  template <class FEL, ELEMENT_TYPE ET, class BASE>   // AddGradTrans, vector version: adds into `coefs` (coefs is not cleared here)
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  AddGradTrans (const SIMD_BaseMappedIntegrationRule & bmir,
+                BareSliceMatrix<SIMD<double>> values,
+                BareSliceVector<> coefs) const
+  {
+    if constexpr (DIM == 0) return;   // 0-dimensional element: return without touching coefs
+    Iterate<4-DIM>   // presumably invokes the lambda once per candidate CODIM; only the matching DIMSPACE does work
+      ([&](auto CODIM)
+       {
+         constexpr auto DIMSPACE = DIM+CODIM.value;
+         if (bmir.DimSpace() == DIMSPACE)   // run-time selection of the compile-time space dimension
+           {
+             auto & mir = static_cast<const SIMD_MappedIntegrationRule<DIM,DIMSPACE>&> (bmir);
+             for (size_t i = 0; i < mir.Size(); i++)
+               {
+                 // Directional derivative
+                 [[maybe_unused]]
+                   Vec<DIM, SIMD<double>> jac_dir = mir[i].GetJacobianInverse() * values.Col(i);
+                 const auto &ip = mir[i].IP();
+                 TIP<DIM,AutoDiff<1,SIMD<double>>>adp(ip.FacetNr(), ip.VB());   // one derivative slot per coordinate, seeded below with jac_dir
+                 if constexpr(DIM>0)
+                     adp.x = AutoDiff<1, SIMD<double>>( ip(0), jac_dir(0) );
+                 if constexpr(DIM>1)
+                     adp.y = AutoDiff<1, SIMD<double>>( ip(1), jac_dir(1) );
+                 if constexpr(DIM>2)
+                     adp.z = AutoDiff<1, SIMD<double>>( ip(2), jac_dir(2) );
+                 double * pcoef = &coefs(0);   // raw pointer walk over coefs, restarted for every point
+                 size_t dist = coefs.Dist();
+                 this->T_CalcShape (adp,
+                                    SBLambda ([dist,&pcoef] (size_t j, auto shape)
+                                              {
+                                                *pcoef += HSum(shape.DValue(0));   // HSum presumably sums the SIMD lanes - TODO confirm
+                                                pcoef += dist;   // advance to the next coefficient
+                                              }));
+               }
+           }
+       });
+  }
+  template <class FEL, ELEMENT_TYPE ET, class BASE>   // AddGradTrans, matrix version: adds into every column of `coefs` (coefs is not cleared here)
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  AddGradTrans (const SIMD_BaseMappedIntegrationRule & bmir,
+                BareSliceMatrix<SIMD<double>> values,
+                SliceMatrix<> coefs) const
+  {
+    Iterate<4-DIM>   // presumably invokes the lambda once per candidate CODIM; only the matching DIMSPACE does work
+      ([&](auto CODIM)
+       {
+         constexpr auto DIMSPACE = DIM+CODIM.value;
+         if (bmir.DimSpace() == DIMSPACE)
+           {
+             auto & mir = static_cast<const SIMD_MappedIntegrationRule<DIM,DIMSPACE>&> (bmir);
+             size_t j = 0;   // current coefficient column, shared by both loops below
+             for ( ; j+4 <= coefs.Width(); j+=4)   // first pass: four coefficient columns per iteration
+               {
+                 for (size_t i = 0; i < mir.Size(); i++)
+                   {
+                     TIP<DIM,AutoDiff<DIMSPACE,SIMD<double>>>adp = GetTIP(mir[i]);
+                     double * pcoef = &coefs(0,j);   // start of row 0 at column j; advanced by dist per shape function
+                     size_t dist = coefs.Dist();
+                     // Vec<4*DIMSPACE,SIMD<double>> vals = values.Col(i).Range(j*DIMSPACE, (j+4)*DIMSPACE);
+                     Vec<DIMSPACE,SIMD<double>> vals1 = values.Col(i).Range(j*DIMSPACE, (j+1)*DIMSPACE);   // DIMSPACE entries of column i belonging to coefficient column j
+                     Vec<DIMSPACE,SIMD<double>> vals2 = values.Col(i).Range((j+1)*DIMSPACE, (j+2)*DIMSPACE);
+                     Vec<DIMSPACE,SIMD<double>> vals3 = values.Col(i).Range((j+2)*DIMSPACE, (j+3)*DIMSPACE);
+                     Vec<DIMSPACE,SIMD<double>> vals4 = values.Col(i).Range((j+3)*DIMSPACE, (j+4)*DIMSPACE);
+                     this->T_CalcShape (adp,
+                                        SBLambda ([=,&pcoef] (size_t j, auto shape)   // note: this parameter j shadows the outer column index and is unused in the body
+                                                  {
+                                                    auto grad = ngfem::GetGradient(shape);
+                                                    SIMD<double> sum1 = InnerProduct(vals1, grad);
+                                                    SIMD<double> sum2 = InnerProduct(vals2, grad);
+                                                    SIMD<double> sum3 = InnerProduct(vals3, grad);
+                                                    SIMD<double> sum4 = InnerProduct(vals4, grad);
+                                                    SIMD<double,4> allsum = HSum(sum1, sum2, sum3, sum4);   // presumably one lane-sum per coefficient column - TODO confirm
+                                                    allsum += SIMD<double,4> (pcoef);   // add the 4 doubles currently stored at pcoef
+                                                    allsum.Store(pcoef);   // write the updated 4 values back
+                                                    pcoef += dist;
+                                                  }));
+                   }
+               }
+             for ( ; j+1 <= coefs.Width(); j++)   // second pass: remaining columns, one at a time
+               {
+                 for (size_t i = 0; i < mir.Size(); i++)
+                   {
+                     // TIP<DIM,AutoDiff<DIMSPACE,SIMD<double>>>adp = GetTIP(mir[i]);
+                     double * pcoef = &coefs(0,j);
+                     size_t dist = coefs.Dist();
+                     Vec<DIMSPACE,SIMD<double>> vals = values.Col(i).Range(j*DIMSPACE, (j+1)*DIMSPACE);
+                     this->T_CalcShape (GetTIP(mir[i]),   // adp
+                                        SBLambda ([=,&pcoef] (size_t j, auto shape)   // parameter j again shadows the outer j and is unused
+                                                  {
+                                                    *pcoef += HSum(InnerProduct(ngfem::GetGradient(shape), vals));
+                                                    pcoef += dist;
+                                                  }));
+                   }
+               }
+           }
+       });
+  }
+  /*
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcDShape (const IntegrationPoint & ip,
+	      const std::function<void(int,Vec<DIM>)> & callback) const
+  {
+    Vec<DIM, AutoDiff<DIM> > adp;
+    for (int i = 0; i < DIM; i++)
+      adp[i] = AutoDiff<DIM> (ip(i), i);
+    // DShapeAssign<DIM> ds(dshape);
+    // T_CalcShape (&adp(0), ds);
+    T_CalcShape (&adp(0), SBLambda ([&] (int i, AutoDiff<DIM> shape)
+                                    {
+				      Vec<DIM> v;
+				      shape.StoreGradient (&v(0));
+				      callback (i,v);
+				    }));
+  }
+  */
+  /*
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcMappedDShape (const MappedIntegrationPoint<DIM,DIM> & mip,
+		    FlatMatrixFixWidth<DIM> dshape) const
+  {
+    Vec<DIM, AutoDiff<DIM> > adp;
+    for (int i = 0; i < DIM; i++)
+      adp[i].Value() = mip.IP()(i);
+    for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++)
+	adp[i].DValue(j) = mip.GetJacobianInverse()(i,j);
+    T_CalcShape (&adp(0), SBLambda ([&] (int i, AutoDiff<DIM> shape)
+                                    { shape.StoreGradient (&dshape(i,0)) ; }));
+  }
+  */
+  // Gradients of all shape functions at one mapped integration point.
+  // The space dimension is resolved at compile time from the codimension
+  // bmip.DimSpace()-DIM; row i of `dshape` (ndof x DIMSPACE) receives
+  // GetGradient(shape_i).
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcMappedDShape (const BaseMappedIntegrationPoint & bmip,
+		    BareSliceMatrix<> dshape) const
+  {
+    Switch<4-DIM>
+      (bmip.DimSpace()-DIM, [&bmip, dshape, this](auto CODIM)
+       {
+         constexpr int REFDIM = DIM;
+         constexpr int DIMSPACE = int(DIM)+int(CODIM.value);
+         static_assert(DIM<=DIMSPACE, "dim<=dimspace");
+         auto & point = static_cast<const MappedIntegrationPoint<REFDIM,DIMSPACE> &> (bmip);
+         auto grad_rows = dshape.AddSize(ndof, DIMSPACE);
+         this->T_CalcShape (GetTIP(point),
+                            SBLambda ([grad_rows] (size_t dof, auto shape)
+                                      {
+                                        grad_rows.Row(dof) = ngfem::GetGradient(shape);
+                                      }));
+       });
+  }
+  // Gradients of all shape functions at every point of a mapped integration
+  // rule. Point number p owns the column block [p*DIMSPACE, (p+1)*DIMSPACE)
+  // of `dshape`; inside that block, row j receives GetGradient(shape_j).
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcMappedDShape (const BaseMappedIntegrationRule & bmir,
+		    BareSliceMatrix<> dshape) const
+  {
+    Switch<4-DIM>
+      (bmir.DimSpace()-DIM, [&bmir, dshape, this](auto CODIM)
+       {
+         constexpr int REFDIM = DIM;
+         constexpr int DIMSPACE = int(DIM)+int(CODIM.value);
+         auto & rule = static_cast<const MappedIntegrationRule<REFDIM,DIMSPACE> &> (bmir);
+         for (size_t pnt = 0; pnt < rule.Size(); pnt++)
+           {
+             auto block = dshape.Cols(pnt*DIMSPACE, (pnt+1)*DIMSPACE).AddSize(ndof, DIMSPACE);
+             this->T_CalcShape (GetTIP(rule[pnt]),
+                                SBLambda ([block] (size_t dof, auto shape)
+                                          {
+                                            block.Row(dof) = ngfem::GetGradient(shape);
+                                          }));
+           }
+       });
+  }
+  // SIMD version: writes the derivative components of every shape function
+  // for each SIMD point of `bmir` into column i of `dshapes`, walking down
+  // the column with stride dshapes.Dist() (one entry per shape function and
+  // space direction, in that order).
+  // Only space dimensions DIM and DIM+1 are handled; any other case prints
+  // a diagnostic and leaves `dshapes` untouched.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcMappedDShape (const SIMD_BaseMappedIntegrationRule & bmir,
+                    BareSliceMatrix<SIMD<double>> dshapes) const
+  {
+   if (bmir.DimSpace() == DIM)
+      {
+        auto & mir = static_cast<const SIMD_MappedIntegrationRule<DIM,DIM>&> (bmir);
+        for (size_t i = 0; i < mir.Size(); i++)
+          {
+            SIMD<double> * pdshapes = dshapes.Col(i).Data();
+            size_t dist = dshapes.Dist();
+            T_CalcShape (GetTIP(mir[i]),
+                         SBLambda ([&] (size_t j, AutoDiff<DIM,SIMD<double>> shape)
+                                   {
+                                     Iterate<DIM> ( [&] (size_t ii) {
+                                         *pdshapes = shape.DValue(ii);
+                                         pdshapes += dist;
+                                       });
+                                   }));
+          }
+      }
+   else if (bmir.DimSpace() == DIM+1)
+     {
+       // DIM1 is clamped to DIM for DIM==3 so that the cast below stays a
+       // valid instantiation even though this branch cannot be taken then.
+       constexpr int DIM1 = DIM<3 ? DIM+1 : DIM;
+       auto & mir = static_cast<const SIMD_MappedIntegrationRule<DIM,DIM1>&> (bmir);
+       for (size_t i = 0; i < mir.Size(); i++)
+         {
+           SIMD<double> * pdshapes = dshapes.Col(i).Data();
+           size_t dist = dshapes.Dist();
+           T_CalcShape (GetTIP(mir[i]),
+                        SBLambda ([&] (size_t j, AutoDiff<DIM1,SIMD<double>> shape)
+                                  {
+                                    for (size_t k = 0; k < DIM1; k++)
+                                      {
+                                        *pdshapes = shape.DValue(k);
+                                        pdshapes += dist;
+                                      }
+                                  }));
+         }
+     }
+   else
+     {
+       // The diagnostic used to name EvaluateGrad(simd), which is a different function.
+       cout << "CalcMappedDShape(simd) called for bboundary (not implemented)" << endl;
+     }
+  }
+  // Second derivatives of all shape functions at `ip`.
+  // Row i of `ddshape` receives the DIM x DIM values shape_i.DDValue(d1,d2),
+  // stored row-major (index d1*DIM+d2).
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcDDShape (const IntegrationPoint & ip,
+               BareSliceMatrix<> ddshape) const
+  {
+    TIP<DIM, AutoDiff<DIM>> first_order = ip;
+    TIP<DIM, AutoDiffDiff<DIM>> second_order = first_order;
+    T_CalcShape (second_order,
+                 SBLambda ([ddshape] (size_t dof, auto shape)
+                           {
+                             auto hesse = ddshape.Row(dof);
+                             int pos = 0;   // runs through d1*DIM+d2 in order
+                             for (int d1 = 0; d1 < DIM; d1++)
+                               for (int d2 = 0; d2 < DIM; d2++)
+                                 hesse(pos++) = shape.DDValue(d1,d2);
+                           }));
+  }
+  // Second derivatives of all shape functions at a mapped integration point.
+  // The space dimension is resolved at compile time from the codimension
+  // bmip.DimSpace()-DIM; row i of `ddshape` receives the DIMSPACE x DIMSPACE
+  // values shape_i.DDValue(d1,d2), stored row-major (index d1*DIMSPACE+d2).
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcMappedDDShape (const BaseMappedIntegrationPoint & bmip,
+                     BareSliceMatrix<> ddshape) const
+  {
+    Switch<4-DIM>
+      (bmip.DimSpace()-DIM, [&] (auto CODIM)
+      {
+        constexpr int REFDIM = DIM;
+        constexpr int DIMSPACE = int(DIM)+int(CODIM.value);
+        auto & point = static_cast<const MappedIntegrationPoint<REFDIM,DIMSPACE>&> (bmip);
+        T_CalcShape (GetTIPHesse (point),
+                     SBLambda ([ddshape,DIMSPACE] (size_t dof, auto shape)
+                     {
+                       auto hesse = ddshape.Row(dof);
+                       int pos = 0;   // runs through d1*DIMSPACE+d2 in order
+                       for (int d1 = 0; d1 < DIMSPACE; d1++)
+                         for (int d2 = 0; d2 < DIMSPACE; d2++)
+                           hesse(pos++) = shape.DDValue(d1,d2);
+                     }));
+      });
+  }
+  // SIMD variant of CalcMappedDDShape: same row-major layout
+  // (index d1*DIMSPACE+d2 in row i of `ddshape`), with SIMD<double> entries.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcMappedDDShape (const SIMD<BaseMappedIntegrationPoint> & bmip,
+                     BareSliceMatrix<SIMD<double>> ddshape) const
+  {
+    Switch<4-DIM>
+      (bmip.DimSpace()-DIM, [&] (auto CODIM)
+      {
+        constexpr int DIMSPACE = DIM+CODIM.value;
+        auto & point = static_cast<const SIMD<MappedIntegrationPoint<DIM,DIMSPACE>>&> (bmip);
+        T_CalcShape (GetTIPHesse (point),
+                     SBLambda ([ddshape,DIMSPACE] (size_t dof, auto shape)
+                     {
+                       auto hesse = ddshape.Row(dof);
+                       int pos = 0;   // runs through d1*DIMSPACE+d2 in order
+                       for (int d1 = 0; d1 < DIMSPACE; d1++)
+                         for (int d2 = 0; d2 < DIMSPACE; d2++)
+                           hesse(pos++) = shape.DDValue(d1,d2);
+                     }));
+      });
+  }
+#endif
+  // Forwards to the element class FEL (static downcast of this object) and
+  // returns whatever its GetDiagDualityMassInverse2 returns.
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  bool T_ScalarFiniteElement<FEL,ET,BASE> :: GetDiagDualityMassInverse (FlatVector<> diag) const
+  {
+    const FEL & derived = static_cast<const FEL&> (*this);
+    return derived.GetDiagDualityMassInverse2(diag);
+  }
+  // Dual shape functions at a mapped point: the first ndof entries of `shape`
+  // are cleared, then every value reported by FEL::T_CalcDualShape is stored
+  // as shape(j) = val / mip.GetMeasure().
+  template <class FEL, ELEMENT_TYPE ET, class BASE>
+  void T_ScalarFiniteElement<FEL,ET,BASE> ::
+  CalcDualShape (const BaseMappedIntegrationPoint & mip, BareSliceVector<> shape) const
+  {
+    const double inv_measure = 1.0/mip.GetMeasure();
+    shape.Range(ndof) = 0.0;
+    static_cast<const FEL*> (this)->
+      T_CalcDualShape (GetTIP<DIM>(mip.IP()),
+                       SBLambda ([&shape, inv_measure] (int j, double val)
+                                 {
+                                   shape(j) = inv_measure * val;
+                                 }));
+  }
+}
+#endif