PyPI - ngsolve - Versions diffs - 6.2.2502__cp311-cp311-win_amd64.whl → 6.2.2601__cp311-cp311-win_amd64.whl - Mend

ngsolve 6.2.2502__cp311-cp311-win_amd64.whl → 6.2.2601__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107) hide show

netgen/include/analytic_integrals.hpp +10 -0
netgen/include/basematrix.hpp +6 -0
netgen/include/bdbequations.hpp +55 -0
netgen/include/bem_diffops.hpp +475 -0
netgen/include/bilinearform.hpp +4 -1
netgen/include/blockjacobi.hpp +17 -5
netgen/include/bspline.hpp +2 -0
netgen/include/cholesky.hpp +2 -2
netgen/include/code_generation.hpp +2 -2
netgen/include/complex_wrapper.hpp +30 -2
netgen/include/contact.hpp +8 -0
netgen/include/diagonalmatrix.hpp +6 -0
netgen/include/diffop_impl.hpp +3 -1
netgen/include/diffopwithfactor.hpp +123 -0
netgen/include/elementbyelement.hpp +9 -3
netgen/include/expr.hpp +45 -7
netgen/include/fespace.hpp +12 -4
netgen/include/gridfunction.hpp +3 -3
netgen/include/h1amg.hpp +85 -2
netgen/include/h1lumping.hpp +6 -0
netgen/include/hcurl_equations.hpp +29 -0
netgen/include/hcurlcurlfe.hpp +20 -0
netgen/include/hdiv_equations.hpp +15 -0
netgen/include/hdivfe_utils.hpp +1 -0
netgen/include/hdivhofespace.hpp +2 -0
netgen/include/integrator.hpp +4 -16
netgen/include/intrule.hpp +2 -1
netgen/include/intrules_SauterSchwab.hpp +25 -0
netgen/include/jacobi.hpp +35 -18
netgen/include/kernels.hpp +724 -0
netgen/include/l2hofe.hpp +1 -0
netgen/include/matrix.hpp +8 -3
netgen/include/meshaccess.hpp +4 -3
netgen/include/mp_coefficient.hpp +145 -0
netgen/include/mptools.hpp +1331 -368
netgen/include/mycomplex.hpp +1 -1
netgen/include/ngblas.hpp +116 -7
netgen/include/potentialtools.hpp +22 -0
netgen/include/preconditioner.hpp +23 -23
netgen/include/prolongation.hpp +132 -6
netgen/include/recursive_pol.hpp +63 -11
netgen/include/simd_complex.hpp +45 -0
netgen/include/sparsecholesky.hpp +6 -2
netgen/include/sparsefactorization_interface.hpp +159 -0
netgen/include/sparsematrix.hpp +21 -7
netgen/include/sparsematrix_dyn.hpp +6 -7
netgen/include/sparsematrix_impl.hpp +175 -40
netgen/include/special_matrix.hpp +2 -0
netgen/include/statushandler.hpp +8 -8
netgen/include/symbolicintegrator.hpp +2 -1
netgen/include/tangentialfacetfespace.hpp +7 -22
netgen/include/thdivfe_impl.hpp +66 -0
netgen/include/tscalarfe.hpp +1 -1
netgen/include/vector.hpp +272 -47
netgen/lib/libngsolve.lib +0 -0
netgen/libngsolve.dll +0 -0
netgen/ngscxx.bat +1 -1
netgen/ngsld.bat +1 -1
ngsolve/__init__.py +1 -0
ngsolve/cmake/NGSolveConfig.cmake +8 -8
ngsolve/cmake/ngsolve-targets.cmake +24 -18
ngsolve/config/config.py +7 -7
ngsolve/demos/intro/cmagnet.py +19 -22
ngsolve/directsolvers.py +9 -21
ngsolve/krylovspace.py +172 -3
ngsolve/ngslib.lib +0 -0
ngsolve/ngslib.pyd +0 -0
ngsolve/nonlinearsolvers.py +2 -2
ngsolve/preconditioners.py +1 -0
ngsolve/solve_implementation.py +168 -0
ngsolve/{solvers.py → solvers/__init__.py} +1 -1
ngsolve/solvers/cudss.py +112 -0
ngsolve/webgui.py +2 -0
{ngsolve-6.2.2502.dist-info → ngsolve-6.2.2601.dist-info}/METADATA +2 -2
{ngsolve-6.2.2502.dist-info → ngsolve-6.2.2601.dist-info}/RECORD +107 -97
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/Scripts/ngsolve.tcl +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/beam.geo +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/beam.vol +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/chip.in2d +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/chip.vol +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coil.geo +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coil.vol +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coilshield.geo +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coilshield.vol +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/cube.geo +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/cube.vol +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d10_DGdoubleglazing.pde +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d11_chip_nitsche.pde +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d1_square.pde +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d2_chip.pde +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d3_helmholtz.pde +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d4_cube.pde +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d5_beam.pde +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d6_shaft.pde +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d7_coil.pde +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d8_coilshield.pde +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d9_hybridDG.pde +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/doubleglazing.in2d +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/doubleglazing.vol +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/piezo2d40round4.vol.gz +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/shaft.geo +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/shaft.vol +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/square.in2d +0 -0
{ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/square.vol +0 -0
{ngsolve-6.2.2502.dist-info → ngsolve-6.2.2601.dist-info}/LICENSE +0 -0
{ngsolve-6.2.2502.dist-info → ngsolve-6.2.2601.dist-info}/WHEEL +0 -0
{ngsolve-6.2.2502.dist-info → ngsolve-6.2.2601.dist-info}/top_level.txt +0 -0

netgen/include/mptools.hpp CHANGED Viewed

@@ -10,8 +10,77 @@
 #include <coefficient.hpp>
 #include <recursive_pol.hpp>
-namespace ngfem
+namespace ngsbem
 {
+  using namespace ngfem;
+  template<typename T>
+  constexpr int VecLength = 1;  // Default: Complex has length 1
+  template<int N>
+  constexpr int VecLength<Vec<N, Complex>> = N;  // Specialization: Vec<N,Complex> has length N
+  constexpr int FMM_SW = 4;
+  // ************************ SIMD - creation (should end up in simd.hpp) *************
+  template <int S, typename T, int SW>
+  Vec<S,T> HSum (Vec<S,SIMD<T,SW>> v)
+  {
+    Vec<S,T> res;
+    for (int i = 0; i < S; i++)
+      res(i) = HSum(v(i));
+    // Iterate<S> ([&](auto i) {
+    // res.HTData().template Elem<i.value>() = HSum(v.HTData().template Elem<i.value>());
+    // });
+    return res;
+  }
+  class NGS_DLL_HEADER PrecomputedSqrts
+  {
+  public:
+    Array<double> sqrt_int;
+    // Array<double> inv_sqrt_int;
+    Array<double> sqrt_n_np1;    // sqrt(n*(n+1))
+    Array<double> inv_sqrt_2np1_2np3;  // 1/sqrt( (2n+1)*(2n+3) )
+    PrecomputedSqrts();
+  };
+  extern NGS_DLL_HEADER PrecomputedSqrts presqrt;
+  class FMM_Parameters
+  {
+  public:
+    int maxdirect = 100;
+    int minorder = 20;    // order = minorder + 2 kappa r
+  };
+  inline std::tuple<double, double, double> SphericalCoordinates(Vec<3> dist){
+    double len, theta, phi;
+    len = L2Norm(dist);
+    if (len < 1e-30)
+      theta = 0;
+    else
+      theta = acos (dist(2) / len);
+    if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
+      phi = 0;
+    else
+      phi = atan2(dist(1), dist(0));
+    return {len, theta, phi};
+  }
   template <typename entry_type = Complex>
@@ -77,23 +146,91 @@ namespace ngfem
     void Calc (Vec<3> x, FlatVector<Complex> shapes);
+    void FlipZ ();
     void RotateZ (double alpha);
-    void RotateY (double alpha);
+    template <typename FUNC>
+    void RotateZ (double alpha, FUNC func) const
+    {
+      if (order < 0) return;
+      Vector<Complex> exp_imalpha(order+1);
+      Complex exp_ialpha(cos(alpha), sin(alpha));
+      Complex prod = 1.0;
+      for (int i = 0; i <= order; i++)
+        {
+          exp_imalpha(i) = prod;
+          prod *= exp_ialpha;
+        }
+      int ii = 0;
+      for (int n = 0; n <= order; n++)
+        {
+          for (int m = -n; m < 0; m++, ii++)
+            func(ii, conj(exp_imalpha(-m)));
+          for (int m = 0; m <= n; m++, ii++)
+            func(ii, exp_imalpha(m));
+        };
+    };
+    template <typename FUNC>
+    void RotateZFlip (double alpha, bool flip, FUNC func) const
+    {
+      if (order < 0) return;
+      Vector<Complex> exp_imalpha(order+1);
+      Complex exp_ialpha(cos(alpha), sin(alpha));
+      Complex prod = 1.0;
+      for (int i = 0; i <= order; i++)
+        {
+          exp_imalpha(i) = prod;
+          prod *= exp_ialpha;
+        }
+      int ii = 0;
+      auto FlipFactor = [] (int n, int m, bool flip)->double
+      {
+        if (flip)
+          return ((n-m)%2) == 1 ? -1 : 1;
+        return 1.0;
+      };
+      for (int n = 0; n <= order; n++)
+        {
+          for (int m = -n; m < 0; m++, ii++)
+            func(ii, FlipFactor(n,m,flip)*conj(exp_imalpha(-m)));
+          for (int m = 0; m <= n; m++, ii++)
+            func(ii, FlipFactor(n,m,flip)*exp_imalpha(m));
+        };
+    };
+    void RotateY (double alpha, bool parallel = false);
     static double CalcAmn (int m, int n)
     {
       if (m < 0) m=-m;
       if (n < m) return 0;
-      return sqrt( (n+1.0+m)*(n+1.0-m) / ( (2*n+1)*(2*n+3) ));
+      if (2*n+1 < presqrt.sqrt_int.Size())
+        return presqrt.sqrt_int[n+1+m]*presqrt.sqrt_int[n+1-m] * presqrt.inv_sqrt_2np1_2np3[n];
+      else
+        return sqrt( (n+1.0+m)*(n+1.0-m) / ( (2*n+1)*(2*n+3) ));
     }
     static double CalcBmn (int m, int n)
     {
       double sgn = (m >= 0) ? 1 : -1;
-      if ( (m > n) || (-m > n) ) return 0;
-      return sgn * sqrt( (n-m-1.0)*(n-m) / ( (2*n-1.0)*(2*n+1)));
+      if ( (m >= n) || (-m > n) ) return 0;
+      if (n <= presqrt.inv_sqrt_2np1_2np3.Size())
+        return sgn * presqrt.sqrt_n_np1[n-m-1] * presqrt.inv_sqrt_2np1_2np3[n-1];
+      else
+        return sgn * sqrt( (n-m-1.0)*(n-m) / ( (2*n-1.0)*(2*n+1)));
     }
     static double CalcDmn (int m, int n)
@@ -112,11 +249,11 @@ namespace ngfem
   // https://fortran-lang.discourse.group/t/looking-for-spherical-bessel-and-hankel-functions-of-first-and-second-kind-and-arbitrary-order/2308/2
   NGS_DLL_HEADER
   void besseljs3d (int nterms, double z, double scale,
-                   FlatVector<double> fjs, FlatVector<double> fjder);
+                   SliceVector<double> fjs, SliceVector<double> fjder = FlatVector<double>(0, nullptr));
   NGS_DLL_HEADER
   void besseljs3d (int nterms, Complex z, double scale,
-                   FlatVector<Complex> fjs, FlatVector<Complex> fjder);
+                   SliceVector<Complex> fjs, SliceVector<Complex> fjder = FlatVector<Complex>(0, nullptr));
   /*
@@ -135,14 +272,17 @@ namespace ngfem
                FlatVector<double> jp,
                FlatVector<double> yp);
   template <typename T>
   void SphericalBessel (int n, double rho, double scale, T && values)
   {
+    besseljs3d (n, rho, scale,  values);
+    /*
     Vector<double> j(n+1), jp(n+1);
     besseljs3d (n, rho, scale,  j, jp);
     values = j;
+    */
   }
@@ -166,21 +306,6 @@ namespace ngfem
         return;
       }
     Vector j(n+1), y(n+1), jp(n+1), yp(n+1);
-    // SBESJY (rho, n, j, y, jp, yp);
-    /*
-    values = j + Complex(0,1) * y;
-    if (scale != 1.0)
-      {
-        double prod = 1.0;
-        for (int i = 0; i <= n; i++)
-          {
-            values(i) *= prod;
-            prod *= scale;
-          }
-      }
-    */
     // the bessel-evaluation with scale
     besseljs3d (n, rho, 1/scale,  j, jp);
@@ -208,7 +333,7 @@ namespace ngfem
   // hn1 = jn+ i*yn
-  class MPSingular
+  class Singular
   {
   public:
     template <typename T>
@@ -216,48 +341,80 @@ namespace ngfem
     {
       SphericalHankel1(order, r, scale,  values);
     }
+    template <typename T>
+    static void Eval (int order, double kappa, double r, double rtyp, T && values)
+    {
+      double scale = Scale(kappa, rtyp);
+      SphericalHankel1(order, r*kappa, scale,  values);
+    }
+    static double Scale (double kappa, double rtyp)
+    {
+      // return min(1.0, rtyp*kappa);
+      return min(1.0, 0.5*rtyp*kappa);
+    }
   };
   // jn
-  class MPRegular
+  class Regular
   {
-  public:
+  public:
     template <typename T>
     static void Eval (int order, double r, double scale, T && values)
     {
       SphericalBessel (order, r, 1.0/scale, values);
     }
+    template <typename T>
+    static void Eval (int order, double kappa, double r, double rtyp, T && values)
+    {
+      double scale = Scale(kappa, rtyp);
+      SphericalBessel (order, r*kappa, 1.0/scale, values);
+    }
+    static double Scale (double kappa, double rtyp)
+    {
+      // return 1.0/ min(1.0, 0.25*rtyp*kappa);
+      return 1.0/ min(1.0, 0.5*rtyp*kappa);
+    }
   };
   template <typename RADIAL, typename entry_type=Complex>
-  class NGS_DLL_HEADER MultiPole
+  class NGS_DLL_HEADER SphericalExpansion
   {
     SphericalHarmonics<entry_type> sh;
     double kappa;
-    double scale;
+    double rtyp;
   public:
-    MultiPole (int aorder, double akappa, double ascale = 1)
-      : sh(aorder), kappa(akappa), scale(ascale) { }
+    SphericalExpansion (int aorder, double akappa, double artyp)
+    : sh(aorder), kappa(akappa), rtyp(artyp) { }
     entry_type & Coef(int n, int m) { return sh.Coef(n,m); }
     auto & SH() { return sh; }
     const auto & SH() const { return sh; }
     double Kappa() const { return kappa; }
-    double Scale() const { return scale; }
+    double Scale() const { return RADIAL::Scale(kappa, rtyp); }
+    double RTyp() const { return rtyp; }
     int Order() const { return sh.Order(); }
-    MultiPole Truncate(int neworder) const
+    SphericalExpansion Truncate(int neworder) const
     {
       if (neworder > sh.Order()) neworder=sh.Order();
-      MultiPole nmp(neworder, kappa);
+      SphericalExpansion nmp(neworder, kappa, rtyp);
       nmp.sh.Coefs() = sh.Coefs().Range(sqr(neworder+1));
       return nmp;
     }
-    MultiPole & operator+= (const MultiPole & mp2)
+    SphericalExpansion & operator+= (const SphericalExpansion & mp2)
     {
       size_t commonsize = min(SH().Coefs().Size(), mp2.SH().Coefs().Size());
       SH().Coefs().Range(commonsize) += mp2.SH().Coefs().Range(commonsize);
@@ -268,18 +425,27 @@ namespace ngfem
     entry_type EvalDirectionalDerivative (Vec<3> x, Vec<3> d) const;
     void AddCharge (Vec<3> x, entry_type c);
-    void AddDipole (Vec<3> x, Vec<3> d, entry_type c);
-    void AddCurrent (Vec<3> ap, Vec<3> ep, Complex j, int num=100);
+    void AddDipole (Vec<3> x, Vec<3> dir, entry_type c);
+    void AddChargeDipole (Vec<3> x, entry_type c, Vec<3> dir, entry_type c2)
+    {
+      // TODO: add them at once
+      AddCharge (x, c);
+      AddDipole (x, dir, c2);
+    }
+    void AddPlaneWave (Vec<3> d, entry_type c);
+    void AddCurrent (Vec<3> ap, Vec<3> ep, Complex j, int num=100);
-    void ChangeScaleTo (double newscale)
+    void ChangeRTypTo (double new_rtyp)
     {
-      double fac = scale/newscale;
+      double fac = RADIAL::Scale(kappa, rtyp) / RADIAL::Scale(kappa, new_rtyp);
       double prod = 1;
       for (int n = 0; n <= sh.Order(); n++, prod*= fac)
         sh.CoefsN(n) *= prod;
-      scale = newscale;
+      rtyp = new_rtyp;
     }
     Vector<double> Spectrum (bool scaled) const
     {
@@ -288,14 +454,14 @@ namespace ngfem
       for (int n = 0; n <= Order(); n++)
         {
           spec(n) = fac * L2Norm2(sh.CoefsN(n));
-          if (!scaled) fac *= sqr(scale);
+          if (!scaled) fac *= sqr(Scale());
         }
       return spec;
     }
     template <typename TARGET>
-    void Transform (MultiPole<TARGET,entry_type> & target, Vec<3> dist) const
+    void Transform (SphericalExpansion<TARGET,entry_type> & target, Vec<3> dist) const
     {
       if (target.SH().Order() < 0) return;
       if (SH().Order() < 0)
@@ -304,24 +470,16 @@ namespace ngfem
           return;
         }
-      static Timer t("mptool Transform "+ToString(typeid(RADIAL).name())+ToString(typeid(TARGET).name()));
-      RegionTimer reg(t);
+      // static Timer t("mptool Transform "+ToString(typeid(RADIAL).name())+ToString(typeid(TARGET).name()));
+      // RegionTimer reg(t);
-      double len = L2Norm(dist);
-      double theta, phi;
-      if (len < 1e-30)
-        theta = 0;
-      else
-        theta = acos (dist(2) / len);
-      if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
-        phi = 0;
-      else
-        phi = atan2(dist(1), dist(0));
+      auto [len, theta, phi] = SphericalCoordinates(dist);
-      MultiPole<RADIAL,entry_type> tmp(*this);
+      // SphericalExpansion<RADIAL,entry_type> tmp{*this};
+      SphericalExpansion<RADIAL,entry_type> tmp(Order(), kappa, rtyp);
+      tmp.SH().Coefs() = SH().Coefs();
       tmp.SH().RotateZ(phi);
       tmp.SH().RotateY(theta);
@@ -332,60 +490,213 @@ namespace ngfem
     }
     template <typename TARGET>
-    void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist) const
+    void TransformAdd (SphericalExpansion<TARGET,entry_type> & target, Vec<3> dist, bool atomic = false) const
     {
       if (SH().Order() < 0) return;
       if (target.SH().Order() < 0) return;
-      MultiPole<TARGET,entry_type> tmp{target};
+      SphericalExpansion<TARGET,entry_type> tmp{target};
       Transform(tmp, dist);
-      target.SH().Coefs() += tmp.SH().Coefs();
+      if (!atomic)
+        target.SH().Coefs() += tmp.SH().Coefs();
+      else
+        for (int j = 0; j < target.SH().Coefs().Size(); j++)
+          AtomicAdd(target.SH().Coefs()[j], tmp.SH().Coefs()[j]);
     }
     template <typename TARGET>
-    void ShiftZ (double z, MultiPole<TARGET,entry_type> & target);
+    void ShiftZ (double z, SphericalExpansion<TARGET,entry_type> & target);
+    template <typename TARGET>
+    void In2Out (SphericalExpansion<TARGET,entry_type> & target, double r) const
+    {
+      Vector<Complex> rad(Order()+1);
+      Vector<Complex> radout(target.Order()+1);
+      RADIAL::Eval(Order(), kappa, r, RTyp(), rad);
+      TARGET::Eval(target.Order(), kappa, r, target.RTyp(), radout);
+      target.SH().Coefs() = 0;
+      for (int j = 0; j <= std::min(Order(), target.Order()); j++)
+        target.SH().CoefsN(j) = rad(j)/radout(j) * SH().CoefsN(j);
+    }
   };
   // ***************** parameters ****************
+  /*
   static constexpr int MPOrder (double rho_kappa)
   {
-    return max (20, int(2*rho_kappa));
+    // return max (20, int(2*rho_kappa));
+    return 20+int(2*rho_kappa);
   }
   static constexpr int maxdirect = 100;
+  */
+  template <typename SCAL, auto S>
+  inline auto VecVector2Matrix (FlatVector<Vec<S,SCAL>> vec)
+  {
+    return FlatMatrixFixWidth<S,SCAL> (vec.Size(), vec.Data()->Data());
+  }
+  inline auto VecVector2Matrix (FlatVector<Complex> vec)
+  {
+    return FlatMatrixFixWidth<1,Complex> (vec.Size(), vec.Data());
+  }
   template <typename entry_type=Complex>
-  class SingularMLMultiPole
+  class SingularMLExpansion
   {
+    using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
     static Array<size_t> nodes_on_level;
+    struct RecordingSS
+    {
+      const SphericalExpansion<Singular,entry_type> * mp_source;
+      SphericalExpansion<Singular,entry_type> * mp_target;
+      Vec<3> dist;
+      double len, theta, phi;
+      bool flipz;
+    public:
+      RecordingSS() = default;
+      RecordingSS (const SphericalExpansion<Singular,entry_type> * amp_source,
+                   SphericalExpansion<Singular,entry_type> * amp_target,
+                   Vec<3> adist)
+        : mp_source(amp_source), mp_target(amp_target), dist(adist)
+      {
+        std::tie(len, theta, phi) = SphericalCoordinates(adist);
+        // flipz = false;
+        flipz = theta > M_PI/2;
+        if (flipz) theta = M_PI-theta;
+      }
+    };
+    static void ProcessBatchSS(FlatArray<RecordingSS*> batch, double len, double theta) {
+      constexpr int vec_length = VecLength<entry_type>;
+      int batch_size = batch.Size();
+      int N = batch_size * vec_length;
+      // *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(entry_type).name() << ", len = " << len << ", theta = " << theta << endl;
+      if (N <= 1 || batch_size <= 1) {
+        for (auto* rec : batch) {
+          rec->mp_source->TransformAdd(*rec->mp_target, rec->dist, true);
+        }
+      }
+      else if (N <= 3) {
+        ProcessVectorizedBatchSS<3, vec_length>(batch, len, theta);
+      }
+      else if (N <= 4) {
+        ProcessVectorizedBatchSS<4, vec_length>(batch, len, theta);
+      }
+      else if (N <= 6) {
+        ProcessVectorizedBatchSS<6, vec_length>(batch, len, theta);
+      }
+      else if (N <= 12) {
+        ProcessVectorizedBatchSS<12, vec_length>(batch, len, theta);
+      }
+      else if (N <= 24) {
+        ProcessVectorizedBatchSS<24, vec_length>(batch, len, theta);
+      }
+      else if (N <= 48) {
+        ProcessVectorizedBatchSS<48, vec_length>(batch, len, theta);
+      }
+      else if (N <= 96) {
+        ProcessVectorizedBatchSS<96, vec_length>(batch, len, theta);
+      }
+      else if (N <= 192) {
+        ProcessVectorizedBatchSS<192, vec_length>(batch, len, theta);
+      }
+      else {
+        // Split large batches
+        ProcessBatchSS(batch.Range(0, 192 / vec_length), len, theta);
+        ProcessBatchSS(batch.Range(192 / vec_length, batch_size), len, theta);
+      }
+    }
+    template<int N, int vec_length>
+    static void ProcessVectorizedBatchSS(FlatArray<RecordingSS*> batch, double len, double theta) {
+      // *testout << "Processing vectorized S->S batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
+      double kappa = batch[0]->mp_source->Kappa();
+      int so = batch[0]->mp_source->Order();
+      int to = batch[0]->mp_target->Order();
+      SphericalExpansion<Singular, Vec<N,Complex>> vec_source(so, kappa, batch[0]->mp_source->RTyp());
+      SphericalExpansion<Singular, Vec<N,Complex>> vec_target(to, kappa, batch[0]->mp_target->RTyp());
+      // Copy multipoles into vectorized multipole
+      for (int i = 0; i < batch.Size(); i++)
+        {
+          auto source_i = VecVector2Matrix (batch[i]->mp_source->SH().Coefs());
+          auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
+          batch[i]->mp_source->SH().RotateZFlip(batch[i]->phi, batch[i]->flipz,
+                                            [source_i, source_mati] (size_t ii, Complex factor)
+                                            {
+                                              source_mati.Row(ii) = factor * source_i.Row(ii);
+                                            });
+        }
+      vec_source.SH().RotateY(theta, vec_source.SH().Order() >= 100);
+      vec_source.ShiftZ(-len, vec_target);
+      vec_target.SH().RotateY(-theta, vec_target.SH().Order() >= 100);
+      // Copy vectorized multipole into individual multipoles
+      for (int i = 0; i < batch.Size(); i++)
+        {
+          auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
+          auto target_mati = VecVector2Matrix (batch[i]->mp_target->SH().Coefs());
+          batch[i]->mp_target->SH().RotateZFlip(-batch[i]->phi, batch[i]->flipz,
+                                      [source_mati, target_mati] (size_t ii, Complex factor)
+                                      {
+                                        AtomicAdd (target_mati.Row(ii), factor * source_mati.Row(ii));
+                                      });
+      }
+    }
     struct Node
     {
       Vec<3> center;
       double r;
       int level;
       std::array<unique_ptr<Node>,8> childs;
-      MultiPole<MPSingular, entry_type> mp;
+      SphericalExpansion<Singular, entry_type> mp;
       Array<tuple<Vec<3>, entry_type>> charges;
       Array<tuple<Vec<3>, Vec<3>, entry_type>> dipoles;
+      Array<tuple<Vec<3>, entry_type, Vec<3>, entry_type>> chargedipoles;
       Array<tuple<Vec<3>, Vec<3>, Complex,int>> currents;
+      using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
+      Array<tuple<Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_charges;
+      Array<tuple<Vec<3,SIMD<double,FMM_SW>>, Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_dipoles;
+      Array<tuple<Vec<3,SIMD<double,FMM_SW>>, simd_entry_type,
+                  Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_chargedipoles;
       int total_sources;
+      const FMM_Parameters & fmm_params;
+      std::mutex node_mutex;
+      atomic<bool> have_childs{false};
-      Node (Vec<3> acenter, double ar, int alevel, int order, double kappa)
-        : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*kappa), kappa, min(1.0, r*kappa))
-          // : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*kappa), kappa, 1.0)
+      Node (Vec<3> acenter, double ar, int alevel, double akappa, const FMM_Parameters & afmm_params)
+      // : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*akappa), akappa, ar), fmm_params(afmm_params)
+        : center(acenter), r(ar), level(alevel), mp(afmm_params.minorder+2*ar*akappa, akappa, ar), fmm_params(afmm_params)
       {
-        // cout << "singml, add node, level = " << level << endl;
         if (level < nodes_on_level.Size())
           nodes_on_level[level]++;
       }
+      int GetChildNum (Vec<3> x) const
+      {
+        int childnum  = 0;
+        if (x(0) > center(0)) childnum += 1;
+        if (x(1) > center(1)) childnum += 2;
+        if (x(2) > center(2)) childnum += 4;
+        return childnum;
+      }
       void CreateChilds()
       {
         if (childs[0]) throw Exception("have already childs");
@@ -395,78 +706,127 @@ namespace ngfem
             cc(0) += (i&1) ? r/2 : -r/2;
             cc(1) += (i&2) ? r/2 : -r/2;
             cc(2) += (i&4) ? r/2 : -r/2;
-            childs[i] = make_unique<Node> (cc, r/2, level+1, max(mp.SH().Order()/2, 8), mp.Kappa());
+            childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa(), fmm_params);
           }
+        have_childs = true;
       }
+      void SendSourcesToChilds()
+      {
+        CreateChilds();
+        for (auto [x,c] : charges)
+          AddCharge (x,c);
+        for (auto [x,d,c] : dipoles)
+          AddDipole (x,d,c);
+        for (auto [x,c,d,c2] : chargedipoles)
+          AddChargeDipole (x,c,d,c2);
+        for (auto [sp,ep,j,num] : currents)
+          AddCurrent (sp,ep,j,num);
+        charges.DeleteAll();
+        dipoles.DeleteAll();
+        chargedipoles.DeleteAll();
+        currents.DeleteAll();
+      }
       void AddCharge (Vec<3> x, entry_type c)
       {
-        if (childs[0])
+        if (have_childs) // quick check without locking
           {
             // directly send to childs:
-            int childnum  = 0;
-            if (x(0) > center(0)) childnum += 1;
-            if (x(1) > center(1)) childnum += 2;
-            if (x(2) > center(2)) childnum += 4;
+            int childnum = GetChildNum(x);
             childs[childnum] -> AddCharge(x, c);
             return;
           }
-        charges.Append( tuple{x,c} );
+        lock_guard<mutex> guard(node_mutex);
-        if (r*mp.Kappa() < 1e-8) return;
-        if (charges.Size() < maxdirect && r*mp.Kappa() < 1)
-          return;
+        if (have_childs) // test again after locking
+          {
+            int childnum  = GetChildNum(x);
+            childs[childnum] -> AddCharge(x, c);
+            return;
+          }
-        CreateChilds();
+        charges.Append( tuple{x,c} );
-        for (auto [x,c] : charges)
-          AddCharge (x,c);
-        for (auto [x,d,c] : dipoles)
-          AddDipole (x,d,c);
-        for (auto [sp,ep,j,num] : currents)
-          AddCurrent (sp,ep,j,num);
+        // if (r*mp.Kappa() < 1e-8) return;
+        if (level > 20) return;
+        if (charges.Size() < fmm_params.maxdirect && r*mp.Kappa() < 5)
+          return;
-        charges.SetSize0();
-        dipoles.SetSize0();
-        currents.SetSize0();
+        SendSourcesToChilds();
       }
       void AddDipole (Vec<3> x, Vec<3> d, entry_type c)
       {
-        if (childs[0])
+        if (have_childs)
           {
             // directly send to childs:
-            int childnum  = 0;
-            if (x(0) > center(0)) childnum += 1;
-            if (x(1) > center(1)) childnum += 2;
-            if (x(2) > center(2)) childnum += 4;
+            int childnum = GetChildNum(x);
             childs[childnum] -> AddDipole(x, d, c);
             return;
           }
-        dipoles.Append (tuple{x,d,c});
+        lock_guard<mutex> guard(node_mutex);
-        if (dipoles.Size() < maxdirect || r < 1e-8)
+        if (have_childs)
+          {
+            // directly send to childs:
+            int childnum = GetChildNum(x);
+            childs[childnum] -> AddDipole(x, d, c);
+            return;
+          }
+        dipoles.Append (tuple{x,d,c});
+        if (level > 20) return;
+        if (dipoles.Size() < fmm_params.maxdirect)
           return;
+        SendSourcesToChilds();
+      }
+      void AddChargeDipole (Vec<3> x, entry_type c, Vec<3> dir, entry_type c2)
+      {
+        if (have_childs)
+          {
+            // directly send to childs:
+            int childnum = GetChildNum(x);
+            childs[childnum] -> AddChargeDipole(x, c, dir, c2);
+            return;
+          }
+        lock_guard<mutex> guard(node_mutex);
+        if (have_childs)
+          {
+            // directly send to childs:
+            int childnum = GetChildNum(x);
+            childs[childnum] -> AddChargeDipole(x, c, dir, c2);
+            return;
+          }
-        CreateChilds();
+        chargedipoles.Append (tuple{x,c,dir,c2});
-        for (auto [x,c] : charges)
-          AddCharge (x,c);
-        for (auto [x,d,c] : dipoles)
-          AddDipole (x,d,c);
-        for (auto [sp,ep,j,num] : currents)
-          AddCurrent (sp,ep,j,num);
+        if (chargedipoles.Size() < fmm_params.maxdirect || r < 1e-8)
+          return;
-        charges.SetSize0();
-        dipoles.SetSize0();
-        currents.SetSize0();
+        SendSourcesToChilds();
+        /*
+        AddCharge (x, c);
+        AddDipole (x, dir, c2);
+        */
       }
+      // not parallel yet
       void AddCurrent (Vec<3> sp, Vec<3> ep, Complex j, int num)
       {
         if (childs[0])
@@ -475,7 +835,7 @@ namespace ngfem
             Array<double> split;
             split.Append(0);
             for (int i = 0; i < 3; i++)
-              if (sp(i) < center(i) != ep(i) < center(i))
+              if ((sp(i) < center(i)) != (ep(i) < center(i)))
                 split += (center(i)-sp(i)) / (ep(i)-sp(i));  // segment cuts i-th coordinate plane
             split.Append(1);
             BubbleSort(split);
@@ -496,9 +856,15 @@ namespace ngfem
                 }
             return;
           }
         currents.Append (tuple{sp,ep,j,num});
+        // if (currents.Size() < maxdirect || r < 1e-8)
+        if (currents.Size() < 4 || r < 1e-8)
+          return;
+        SendSourcesToChilds();
+        /*
         // if (currents.Size() < maxdirect || r < 1e-8)
         if (currents.Size() < 4 || r < 1e-8)
           return;
@@ -515,6 +881,7 @@ namespace ngfem
         charges.SetSize0();
         dipoles.SetSize0();
         currents.SetSize0();
+        */
       }
@@ -530,47 +897,169 @@ namespace ngfem
             return sum;
           }
-        for (auto [x,c] : charges)
-          if (double rho = L2Norm(p-x); rho > 0)
-            sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
-        for (auto [x,d,c] : dipoles)
-          if (double rho = L2Norm(p-x); rho > 0)
-            {
-              Vec<3> drhodp = 1.0/rho * (p-x);
-              Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
-                (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
-              sum += dGdrho * InnerProduct(drhodp, d) * c;
-            }
-        for (auto [sp,ep,j,num] : currents)
+        if (simd_charges.Size())
           {
-            // should use explizit formula instead ...
+            // static Timer t("mptool singmp, evaluate, simd charges"); RegionTimer r(t);
+            // t.AddFlops (charges.Size());
-            Vec<3> tau = ep-sp;
-            Vec<3> tau_num = 1.0/num *  tau;
-            for (int i = 0; i < num; i++)
+            simd_entry_type vsum{0.0};
+            if (mp.Kappa() < 1e-12)
               {
-                Vec<3> x = sp+(i+0.5)*tau_num;
-                if (double rho = L2Norm(p-x); rho > 0)
+                for (auto [x,c] : simd_charges)
                   {
-                    Vec<3> drhodp = 1.0/rho * (p-x);
-                    Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
-                      (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
-                    if constexpr (std::is_same<entry_type, Vec<3,Complex>>())
-                      sum += j*dGdrho * Cross(drhodp, tau_num);
+                    auto rho = L2Norm(p-x);
+                    auto kernel = 1/(4*M_PI)/rho;
+                    kernel = If(rho > 0.0, kernel, SIMD<double,FMM_SW>(0.0));
+                    vsum += kernel * c;
+                    /*
+                    auto rho2 = L2Norm2(p-x);
+                    auto kernel = (1/(4*M_PI)) * rsqrt(rho2);
+                    kernel = If(rho2 > 0.0, kernel, SIMD<double,FMM_SW>(0.0));
+                    vsum += kernel * c;
+                    */
                   }
               }
-          }
+            else if (mp.Kappa() < 1e-8)
+              for (auto [x,c] : simd_charges)
+                {
+                  auto rho = L2Norm(p-x);
+                  auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW> (1,rho*mp.Kappa()) / rho;
+                  kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
+                  vsum += kernel * c;
+                }
+            else
+              for (auto [x,c] : simd_charges)
+                {
+                  auto rho = L2Norm(p-x);
+                  auto [si,co] = sincos(rho*mp.Kappa());
+                  auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) / rho;
+                  kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
+                  vsum += kernel * c;
+                }
+            sum += HSum(vsum);
+          }
+        else
+          {
+            if (mp.Kappa() < 1e-8)
+              {
+                for (auto [x,c] : charges)
+                  if (double rho = L2Norm(p-x); rho > 0)
+                    sum += (1/(4*M_PI))*Complex(1,rho*mp.Kappa()) / rho * c;
+              }
+            else
+              for (auto [x,c] : charges)
+                if (double rho = L2Norm(p-x); rho > 0)
+                  sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
+          }
+        if (simd_dipoles.Size())
+          {
+            // static Timer t("mptool singmp, evaluate, simd dipoles"); RegionTimer r(t);
+            simd_entry_type vsum{0.0};
+            for (auto [x,d,c] : simd_dipoles)
+              {
+                auto rho = L2Norm(p-x);
+                auto drhodp = (1.0/rho) * (p-x);
+                auto [si,co] = sincos(rho*mp.Kappa());
+                auto dGdrho = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) *
+                  (-1.0/(rho*rho) + SIMD<Complex,FMM_SW>(0, mp.Kappa())/rho);
+                auto kernel = dGdrho * InnerProduct(drhodp, d);
+                kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
+                vsum += kernel * c;
+              }
+            sum += HSum(vsum);
+          }
+        else
+          {
+            for (auto [x,d,c] : dipoles)
+              if (double rho = L2Norm(p-x); rho > 0)
+                {
+                  Vec<3> drhodp = 1.0/rho * (p-x);
+                  Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
+                    (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
+                  sum += dGdrho * InnerProduct(drhodp, d) * c;
+                }
+          }
+      if (simd_chargedipoles.Size())
+        {
+          // static Timer t("mptool singmp, evaluate, simd chargedipoles"); RegionTimer r(t);
+          // t.AddFlops (simd_chargedipoles.Size()*FMM_SW);
+          simd_entry_type vsum{0.0};
+          for (auto [x,c,d,c2] : simd_chargedipoles)
+            {
+              auto rho = L2Norm(p-x);
+              auto rhokappa = rho*mp.Kappa();
+              auto invrho = If(rho>0.0, 1.0/rho, SIMD<double,FMM_SW>(0.0));
+              auto [si,co] = sincos(rhokappa);
+              auto kernelc = (1/(4*M_PI))*invrho*SIMD<Complex,FMM_SW>(co,si);
+              vsum += kernelc * c;
+              auto kernel =
+                invrho*invrho * InnerProduct(p-x, d) *
+                kernelc * SIMD<Complex,FMM_SW>(-1.0, rhokappa);
+              vsum += kernel * c2;
+            }
+          sum += HSum(vsum);
+        }
+      else
+        {
+          // static Timer t("mptool singmp, evaluate, chargedipoles"); RegionTimer r(t);
+          // t.AddFlops (chargedipoles.Size());
+          for (auto [x,c,d,c2] : chargedipoles)
+            if (double rho = L2Norm(p-x); rho > 0)
+              {
+                sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
+                Vec<3> drhodp = 1.0/rho * (p-x);
+                Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
+                  (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
+                sum += dGdrho * InnerProduct(drhodp, d) * c2;
+              }
+        }
+        for (auto [sp,ep,j,num] : currents)
+          {
+            // should use explizit formula instead ...
+            Vec<3> tau = ep-sp;
+            Vec<3> tau_num = 1.0/num *  tau;
+            for (int i = 0; i < num; i++)
+              {
+                Vec<3> x = sp+(i+0.5)*tau_num;
+                if (double rho = L2Norm(p-x); rho > 0)
+                  {
+                    Vec<3> drhodp = 1.0/rho * (p-x);
+                    Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
+                      (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
+                    if constexpr (std::is_same<entry_type, Vec<3,Complex>>())
+                      sum += j*dGdrho * Cross(drhodp, tau_num);
+                  }
+              }
+          }
         return sum;
       }
       entry_type EvaluateDeriv(Vec<3> p, Vec<3> d) const
       {
-        entry_type sum = 0;
+        entry_type sum{0.0};
         if (childs[0])
         {
             for (auto & child : childs)
@@ -579,7 +1068,16 @@ namespace ngfem
         }
         if (dipoles.Size())
-            throw Exception("EvaluateDeriv not implemented for dipoles in SingularMLMultiPole");
+          {
+            static int cnt = 0;
+            cnt++;
+            if (cnt < 3)
+              cout << "we know what we do - evaluateDeriv not implemented for dipoles in SingularMLExpansion" << endl;
+            // return sum;
+            // throw Exception("EvaluateDeriv not implemented for dipoles in SingularMLExpansion");
+          }
+        if (chargedipoles.Size())
+            throw Exception("EvaluateDeriv not implemented for dipoles in SingularMLExpansion");
         for (auto [x,c] : charges)
           if (double rho = L2Norm(p-x); rho > 0)
@@ -594,7 +1092,7 @@ namespace ngfem
       void CalcTotalSources()
       {
-        total_sources = charges.Size() + dipoles.Size();
+        total_sources = charges.Size() + dipoles.Size() + chargedipoles.Size();
         for (auto & child : childs)
           if (child)
             {
@@ -603,46 +1101,111 @@ namespace ngfem
             }
       }
-      void CalcMP()
+      // Builds the singular expansion of this node from the bottom of the tree upwards.
+      //   recording        - if non-null, child->parent transforms are appended to this
+      //                      array as RecordingSS entries instead of being executed here.
+      //   nodes_to_process - if non-null, leaf nodes that hold sources are appended to
+      //                      this array instead of having their sources added to mp here.
+      // With both pointers null the complete computation is done inside this call.
+      void CalcMP(Array<RecordingSS> * recording, Array<Node*> * nodes_to_process)
       {
-        mp.SH().Coefs() = 0.0;
+        // mp.SH().Coefs() = 0.0;
         if (childs[0])
           {
-            if (total_sources < 1000)
+            // recurse serially for small subtrees, and always when recording
+            // NOTE(review): presumably because appending to *recording is not thread-safe - confirm
+            if (total_sources < 1000 || recording)
               for (auto & child : childs)
-                child->CalcMP();
+                child->CalcMP(recording, nodes_to_process);
             else
               ParallelFor (8, [&] (int nr)
                            {
-                             childs[nr] -> CalcMP();
+                             childs[nr] -> CalcMP(recording, nodes_to_process);
                            });
-            for (auto & child : childs)
-              child->mp.TransformAdd(mp, center-child->center);
+            // child -> parent: record the transform when recording is requested and the
+            // child has coefficients, otherwise apply it immediately
+            for (auto & child : childs){
+              if (recording && child->mp.SH().Coefs().Size() > 0)
+                *recording += RecordingSS(&child->mp, &mp, center-child->center);
+              else
+                child->mp.TransformAdd(mp, center-child->center);
+            }
           }
         else
           {
-            if (charges.Size()+dipoles.Size()+currents.Size() == 0)
+            // leaf without any sources: replace mp by an expansion of order -1 and stop
+            if (charges.Size()+dipoles.Size()+chargedipoles.Size()+currents.Size() == 0)
               {
-                mp = MultiPole<MPSingular,entry_type> (-1, mp.Kappa());
+                mp = SphericalExpansion<Singular,entry_type> (-1, mp.Kappa(), 1.);
                 return;
               }
-            for (auto [x,c] : charges)
-              mp.AddCharge (x-center,c);
-            for (auto [x,d,c] : dipoles)
-              mp.AddDipole (x-center, d, c);
+            // make simd charges, comment this block for testing ...
+            // charges are packed in groups of FMM_SW; the array size is rounded up
+            simd_charges.SetSize( (charges.Size()+FMM_SW-1)/FMM_SW);
+            size_t i = 0, ii = 0;
+            for ( ; i+FMM_SW <= charges.Size(); i+=FMM_SW, ii++)
+              {
+                std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
+                for (int j = 0; j < FMM_SW; j++) ca[j] = charges[i+j];
+                simd_charges[ii] = MakeSimd(ca);
+              }
+            if (i < charges.Size())
+              {
+                // incomplete last group: unused lanes repeat the position of the first
+                // remaining charge and carry a zero weight
+                std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
+                int j = 0;
+                for ( ; i+j < charges.Size(); j++) ca[j] = charges[i+j];
+                for ( ; j < FMM_SW; j++) ca[j] = tuple( get<0>(ca[0]), entry_type{0.0} );
+                simd_charges[ii] = MakeSimd(ca);
+              }
+            // same packing for the dipoles (position, direction, weight)
+            simd_dipoles.SetSize( (dipoles.Size()+FMM_SW-1)/FMM_SW);
+            i = 0, ii = 0;
+            for ( ; i+FMM_SW <= dipoles.Size(); i+=FMM_SW, ii++)
+              {
+                std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
+                for (int j = 0; j < FMM_SW; j++) di[j] = dipoles[i+j];
+                simd_dipoles[ii] = MakeSimd(di);
+              }
+            if (i < dipoles.Size())
+              {
+                std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
+                int j = 0;
+                for ( ; i+j < dipoles.Size(); j++) di[j] = dipoles[i+j];
+                for ( ; j < FMM_SW; j++) di[j] = tuple( get<0>(di[0]), get<1>(di[0]), entry_type{0.0} );
+                simd_dipoles[ii] = MakeSimd(di);
+              }
-            for (auto [sp,ep,j,num] : currents)
-              mp.AddCurrent (sp-center, ep-center, j, num);
+            // same packing for the combined sources (position, charge weight, direction, dipole weight)
+            simd_chargedipoles.SetSize( (chargedipoles.Size()+FMM_SW-1)/FMM_SW);
+            i = 0, ii = 0;
+            for ( ; i+FMM_SW <= chargedipoles.Size(); i+=FMM_SW, ii++)
+              {
+                std::array<tuple<Vec<3>,entry_type,Vec<3>,entry_type>, FMM_SW> di;
+                for (int j = 0; j < FMM_SW; j++) di[j] = chargedipoles[i+j];
+                simd_chargedipoles[ii] = MakeSimd(di);
+              }
+            if (i < chargedipoles.Size())
+              {
+                std::array<tuple<Vec<3>,entry_type,Vec<3>,entry_type>, FMM_SW> di;
+                int j = 0;
+                for ( ; i+j < chargedipoles.Size(); j++) di[j] = chargedipoles[i+j];
+                for ( ; j < FMM_SW; j++) di[j] = tuple( get<0>(di[0]), entry_type{0.0}, get<2>(di[0]), entry_type{0.0} );
+                simd_chargedipoles[ii] = MakeSimd(di);
+              }
+            // either hand this leaf to the caller for later processing, or add all
+            // sources (positions relative to center) to mp right away
+            if (nodes_to_process)
+                *nodes_to_process += this;
+            else {
+              for (auto [x,c] : charges)
+                mp.AddCharge (x-center,c);
+              for (auto [x,d,c] : dipoles)
+                mp.AddDipole (x-center, d, c);
+              for (auto [x,c,d,c2] : chargedipoles)
+                mp.AddChargeDipole (x-center, c, d, c2);
+              for (auto [sp,ep,j,num] : currents)
+                mp.AddCurrent (sp-center, ep-center, j, num);
+            }
           }
       }
       entry_type EvaluateMP(Vec<3> p) const
       {
-        if (charges.Size() || dipoles.Size())
+        if (charges.Size() || dipoles.Size() || chargedipoles.Size())
           return Evaluate(p);
         if (L2Norm(p-center) > 3*r)
@@ -662,7 +1225,7 @@ namespace ngfem
         // cout << "EvaluateMPDeriv Singular, p = " << p << ", d = " << d << ", r = " << r << ", center = " << center <<  endl;
         // cout << "Norm: " << L2Norm(p-center) << " > " << 3*r << endl;
         // cout << "charges.Size() = " << charges.Size() << ", dipoles.Size() = " << dipoles.Size() << endl;
-        if (charges.Size() || dipoles.Size() || !childs[0])
+        if (charges.Size() || dipoles.Size() || chargedipoles.Size() || !childs[0])
           return EvaluateDeriv(p, d);
         if (L2Norm(p-center) > 3*r)
@@ -685,6 +1248,8 @@ namespace ngfem
           ost << "xi = " << x << ", ci = " << c << endl;
         for (auto [x,d,c] : dipoles)
           ost << "xi = " << x << ", di = " << d << ", ci = " << c << endl;
+        for (auto [x,c,d,c2] : chargedipoles)
+          ost << "xi = " << x << ", c = " << c << ", di = " << d << ", ci = " << c2 << endl;
         for (int i = 0; i < 8; i++)
           if (childs[i]) childs[i] -> Print (ost, i);
@@ -707,14 +1272,23 @@ namespace ngfem
             num += ch->NumCoefficients();
         return num;
       }
+      // Pre-order walk over the subtree rooted at this node: func is called for
+      // this node first and afterwards, recursively, for every child that exists.
+      void TraverseTree (const std::function<void(Node&)> & func)
+      {
+        func(*this);
+        for (int i = 0; i < 8; i++)
+          if (childs[i])
+            childs[i] -> TraverseTree(func);
+      }
     };
-    Node root;
+    FMM_Parameters fmm_params;
+    Node root;
     bool havemp = false;
   public:
-    SingularMLMultiPole (Vec<3> center, double r, int order, double kappa)
-      : root(center, r, 0, order, kappa)
+    SingularMLExpansion (Vec<3> center, double r, double kappa, FMM_Parameters _params = FMM_Parameters())
+      : fmm_params(_params), root(center, r, 0, kappa, fmm_params)
     {
       nodes_on_level = 0;
       nodes_on_level[0] = 1;
@@ -722,16 +1296,21 @@ namespace ngfem
+    // Returns the Kappa() value stored in the root node's expansion.
     double Kappa() const { return root.mp.Kappa(); }
-    void AddCharge(Vec<3> x, Complex c)
+    // Registers a point charge with weight c at position x; the work is done by the root node.
+    void AddCharge(Vec<3> x, entry_type c)
     {
       root.AddCharge(x, c);
     }
-    void AddDipole(Vec<3> x, Vec<3> d, Complex c)
+    // Registers a dipole at position x with direction d and weight c; the work is done by the root node.
+    void AddDipole(Vec<3> x, Vec<3> d, entry_type c)
     {
       root.AddDipole(x, d, c);
     }
+    // Registers a combined source at position x: a charge with weight c together with
+    // a dipole of direction dir and weight c2. The work is done by the root node.
+    void AddChargeDipole(Vec<3> x, entry_type c, Vec<3> dir, entry_type c2)
+    {
+      root.AddChargeDipole(x, c, dir, c2);
+    }
     void AddCurrent (Vec<3> sp, Vec<3> ep, Complex j, int num)
     {
       if constexpr (!std::is_same<entry_type, Vec<3,Complex>>())
@@ -756,7 +1335,7 @@ namespace ngfem
         }
       */
     }
     void Print (ostream & ost) const
     {
       root.Print(ost);
@@ -775,6 +1354,10 @@ namespace ngfem
     void CalcMP()
     {
       static Timer t("mptool compute singular MLMP"); RegionTimer rg(t);
+      static Timer ts2mp("mptool compute singular MLMP - source2mp");
+      static Timer tS2S("mptool compute singular MLMP - S->S");
+      static Timer trec("mptool comput singular recording");
+      static Timer tsort("mptool comput singular sort");
       /*
       int maxlevel = 0;
@@ -784,9 +1367,96 @@ namespace ngfem
       for (int i = 0; i <= maxlevel; i++)
         cout << "sing " <<  i << ": " << nodes_on_level[i] << endl;
       */
       root.CalcTotalSources();
-      root.CalcMP();
+      if constexpr (false)
+        // direct evaluation of S->S
+        root.CalcMP(nullptr, nullptr);
+      else
+        {
+          Array<RecordingSS> recording;
+          Array<Node*> nodes_to_process;
+          {
+            RegionTimer reg(trec);
+            root.CalcMP(&recording, &nodes_to_process);
+          }
+          {
+            RegionTimer rs2mp(ts2mp);
+            ParallelFor(nodes_to_process.Size(), [&](int i)
+            {
+              auto node = nodes_to_process[i];
+              for (auto [x,c]: node->charges)
+                node->mp.AddCharge(x-node->center, c);
+              for (auto [x,d,c]: node->dipoles)
+                node->mp.AddDipole(x-node->center, d, c);
+              for (auto [x,c,d,c2]: node->chargedipoles)
+                node->mp.AddChargeDipole(x-node->center, c, d, c2);
+              for (auto [sp,ep,j,num]: node->currents)
+                node->mp.AddCurrent(sp-node->center, ep-node->center, j, num);
+            }, TasksPerThread(4));
+          }
+          {
+            RegionTimer reg(tsort);
+            QuickSort (recording, [] (auto & a, auto & b)
+            {
+              if (a.len < (1-1e-8) * b.len) return true;
+              if (a.len > (1+1e-8) * b.len) return false;
+              return a.theta < b.theta;
+            });
+          }
+          double current_len = -1e100;
+          double current_theta = -1e100;
+          Array<RecordingSS*> current_batch;
+          Array<Array<RecordingSS*>> batch_group;
+          Array<double> group_lengths;
+          Array<double> group_thetas;
+          for (auto & record : recording)
+            {
+              bool len_changed = fabs(record.len - current_len) > 1e-8;
+              bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
+              if ((len_changed || theta_changed) && current_batch.Size() > 0) {
+                batch_group.Append(current_batch);
+                group_lengths.Append(current_len);
+                group_thetas.Append(current_theta);
+                current_batch.SetSize(0);
+              }
+              current_len = record.len;
+              current_theta = record.theta;
+              current_batch.Append(&record);
+            }
+          if (current_batch.Size() > 0) {
+            batch_group.Append(current_batch);
+            group_lengths.Append(current_len);
+            group_thetas.Append(current_theta);
+          }
+          {
+            RegionTimer rS2S(tS2S);
+            // ParallelFor(batch_group.Size(), [&](int i) {
+            for (int i = 0; i < batch_group.Size(); i++){
+              // *testout << "Processing batch " << i << " of size " << batch_group[i].Size() << ", with len = " << group_lengths[i] << ", theta = " << group_thetas[i] << endl;
+              int chunk_size = 24;
+              if (batch_group[i].Size() < chunk_size)
+                ProcessBatchSS(batch_group[i], group_lengths[i], group_thetas[i]);
+              else
+                ParallelForRange(IntRange(batch_group[i].Size()), [&](IntRange range) {
+                  auto sub_batch = batch_group[i].Range(range.First(), range.Next());
+                  ProcessBatchSS(sub_batch, group_lengths[i], group_thetas[i]);
+                }, TasksPerThread(4));
+            }
+          }
+        }
+      // cout << "have singular:" << endl;
+      // PrintStatistics (cout);
       havemp = true;
     }
@@ -798,23 +1468,198 @@ namespace ngfem
         return root.Evaluate(p);
     }
+    // Writes a summary of the tree to ost: depth, number of nodes and, per level,
+    // the node count, the largest expansion order and the number of stored
+    // coefficients, followed by the total coefficient memory in MB.
+    void PrintStatistics (ostream & ost)
+    {
+      int levels = 0;
+      int cnt = 0;
+      root.TraverseTree( [&](Node & node) {
+        levels = max(levels, node.level);
+        cnt++;
+      });
+      ost << "levels: " << levels << endl;
+      ost << "nodes: " << cnt << endl;
+
+      Array<int> num_on_level(levels+1);
+      Array<int> order_on_level(levels+1);
+      Array<size_t> coefs_on_level(levels+1);
+      num_on_level = 0;
+      order_on_level = 0;
+      // the counters are accumulated with += below, so they must start at zero
+      // just like the two arrays above
+      coefs_on_level = size_t(0);
+      root.TraverseTree( [&](Node & node) {
+        num_on_level[node.level]++;
+        order_on_level[node.level] = max(order_on_level[node.level],node.mp.Order());
+        coefs_on_level[node.level] += node.mp.SH().Coefs().Size();
+      });
+
+      // the whole report goes to the stream passed by the caller
+      ost << "num on level" << endl;
+      for (int i = 0; i < num_on_level.Size(); i++)
+        ost << i << ": " << num_on_level[i] << ", order = " << order_on_level[i] << ", coefs " << coefs_on_level[i] << endl;
+
+      size_t totcoefs = 0;
+      for (auto n : coefs_on_level)
+        totcoefs += n;
+      ost << "total mem in coefs: " << sizeof(entry_type)*totcoefs / sqr(1024) << " MB" << endl;
+    }
     template <typename entry_type2>
-    friend class RegularMLMultiPole;
+    friend class RegularMLExpansion;
   };
+  // Stream output for a singular multilevel expansion: delegates to its Print method
+  // and returns the stream.
   template <typename entry_type>
-  inline ostream & operator<< (ostream & ost, const SingularMLMultiPole<entry_type> & mlmp)
+  inline ostream & operator<< (ostream & ost, const SingularMLExpansion<entry_type> & mlmp)
   {
     mlmp.Print(ost);
     return ost;
   }
+  // *********************************** Regular multilevel Expansion
   template <typename elem_type=Complex>
-  class NGS_DLL_HEADER RegularMLMultiPole
+  class NGS_DLL_HEADER RegularMLExpansion
   {
     static Array<size_t> nodes_on_level;
+    // One postponed singular -> regular transform: the expansion *mpS is to be
+    // transformed into *mpR using the offset dist. len, theta and phi hold the
+    // spherical coordinates of dist as returned by SphericalCoordinates.
+    struct RecordingRS
+    {
+      const SphericalExpansion<Singular,elem_type> * mpS;
+      SphericalExpansion<Regular,elem_type> * mpR;
+      Vec<3> dist;
+      double len, theta, phi;
+
+      RecordingRS() = default;
+
+      RecordingRS (const SphericalExpansion<Singular,elem_type> * ampS,
+                   SphericalExpansion<Regular,elem_type> * ampR,
+                   Vec<3> adist)
+        : mpS(ampS), mpR(ampR), dist(adist)
+      {
+        auto [dist_len, dist_theta, dist_phi] = SphericalCoordinates(dist);
+        len = dist_len;
+        theta = dist_theta;
+        phi = dist_phi;
+      }
+    };
+    // Executes the recorded singular -> regular transforms in batch.
+    // A batch of at most one record (or at most one scalar column) is handled
+    // record by record with TransformAdd and the record's own dist. Otherwise the
+    // batch is passed, together with len and theta, to the smallest
+    // ProcessVectorizedBatchRS instantiation whose width N covers
+    // batch.Size()*VecLength<elem_type> columns. Batches wider than 192 columns
+    // are cut into chunks of 192/vec_length records that are processed by
+    // recursive calls inside a ParallelFor.
+    static void ProcessBatchRS(FlatArray<RecordingRS*> batch, double len, double theta) {
+      constexpr int vec_length = VecLength<elem_type>;
+      int num_records = batch.Size();
+      int columns = num_records * vec_length;
+
+      if (columns <= 1 || num_records <= 1) {
+        for (auto* rec : batch)
+          rec->mpS->TransformAdd(*rec->mpR, rec->dist);
+        return;
+      }
+
+      if (columns <= 3)   { ProcessVectorizedBatchRS<3, vec_length>(batch, len, theta);   return; }
+      if (columns <= 4)   { ProcessVectorizedBatchRS<4, vec_length>(batch, len, theta);   return; }
+      if (columns <= 6)   { ProcessVectorizedBatchRS<6, vec_length>(batch, len, theta);   return; }
+      if (columns <= 12)  { ProcessVectorizedBatchRS<12, vec_length>(batch, len, theta);  return; }
+      if (columns <= 24)  { ProcessVectorizedBatchRS<24, vec_length>(batch, len, theta);  return; }
+      if (columns <= 48)  { ProcessVectorizedBatchRS<48, vec_length>(batch, len, theta);  return; }
+      if (columns <= 96)  { ProcessVectorizedBatchRS<96, vec_length>(batch, len, theta);  return; }
+      if (columns <= 192) { ProcessVectorizedBatchRS<192, vec_length>(batch, len, theta); return; }
+
+      // too wide for a single vectorized call: split into chunks, one task per chunk
+      size_t chunksize = 192/vec_length;
+      size_t num = (batch.Size()+chunksize-1) / chunksize;
+      ParallelFor (num, [&](int i)
+      {
+        ProcessBatchRS(batch.Range(i*chunksize, min((i+1)*chunksize, batch.Size())), len, theta);
+      }, num);
+    }
+    // Performs the singular -> regular transforms of a whole batch in one pass.
+    // The source coefficients of all records are gathered side by side into one
+    // expansion with N-component entries (record i occupies the columns
+    // [i*vec_length, (i+1)*vec_length)), the sequence RotateY(theta), ShiftZ(-len),
+    // RotateY(-theta) is applied once to that wide expansion, and the result is
+    // scattered back into the individual target expansions.
+    // Order, Kappa and RTyp of the wide expansions are taken from batch[0] only.
+    // NOTE(review): this presumably requires that all records of a batch share these
+    // values as well as len and theta - confirm against the caller.
+    template<int N, int vec_length>
+    static void ProcessVectorizedBatchRS(FlatArray<RecordingRS*> batch, double len, double theta) {
+      // static Timer t("ProcessVectorizedBatch, N = "+ToString(N) + ", vec_len = " + ToString(vec_length));
+      // RegionTimer reg(t, batch[0]->mpS->SH().Order());
+      // static Timer ttobatch("mptools - copy to batch 2");
+      // static Timer tfrombatch("mptools - copy from batch 2");
+      // *testout << "Processing vectorized batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
+      SphericalExpansion<Singular, Vec<N,Complex>> vec_source(batch[0]->mpS->Order(), batch[0]->mpS->Kappa(), batch[0]->mpS->RTyp());
+      // SphericalExpansion<Singular, elem_type> tmp_source{*batch[0]->mpS};
+      // copy of the first target; below it is only used as the object RotateZ is called on
+      SphericalExpansion<Regular, elem_type> tmp_target{*batch[0]->mpR};
+      SphericalExpansion<Regular, Vec<N,Complex>> vec_target(batch[0]->mpR->Order(), batch[0]->mpR->Kappa(), batch[0]->mpR->RTyp());
+      // Copy multipoles into vectorized multipole
+      // ttobatch.Start();
+      for (int i = 0; i < batch.Size(); i++)
+      {
+        auto source_i = VecVector2Matrix (batch[i]->mpS->SH().Coefs());
+        auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
+        // the callback receives a row index and a factor; each source row is scaled
+        // by that factor while it is written into this record's columns
+        batch[i]->mpS->SH().RotateZ(batch[i]->phi,
+            [source_i, source_mati] (size_t ii, Complex factor)
+            {
+                source_mati.Row(ii) = factor * source_i.Row(ii);
+            });
+      }
+      // ttobatch.Stop();
+      vec_source.SH().RotateY(theta);
+      vec_source.ShiftZ(-len, vec_target);
+      vec_target.SH().RotateY(-theta);
+      // Copy vectorized multipole into individual multipoles
+      // tfrombatch.Start();
+      for (int i = 0; i < batch.Size(); i++) {
+        // auto source_i = VecVector2Matrix (tmp_target.SH().Coefs());
+        auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
+        auto targeti = VecVector2Matrix(batch[i]->mpR->SH().Coefs());
+        // scatter with the opposite z-angle; the rows are accumulated into the
+        // record's target with AtomicAdd
+        tmp_target.SH().RotateZ(-batch[i]->phi,
+                                [source_mati, targeti] (size_t ii, Complex factor)
+                                          {
+                                            // source_i.Row(ii) = factor * source_mati.Row(ii);
+                                            AtomicAdd (VectorView(targeti.Row(ii)), factor * source_mati.Row(ii));
+                                          });
+        // for (int j = 0; j < tmp_target.SH().Coefs().Size(); j++)
+        // AtomicAdd(batch[i]->mpR->SH().Coefs()[j], tmp_target.SH().Coefs()[j]);
+      }
+      // tfrombatch.Stop();
+    }
     struct Node
     {
@@ -822,22 +1667,35 @@ namespace ngfem
       double r;
       int level;
       std::array<unique_ptr<Node>,8> childs;
-      MultiPole<MPRegular,elem_type> mp;
+      SphericalExpansion<Regular,elem_type> mp;
       Array<Vec<3>> targets;
+      Array<tuple<Vec<3>,double>> vol_targets;
       int total_targets;
+      std::mutex node_mutex;
+      atomic<bool> have_childs{false};
-      Array<const typename SingularMLMultiPole<elem_type>::Node*> singnodes;
+      Array<const typename SingularMLExpansion<elem_type>::Node*> singnodes;
+      const FMM_Parameters & params;
-      Node (Vec<3> acenter, double ar, int alevel, int order, double kappa)
-        : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*kappa), kappa, 1.0/min(1.0, 0.25*r*kappa))
+      // Creates a node with center acenter, radius ar and tree level alevel.
+      // The expansion mp is constructed with order -1; Allocate() later replaces it
+      // by one of working order. params is a reference member, so the FMM_Parameters
+      // object passed as _params has to outlive the node.
+      Node (Vec<3> acenter, double ar, int alevel, double kappa, const FMM_Parameters & _params)
+        : center(acenter), r(ar), level(alevel),
+          // mp(MPOrder(ar*kappa), kappa, ar) // 1.0/min(1.0, 0.25*r*kappa))
+          mp(-1, kappa, ar), params(_params)
           // : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*kappa), kappa, 1.0)
       {
+        // per-level node counter (static member), updated only for levels inside the array
         if (level < nodes_on_level.Size())
           nodes_on_level[level]++;
       }
-      void CreateChilds()
+      // Replaces mp by a fresh regular expansion for this node. The order argument is
+      // params.minorder + 2*r*kappa, with kappa taken from the current mp; the node
+      // radius r is passed as the third constructor argument.
+      void Allocate()
+      {
+        const auto kappa = mp.Kappa();
+        mp = SphericalExpansion<Regular,elem_type>(params.minorder+2*r*kappa, kappa, r);
+      }
+      void CreateChilds(bool allocate = false)
       {
         if (childs[0]) throw Exception("have already childs");
         // create children nodes:
@@ -847,15 +1705,19 @@ namespace ngfem
             cc(0) += (i&1) ? r/2 : -r/2;
             cc(1) += (i&2) ? r/2 : -r/2;
             cc(2) += (i&4) ? r/2 : -r/2;
-            childs[i] = make_unique<Node> (cc, r/2, level+1, max(mp.SH().Order()/2, 8), mp.Kappa());
+            childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa(), params);
+            if (allocate)
+              childs[i] -> Allocate();
           }
+        have_childs = true;
       }
-      void AddSingularNode (const typename SingularMLMultiPole<elem_type>::Node & singnode, bool allow_refine)
+      void AddSingularNode (const typename SingularMLExpansion<elem_type>::Node & singnode, bool allow_refine,
+                            Array<RecordingRS> * recording)
       {
         if (mp.SH().Order() < 0) return;
         if (singnode.mp.SH().Order() < 0) return;
-        if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
+        // if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
         if (level > 20)
           {
             singnodes.Append(&singnode);
@@ -874,12 +1736,15 @@ namespace ngfem
                 singnode.childs[0]->mp.Order() < singnode.mp.Order())
               {
                 for (auto & child : singnode.childs)
-                  AddSingularNode (*child, allow_refine);
+                  AddSingularNode (*child, allow_refine, recording);
                 return;
               }
             // static Timer t("mptool transform Helmholtz-criterion"); RegionTimer r(t);
-            singnode.mp.TransformAdd(mp, dist);
+            if (recording)
+              *recording += RecordingRS(&singnode.mp, &mp, dist);
+            else
+              singnode.mp.TransformAdd(mp, dist);
             return;
           }
@@ -895,70 +1760,70 @@ namespace ngfem
             if (allow_refine)
               {
                 if (!childs[0])
-                  CreateChilds();
+                  CreateChilds(true);
                 for (auto & ch : childs)
-                  ch -> AddSingularNode (singnode, allow_refine);
+                  ch -> AddSingularNode (singnode, allow_refine, recording);
               }
             else
               {
-                if (total_targets < 1000)
+                if (total_targets < 1000 || recording)
                   {
                     for (auto & ch : childs)
                       if (ch)
-                        ch -> AddSingularNode (singnode, allow_refine);
+                        ch -> AddSingularNode (singnode, allow_refine, recording);
                   }
                 else
                   ParallelFor (8, [&] (int nr)
                                {
                                  if (childs[nr])
-                                   childs[nr] -> AddSingularNode (singnode, allow_refine);
+                                   childs[nr] -> AddSingularNode (singnode, allow_refine, recording);
                                });
-                if (targets.Size())
+                if (targets.Size()+vol_targets.Size())
                   singnodes.Append(&singnode);
               }
           }
         else
           {
             for (auto & childsing : singnode.childs)
-              AddSingularNode (*childsing, allow_refine);
+              AddSingularNode (*childsing, allow_refine, recording);
           }
       }
       void LocalizeExpansion(bool allow_refine)
       {
+        // Distributes this node's regular expansion to its children: mp is
+        // added into every child's expansion (TransformAdd, shifted by
+        // child center - center) and the children are processed recursively.
+        // Afterwards this node's mp is replaced by an order -1 expansion.
+        // allow_refine: if set, a node without children whose mp.Order()
+        // exceeds 30 first creates its children (CreateChilds with allocate
+        // = true also calls Allocate() on each new child).
         if (allow_refine)
-          if (mp.Order() > 20 && !childs[0])
-            CreateChilds();
+          if (mp.Order() > 30 && !childs[0])
+            CreateChilds(allow_refine);
         if (childs[0])
           {
-            for (auto & ch : childs)
+            // few targets below this node: visit the 8 children one after
+            // the other; otherwise hand the 8 children to ParallelFor
+            if (total_targets < 1000)
               {
-                if (L2Norm(mp.SH().Coefs()) > 0)
-                  mp.TransformAdd (ch->mp, ch->center-center);
-                ch->LocalizeExpansion(allow_refine);
+                for (int nr = 0; nr < 8; nr++)
+                  {
+                    // skip the transform when this node's coefficients are all zero
+                    if (L2Norm(mp.SH().Coefs()) > 0)
+                      mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
+                    childs[nr]->LocalizeExpansion(allow_refine);
+                  }
               }
-            mp = MultiPole<MPRegular,elem_type>(-1, mp.Kappa());
+            else
+              ParallelFor(8, [&] (int nr)
+              {
+                if (L2Norm(mp.SH().Coefs()) > 0)
+                  mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
+                childs[nr]->LocalizeExpansion(allow_refine);
+              });
+            // the expansion now lives in the children; replace it by an order -1 one
+            mp = SphericalExpansion<Regular,elem_type>(-1, mp.Kappa(), 1.);
             //mp.SH().Coefs()=0.0;
           }
       }
       elem_type Evaluate (Vec<3> p) const
       {
-        // *testout << "eval p = " << p << ", level = " << level << ", center = " << center <<  ", r = " << r << endl;
         elem_type sum{0.0};
-        /*
-        if (childs[0])
-          {
-            int childnum = 0;
-            if (p(0) > center(0)) childnum += 1;
-            if (p(1) > center(1)) childnum += 2;
-            if (p(2) > center(2)) childnum += 4;
-            sum = childs[childnum]->Evaluate(p);
-          }
-        */
         int childnum = 0;
         if (p(0) > center(0)) childnum += 1;
         if (p(1) > center(1)) childnum += 2;
@@ -966,13 +1831,16 @@ namespace ngfem
         if (childs[childnum])
           sum = childs[childnum]->Evaluate(p);
         else
-          sum = mp.Eval(p-center);
-        static Timer t("mptool direct evaluate"); RegionTimer r(t);
-        for (auto sn : singnodes)
-          sum += sn->EvaluateMP(p);
+          {
+            // static Timer t("mptool regmp, evaluate reg"); RegionTimer r(t);
+            sum = mp.Eval(p-center);
+          }
+        {
+          // static Timer t("mptool regmp, evaluate, singnode"); RegionTimer r(t);
+          for (auto sn : singnodes)
+            sum += sn->EvaluateMP(p);
+        }
         return sum;
       }
@@ -998,6 +1866,14 @@ namespace ngfem
         return sum;
       }
+      // Pre-order walk over the subtree: func is applied to this node first,
+      // then to every existing child subtree in child-index order.
+      void TraverseTree (const std::function<void(Node&)> & func)
+      {
+        func(*this);
+        for (int nr = 0; nr < 8; nr++)
+          {
+            if (!childs[nr]) continue;
+            childs[nr]->TraverseTree(func);
+          }
+      }
       double Norm() const
       {
         double norm = L2Norm(mp.SH().Coefs());
@@ -1015,36 +1891,100 @@ namespace ngfem
             num += ch->NumCoefficients();
         return num;
       }
+      // Index (0..7) of the child octant containing x:
+      // bit d of the result is set iff x(d) > center(d).
+      int GetChildNum (Vec<3> x) const
+      {
+        int octant = 0;
+        for (int d = 0; d < 3; d++)
+          if (x(d) > center(d))
+            octant |= (1 << d);
+        return octant;
+      }
       void AddTarget (Vec<3> x)
       {
+        // Inserts the point target x into this subtree. If the node already
+        // has children, x is forwarded to the octant child selected by
+        // GetChildNum. Otherwise x is stored in this node's targets while
+        // holding node_mutex; the node is then split unless level > 20 or it
+        // holds fewer than params.maxdirect targets with r*mp.Kappa() < 5.
-        if (childs[0])
+        // if (childs[0])
+        if (have_childs) // quick check without locking
           {
             // directly send to childs:
-            int childnum  = 0;
-            if (x(0) > center(0)) childnum += 1;
-            if (x(1) > center(1)) childnum += 2;
-            if (x(2) > center(2)) childnum += 4;
+            int childnum  = GetChildNum(x);
             childs[childnum] -> AddTarget( x );
             return;
           }
+        lock_guard<mutex> guard(node_mutex);
+        if (have_childs) // test again after locking
+        {
+          // directly send to childs:
+          int childnum  = GetChildNum(x);
+          childs[childnum] -> AddTarget(x);
+          return;
+        }
         targets.Append( x );
-        if (r*mp.Kappa() < 1e-8) return;
-        if (targets.Size() < maxdirect && r*mp.Kappa() < 1)
+        // if (r*mp.Kappa() < 1e-8) return;
+        if (level > 20) return;
+        if (targets.Size() < params.maxdirect && r*mp.Kappa() < 5)
+          return;
+        // split: CreateChilds sets have_childs, so the re-insertions below take
+        // the unlocked fast path at the top and do not lock node_mutex again
+        CreateChilds();
+        for (auto t : targets)
+          AddTarget (t);
+        for (auto [x,r] : vol_targets)
+          AddVolumeTarget (x,r);
+        targets.SetSize0();
+        vol_targets.SetSize0();
+      }
+      // Inserts a volume target (point x with radius tr) into this subtree.
+      // Nothing is done if MaxNorm(x-center) > r+tr. Unlike a point target, a
+      // volume target is offered to all children, each of which applies the
+      // same distance test. A node without children stores (x,tr) in
+      // vol_targets under node_mutex and is split by the same kind of rule as
+      // in AddTarget, but based on vol_targets.Size().
+      void AddVolumeTarget (Vec<3> x, double tr)
+      {
+        if (MaxNorm(x-center) > r+tr) return;
+        if (have_childs)
+          {
+            for (auto & child : childs)
+              child->AddVolumeTarget(x, tr);
+            return;
+          }
+        lock_guard<mutex> guard(node_mutex);
+        if (have_childs)
+          {
+            for (auto & child : childs)
+              child->AddVolumeTarget(x, tr);
+            return;
+          }
+        vol_targets.Append (tuple(x,tr));
+        if (level > 20) return;
+        if (vol_targets.Size() < params.maxdirect && (r*mp.Kappa() < 5))
           return;
         CreateChilds();
         for (auto t : targets)
           AddTarget (t);
+        for (auto [x,r] : vol_targets)
+          AddVolumeTarget (x,r);
         targets.SetSize0();
+        vol_targets.SetSize0();
       }
       void CalcTotalTargets()
       {
-        total_targets = targets.Size();
+        total_targets = targets.Size() + vol_targets.Size();
         for (auto & child : childs)
           if (child)
             {
@@ -1064,8 +2004,21 @@ namespace ngfem
             }
         if (total_targets == 0)
-          mp = MultiPole<MPRegular>(-1, mp.Kappa());
+          mp = SphericalExpansion<Regular,elem_type>(-1, mp.Kappa(),1.);
+      }
+      // Calls Allocate() on every node of this subtree whose total_targets is
+      // positive; existing children are handled before the node itself.
+      void AllocateMemory()
+      {
+        for (int nr = 0; nr < 8; nr++)
+          {
+            if (!childs[nr]) continue;
+            childs[nr]->AllocateMemory();
+          }
+        if (total_targets > 0)
+          {
+            Allocate();
+          }
+        // mp = SphericalExpansion<Regular,elem_type>(MPOrder(r*mp.Kappa()), mp.Kappa(), r); // -1, mp.Kappa(),1.);
       }
       void Print (ostream & ost, size_t childnr = -1) const
@@ -1082,21 +2035,24 @@ namespace ngfem
       }
     };
+    FMM_Parameters fmm_params;
     Node root;
-    shared_ptr<SingularMLMultiPole<elem_type>> singmp;
+    shared_ptr<SingularMLExpansion<elem_type>> singmp;
   public:
-  RegularMLMultiPole (shared_ptr<SingularMLMultiPole<elem_type>> asingmp, Vec<3> center, double r, int order)
-      : root(center, r, 0, order, asingmp->Kappa()), singmp(asingmp)
-    {
+  RegularMLExpansion (shared_ptr<SingularMLExpansion<elem_type>> asingmp, Vec<3> center, double r,
+                      const FMM_Parameters & _params)
+  : fmm_params(_params), root(center, r, 0, asingmp->Kappa(), fmm_params), singmp(asingmp)
+  {
       if (!singmp->havemp) throw Exception("first call Calc for singular MP");
+      root.Allocate();
       nodes_on_level = 0;
       nodes_on_level[0] = 1;
       {
-        static Timer t("mptool compute regular MLMP"); RegionTimer rg(t);
-        root.AddSingularNode(singmp->root, true);
+        static Timer t("mptool compute regular MLMP"); RegionTimer rg(t);
+        root.AddSingularNode(singmp->root, true, nullptr);
         // cout << "norm after S->R conversion: " << root.Norm() << endl;
       }
@@ -1117,39 +2073,163 @@ namespace ngfem
       }
     }
-    RegularMLMultiPole (Vec<3> center, double r, int order, double kappa)
-      : root(center, r, 0, order, kappa)
-    {
-      nodes_on_level = 0;
-      nodes_on_level[0] = 1;
-    }
+  // Creates a tree that consists of the root node only (level 0, centered at
+  // center, size r, wave number kappa). No singular expansion is attached
+  // here: singmp is left default-initialized.
+  // The root is handed the member copy fmm_params, not the caller's _params;
+  // fmm_params is declared before root, so it is initialized first.
+  RegularMLExpansion (Vec<3> center, double r, double kappa, const FMM_Parameters & _params)
+  : fmm_params(_params), root(center, r, 0, kappa, fmm_params)
+  {
+    // reset the per-level node counters; the root is the single node on level 0
+    nodes_on_level = 0;
+    nodes_on_level[0] = 1;
+  }
     void AddTarget (Vec<3> t)
     {
+      // registers the point target t by inserting it at the root of the tree
       root.AddTarget (t);
     }
-    void CalcMP(shared_ptr<SingularMLMultiPole<elem_type>> asingmp)
+    // Registers a volume target (point t with radius r) at the root of the tree.
+    void AddVolumeTarget (Vec<3> t, double r)
+    {
+      root.AddVolumeTarget (t, r);
+    }
+    // Computes the regular expansions of this tree from the singular expansion
+    // asingmp. Steps: count the targets per subtree, allocate the node
+    // expansions, record all singular-to-regular transforms, execute them in
+    // batches, and finally push the expansions down to the children
+    // (LocalizeExpansion).
+    // onlytargets: its negation is passed as allow_refine to AddSingularNode
+    // and LocalizeExpansion, so with the default (true) the tree is not refined.
+    void CalcMP(shared_ptr<SingularMLExpansion<elem_type>> asingmp, bool onlytargets = true)
     {
+      static Timer t("mptool regular MLMP"); RegionTimer rg(t);
+      // tremove is only referenced by the commented-out block below
+      static Timer tremove("removeempty");
+      static Timer trec("mptool regular MLMP - recording");
+      static Timer tsort("mptool regular MLMP - sort");
       singmp = asingmp;
       root.CalcTotalTargets();
-      root.RemoveEmptyTrees();
-      root.AddSingularNode(singmp->root, false);
+      // cout << "before remove empty trees:" << endl;
+      // PrintStatistics(cout);
+      /*
+      tremove.Start();
+      if (onlytargets)
+        root.RemoveEmptyTrees();
+      tremove.Stop();
+      */
+      root.AllocateMemory();
+      // cout << "after allocating regular:" << endl;
+      // PrintStatistics(cout);
+      // cout << "starting S-R converion" << endl;
+      // PrintStatistics(cout);
+      // direct variant without recording; compiled out
+      if constexpr (false)
+        {
+          root.AddSingularNode(singmp->root, !onlytargets, nullptr);
+        }
+      else
+        {  // use recording
+          Array<RecordingRS> recording;
+          {
+            RegionTimer rrec(trec);
+            root.AddSingularNode(singmp->root, !onlytargets, &recording);
+          }
+          // cout << "recorded: " << recording.Size() << endl;
+          {
+            // order by len (relative tolerance 1e-8), then by theta, so that
+            // records with matching (len, theta) end up next to each other
+            RegionTimer reg(tsort);
+            QuickSort (recording, [] (auto & a, auto & b)
+            {
+              if (a.len < (1-1e-8) * b.len) return true;
+              if (a.len > (1+1e-8) * b.len) return false;
+              return a.theta < b.theta;
+            });
+          }
+          // split the sorted records into runs of equal len and theta
+          // (absolute tolerance 1e-8); the stored pointers refer into
+          // recording, which is not modified any more after the sort
+          double current_len = -1e100;
+          double current_theta = -1e100;
+          Array<RecordingRS*> current_batch;
+          Array<Array<RecordingRS*>> batch_group;
+          Array<double> group_lengths;
+          Array<double> group_thetas;
+          for (auto & record : recording)
+            {
+              bool len_changed = fabs(record.len - current_len) > 1e-8;
+              bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
+              if ((len_changed || theta_changed) && current_batch.Size() > 0) {
+                // ProcessBatch(current_batch, current_len, current_theta);
+                batch_group.Append(current_batch);
+                group_lengths.Append(current_len);
+                group_thetas.Append(current_theta);
+                current_batch.SetSize(0);
+              }
+              current_len = record.len;
+              current_theta = record.theta;
+              current_batch.Append(&record);
+            }
+          // flush the last run
+          if (current_batch.Size() > 0) {
+            // ProcessBatch(current_batch, current_len, current_theta);
+            batch_group.Append(current_batch);
+            group_lengths.Append(current_len);
+            group_thetas.Append(current_theta);
+          }
+          // one ProcessBatchRS call per run, distributed with ParallelFor
+          ParallelFor(batch_group.Size(), [&](int i) {
+            ProcessBatchRS(batch_group[i], group_lengths[i], group_thetas[i]);
+          }, TasksPerThread(4));
+        }
       /*
       int maxlevel = 0;
-      for (auto [i,num] : Enumerate(RegularMLMultiPole::nodes_on_level))
+      for (auto [i,num] : Enumerate(RegularMLExpansion::nodes_on_level))
         if (num > 0) maxlevel = i;
       for (int i = 0; i <= maxlevel; i++)
-        cout << "reg " << i << ": " << RegularMLMultiPole::nodes_on_level[i] << endl;
+        cout << "reg " << i << ": " << RegularMLExpansion::nodes_on_level[i] << endl;
       */
-      root.LocalizeExpansion(false);
+      // cout << "starting R-R converion" << endl;
+      // PrintStatistics(cout);
+      static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
+      root.LocalizeExpansion(!onlytargets);
+      // cout << "R-R conversion done" << endl;
+      // PrintStatistics(cout);
     }
+    // Writes tree statistics to ost: the deepest level, the number of nodes
+    // and, for every level, the node count, the largest expansion order and
+    // the number of stored coefficients, followed by the total coefficient
+    // memory in MB (integer division, rounded down).
+    void PrintStatistics (ostream & ost)
+    {
+      // first pass: depth of the tree and number of nodes
+      int levels = 0;
+      int cnt = 0;
+      root.TraverseTree( [&](Node & node) {
+        levels = max(levels, node.level);
+        cnt++;
+      });
+      ost << "levels: " << levels << endl;
+      ost << "nodes: " << cnt << endl;
+      Array<int> num_on_level(levels+1);
+      Array<int> order_on_level(levels+1);
+      Array<size_t> coefs_on_level(levels+1);
+      num_on_level = 0;
+      order_on_level = 0;
+      // must be zeroed like the two arrays above: it is accumulated with += below
+      coefs_on_level = size_t(0);
+      // second pass: per-level statistics
+      root.TraverseTree( [&](Node & node) {
+        num_on_level[node.level]++;
+        order_on_level[node.level] = max(order_on_level[node.level],node.mp.Order());
+        coefs_on_level[node.level] += node.mp.SH().Coefs().Size();
+      });
+      // everything goes to the stream passed by the caller, not to cout
+      ost << "num on level" << endl;
+      for (int i = 0; i < num_on_level.Size(); i++)
+        ost << i << ": " << num_on_level[i] << ", order = " << order_on_level[i] << ", coefs " << coefs_on_level[i] << endl;
+      size_t totcoefs = 0;
+      for (auto n : coefs_on_level)
+        totcoefs += n;
+      ost << "total mem in coefs: " << sizeof(elem_type)*totcoefs / sqr(1024) << " MB" << endl;
+    }
     void Print (ostream & ost) const
     {
       root.Print(ost);
@@ -1168,7 +2248,10 @@ namespace ngfem
     elem_type Evaluate (Vec<3> p) const
     {
+      // Evaluates the expansion at p. Points with MaxNorm(p-root.center) >
+      // root.r are evaluated through the singular expansion singmp (the
+      // previous version returned 0 for points outside the L2 ball); all
+      // other points are evaluated by the regular tree.
+      // NOTE(review): singmp is dereferenced without a null check, and the
+      // (center, r, kappa, params) constructor does not set it - confirm that
+      // CalcMP is always called before evaluating outside points.
       // static Timer t("mptool Eval MLMP regular"); RegionTimer r(t);
-      if (L2Norm(p-root.center) > root.r) return elem_type{0.0};
+      // if (L2Norm(p-root.center) > root.r) return elem_type{0.0};
+      if (MaxNorm(p-root.center) > root.r)
+        return singmp->Evaluate(p);
       return root.Evaluate(p);
     }
@@ -1180,11 +2263,12 @@ namespace ngfem
   };
   template <typename elem_type>
-  inline ostream & operator<< (ostream & ost, const RegularMLMultiPole<elem_type> & mlmp)
+  // Stream output of a regular multilevel expansion: delegates to
+  // mlmp.Print(ost) and returns the stream so that calls can be chained.
+  inline ostream & operator<< (ostream & ost, const RegularMLExpansion<elem_type> & mlmp)
   {
     mlmp.Print(ost);
-    // ost << "RegularMLMultiPole" << endl;
+    // ost << "RegularMLExpansion" << endl;
     return ost;
   }
@@ -1193,126 +2277,5 @@ namespace ngfem
-  // ******************** Coefficient Functions *********************
-  class SphericalHarmonicsCF : public CoefficientFunction
-  {
-    SphericalHarmonics<Complex> sh;
-  public:
-    SphericalHarmonicsCF (int order)
-      : CoefficientFunction(1, true), sh(order) { }
-    Complex & Coef(int n, int m) { return sh.Coef(n,m); }
-    virtual double Evaluate (const BaseMappedIntegrationPoint & ip) const override
-    { throw Exception("real eval not available"); }
-    virtual void Evaluate (const BaseMappedIntegrationPoint & mip, FlatVector<Complex> values) const override
-    {
-      values(0) = sh.Eval(mip.GetPoint());
-    }
-    virtual void Evaluate (const BaseMappedIntegrationRule & ir, BareSliceMatrix<Complex> values) const override
-    {
-      for (int i = 0; i < ir.Size(); i++)
-        {
-          auto & mip = ir[i];
-          values(i,0) = sh.Eval(mip.GetPoint());
-        }
-    }
-    auto & SH() { return sh; }
-  };
-  template <typename RADIAL, typename entry_type=Complex>
-  class MultiPoleCF : public CoefficientFunction
-  {
-    MultiPole<RADIAL, entry_type> mp;
-    Vec<3> center;
-  public:
-    MultiPoleCF (int order, double kappa, Vec<3> acenter, double scale = 1)
-      : CoefficientFunction(sizeof(entry_type)/sizeof(Complex), true), mp(order, kappa, scale), center(acenter) { }
-    entry_type & Coef(int n, int m) { return mp.Coef(n,m); }
-    auto & SH() { return mp.SH(); }
-    auto & MP() { return mp; }
-    Vec<3> Center() const { return center; }
-    virtual double Evaluate (const BaseMappedIntegrationPoint & ip) const override
-    { throw Exception("real eval not available"); }
-    virtual void Evaluate (const BaseMappedIntegrationPoint & mip, FlatVector<Complex> values) const override
-    {
-      if constexpr (std::is_same<entry_type, Complex>())
-        values(0) = mp.Eval(mip.GetPoint()-center);
-      else
-        values = mp.Eval(mip.GetPoint()-center);
-    }
-    template <typename TARGET>
-    void ShiftZ (double z, MultiPole<TARGET, entry_type> & target) { mp.ShiftZ(z, target); }
-    using CoefficientFunction::Transform;
-    template <typename TARGET>
-    void Transform (MultiPoleCF<TARGET, entry_type> & target)
-    {
-      mp.Transform (target.MP(), target.Center()-center);
-    }
-  };
-  template <typename entry_type>
-  class SingularMLMultiPoleCF : public CoefficientFunction
-  {
-    shared_ptr<SingularMLMultiPole<entry_type>> mlmp;
-  public:
-    SingularMLMultiPoleCF (Vec<3> center, double r, int order, double kappa)
-      : CoefficientFunction(sizeof(entry_type)/sizeof(Complex), true), mlmp{make_shared<SingularMLMultiPole<entry_type>>(center, r, order, kappa)} { }
-    virtual double Evaluate (const BaseMappedIntegrationPoint & ip) const override
-    { throw Exception("real eval not available"); }
-    virtual void Evaluate (const BaseMappedIntegrationPoint & mip, FlatVector<Complex> values) const override
-    {
-      // values(0) = mlmp->Evaluate(mip.GetPoint());
-      if constexpr (std::is_same<entry_type, Complex>())
-        values(0) = mlmp->Evaluate(mip.GetPoint());
-      else
-        values = mlmp->Evaluate(mip.GetPoint());
-    }
-    shared_ptr<SingularMLMultiPole<entry_type>> MLMP() { return mlmp; }
-  };
-  template <typename entry_type>
-  class RegularMLMultiPoleCF : public CoefficientFunction
-  {
-    shared_ptr<RegularMLMultiPole<entry_type>> mlmp;
-  public:
-    RegularMLMultiPoleCF (shared_ptr<SingularMLMultiPoleCF<entry_type>> asingmp, Vec<3> center, double r, int order)
-      : CoefficientFunction(sizeof(entry_type)/sizeof(Complex), true), mlmp{make_shared<RegularMLMultiPole<entry_type>>(asingmp->MLMP(), center, r, order)} { }
-    virtual double Evaluate (const BaseMappedIntegrationPoint & ip) const override
-    { throw Exception("real eval not available"); }
-    virtual void Evaluate (const BaseMappedIntegrationPoint & mip, FlatVector<Complex> values) const override
-    {
-      // values(0) = mlmp->Evaluate(mip.GetPoint());
-      if constexpr (std::is_same<entry_type, Complex>())
-        values(0) = mlmp->Evaluate(mip.GetPoint());
-      else
-        values = mlmp->Evaluate(mip.GetPoint());
-    }
-    shared_ptr<RegularMLMultiPole<entry_type>> MLMP() { return mlmp; }
-  };
 }
 #endif