PyPI - ngsolve - Versions diffs - 6.2.2505__cp311-cp311-macosx_10_15_universal2.whl → 6.2.2505.post94.dev0__cp311-cp311-macosx_10_15_universal2.whl - Mend

ngsolve 6.2.2505__cp311-cp311-macosx_10_15_universal2.whl → 6.2.2505.post94.dev0__cp311-cp311-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ngsolve might be problematic. Click here for more details.

Files changed (60) hide show

netgen/include/mptools.hpp CHANGED Viewed

@@ -20,6 +20,157 @@ namespace ngsbem
 {
   using namespace ngfem;
+  template<typename T>
+  constexpr int VecLength = 1;  // Primary template: every type not specialized below (e.g. a plain Complex) counts as one component
+  template<int N>
+  constexpr int VecLength<Vec<N, Complex>> = N;  // Specialization: a Vec<N,Complex> entry counts as N components
+  constexpr int FMM_SW = 4;  // SIMD width used for the packed source arrays (simd_charges / simd_dipoles) and for simd_entry_type
+  // ************************ SIMD - creation (should end up in simd.hpp) *************
+  // Horizontal sum for a vector of SIMD values: each component v(k) is reduced
+  // with the HSum overload for a single SIMD value, and the S results are
+  // returned as a Vec<S,T>.
+  template <int S, typename T, int SW>
+  Vec<S,T> HSum (Vec<S,SIMD<T,SW>> v)
+  {
+    Vec<S,T> sums;
+    int k = 0;
+    while (k < S)
+      {
+        sums(k) = HSum(v(k));
+        ++k;
+      }
+    return sums;
+  }
+  // Forward declaration; the primary template and its specializations for
+  // Vec, Complex and std::tuple element types follow below.
+  template <typename T, size_t S> class MakeSimdCl;
+  // Entry point: packs an array of S values into the corresponding SIMD
+  // representation by delegating to MakeSimdCl<T,S>::Get().
+  template <typename T, size_t S>
+  auto MakeSimd (array<T,S> aa)
+  {
+    MakeSimdCl<T,S> packer(aa);
+    return packer.Get();
+  }
+  // Generic case: the S array elements are placed directly into the lanes
+  // of a SIMD<T,S>.
+  template <typename T, size_t S>
+  class MakeSimdCl
+  {
+    array<T,S> a;   // the S values to be packed, one per lane
+  public:
+    MakeSimdCl (array<T,S> aa) : a(aa)  { ; }
+    // Constructs the SIMD value from a generator: lane i receives a[i].
+    auto Get() const
+    {
+      return SIMD<T,S> ( [this] (auto lane) { return a[lane]; } );
+    }
+  };
+  // Vec-valued elements: an array of S vectors Vec<VS,T> is transposed into
+  // one Vec<VS,...> whose components are the packed (MakeSimd) lanes.
+  template <typename T, size_t S, int VS>
+  class MakeSimdCl<Vec<VS,T>,S>
+  {
+    array<Vec<VS,T>,S> a;
+  public:
+    MakeSimdCl (array<Vec<VS,T>,S> aa) : a(aa)  { ; }
+    auto Get() const
+    {
+      array<T,S> lanes;
+      Vec<VS, decltype(MakeSimd(lanes))> packed;
+      for (int comp = 0; comp < VS; comp++)
+        {
+          // gather component 'comp' of each of the S input vectors ...
+          for (int lane = 0; lane < S; lane++)
+            lanes[lane] = a[lane](comp);
+          // ... and pack these S values with the element-type specific MakeSimd
+          packed(comp) = MakeSimd(lanes);
+        }
+      return packed;
+    }
+  };
+  // Complex elements: real and imaginary parts are packed separately and
+  // combined into a SIMD<Complex,S>.
+  template <size_t S>
+  class MakeSimdCl<Complex,S>
+  {
+    array<Complex,S> a;
+  public:
+    MakeSimdCl (array<Complex,S> aa) : a(aa)  { ; }
+    auto Get() const
+    {
+      array<double,S> re, im;
+      for (int lane = 0; lane < S; lane++)
+        {
+          const Complex & z = a[lane];
+          re[lane] = Real(z);
+          im[lane] = Imag(z);
+        }
+      auto simd_re = MakeSimd(re);
+      auto simd_im = MakeSimd(im);
+      return SIMD<Complex,S> (simd_re, simd_im);
+    }
+  };
+  // Tuple elements: an array of S tuples becomes one tuple of packed values.
+  // The first tuple member is packed here; the remaining members (if any) are
+  // handled by recursing on the tuples with their first member removed.
+  template <typename Tfirst, size_t S, typename ...Trest>
+  class MakeSimdCl<std::tuple<Tfirst,Trest...>,S>
+  {
+    array<std::tuple<Tfirst,Trest...>,S> a;
+  public:
+    MakeSimdCl (array<std::tuple<Tfirst,Trest...>,S> aa) : a(aa)  { ; }
+    auto Get() const
+    {
+      array<Tfirst,S> heads;
+      for (int i = 0; i < S; i++)
+        heads[i] = std::get<0> (a[i]);
+      if constexpr (sizeof...(Trest) == 0)
+        return tuple(MakeSimd(heads));
+      else
+        {
+          array<tuple<Trest...>,S> tails;
+          for (int i = 0; i < S; i++)
+            tails[i] = skip_first(a[i]);
+          return tuple_cat ( tuple (MakeSimd(heads)), MakeSimd(tails) );
+        }
+    }
+    // Returns a tuple holding all members of t except the first one.
+    template <typename... Ts>
+    static auto skip_first(const std::tuple<Ts...>& t) {
+      auto drop_head = [](auto /*head*/, auto... tail) {
+        return std::make_tuple(tail...);
+      };
+      return std::apply(drop_head, t);
+    }
+  };
+  // Converts a Cartesian vector into the triple (len, theta, phi):
+  //   len   = L2Norm(dist)
+  //   theta = acos(dist(2)/len), or 0 if len < 1e-30
+  //   phi   = atan2(dist(1), dist(0)), or 0 if dist(0)^2 + dist(1)^2 < 1e-30
+  inline std::tuple<double, double, double> SphericalCoordinates(Vec<3> dist){
+    const double len = L2Norm(dist);
+    const double theta = (len < 1e-30) ? 0 : acos (dist(2) / len);
+    const double xy_sqr = sqr(dist(0))+sqr(dist(1));
+    const double phi = (xy_sqr < 1e-30) ? 0 : atan2(dist(1), dist(0));
+    return {len, theta, phi};
+  }
   template <typename entry_type = Complex>
   class NGS_DLL_HEADER SphericalHarmonics
@@ -84,9 +235,69 @@ namespace ngsbem
     void Calc (Vec<3> x, FlatVector<Complex> shapes);
+    void FlipZ ();
     void RotateZ (double alpha);
-    void RotateY (double alpha);
+    // Enumerates the z-rotation factors instead of applying them: func(ii, factor)
+    // is called once per coefficient index ii, running over n = 0..order and
+    // m = -n..n within each n.  The factor is exp(i*alpha)^m for m >= 0 and the
+    // conjugate of exp(i*alpha)^(-m) for m < 0.  Does nothing if order < 0.
+    template <typename FUNC>
+    void RotateZ (double alpha, FUNC func) const
+    {
+      if (order < 0) return;
+      // phase(k) = exp(i*alpha)^k, built by repeated multiplication
+      Vector<Complex> phase(order+1);
+      const Complex step(cos(alpha), sin(alpha));
+      Complex power = 1.0;
+      for (int k = 0; k <= order; k++)
+        {
+          phase(k) = power;
+          power *= step;
+        }
+      int ii = 0;
+      for (int n = 0; n <= order; n++)
+        for (int m = -n; m <= n; m++, ii++)
+          {
+            if (m < 0)
+              func(ii, conj(phase(-m)));
+            else
+              func(ii, phase(m));
+          }
+    }
+    // Same enumeration as RotateZ(alpha, func), with an optional sign flip:
+    // if flip is true, the factor passed for (n,m) is multiplied by -1 whenever
+    // n-m is odd; if flip is false it is multiplied by 1.
+    template <typename FUNC>
+    void RotateZFlip (double alpha, bool flip, FUNC func) const
+    {
+      if (order < 0) return;
+      // phase(k) = exp(i*alpha)^k, built by repeated multiplication
+      Vector<Complex> phase(order+1);
+      const Complex step(cos(alpha), sin(alpha));
+      Complex power = 1.0;
+      for (int k = 0; k <= order; k++)
+        {
+          phase(k) = power;
+          power *= step;
+        }
+      int ii = 0;
+      for (int n = 0; n <= order; n++)
+        for (int m = -n; m <= n; m++, ii++)
+          {
+            // m <= n inside this loop, so (n-m)%2 is either 0 or 1
+            const double sign = (flip && ((n-m)%2) == 1) ? -1 : 1;
+            if (m < 0)
+              func(ii, sign*conj(phase(-m)));
+            else
+              func(ii, sign*phase(m));
+          }
+    }
+    void RotateY (double alpha, bool parallel = false);
     static double CalcAmn (int m, int n)
@@ -119,11 +330,11 @@ namespace ngsbem
   // https://fortran-lang.discourse.group/t/looking-for-spherical-bessel-and-hankel-functions-of-first-and-second-kind-and-arbitrary-order/2308/2
   NGS_DLL_HEADER
   void besseljs3d (int nterms, double z, double scale,
-                   FlatVector<double> fjs, FlatVector<double> fjder);
+                   SliceVector<double> fjs, SliceVector<double> fjder = FlatVector<double>(0, nullptr));
   NGS_DLL_HEADER
   void besseljs3d (int nterms, Complex z, double scale,
-                   FlatVector<Complex> fjs, FlatVector<Complex> fjder);
+                   SliceVector<Complex> fjs, SliceVector<Complex> fjder = FlatVector<Complex>(0, nullptr));
   /*
@@ -142,14 +353,17 @@ namespace ngsbem
                FlatVector<double> jp,
                FlatVector<double> yp);
   template <typename T>
   void SphericalBessel (int n, double rho, double scale, T && values)
   {
+    besseljs3d (n, rho, scale,  values);  // results go straight into 'values'; no derivative vector is passed, so besseljs3d's fjder argument takes its declared default
+    /* previous implementation (temporary vectors, then copy), kept for reference:
     Vector<double> j(n+1), jp(n+1);
     besseljs3d (n, rho, scale,  j, jp);
     values = j;
+    */
   }
@@ -173,21 +387,6 @@ namespace ngsbem
         return;
       }
     Vector j(n+1), y(n+1), jp(n+1), yp(n+1);
-    // SBESJY (rho, n, j, y, jp, yp);
-    /*
-    values = j + Complex(0,1) * y;
-    if (scale != 1.0)
-      {
-        double prod = 1.0;
-        for (int i = 0; i <= n; i++)
-          {
-            values(i) *= prod;
-            prod *= scale;
-          }
-      }
-    */
     // the bessel-evaluation with scale
     besseljs3d (n, rho, 1/scale,  j, jp);
@@ -358,18 +557,7 @@ namespace ngsbem
       // static Timer t("mptool Transform "+ToString(typeid(RADIAL).name())+ToString(typeid(TARGET).name()));
       // RegionTimer reg(t);
-      double len = L2Norm(dist);
-      double theta, phi;
-      if (len < 1e-30)
-        theta = 0;
-      else
-        theta = acos (dist(2) / len);
-      if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
-        phi = 0;
-      else
-        phi = atan2(dist(1), dist(0));
+      auto [len, theta, phi] = SphericalCoordinates(dist);
       // MultiPole<RADIAL,entry_type> tmp{*this};
@@ -386,14 +574,18 @@ namespace ngsbem
     }
     template <typename TARGET>
-    void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist) const
+    void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist, bool atomic = false) const
     {
       if (SH().Order() < 0) return;
       if (target.SH().Order() < 0) return;
       MultiPole<TARGET,entry_type> tmp{target};
       Transform(tmp, dist);
-      target.SH().Coefs() += tmp.SH().Coefs();
+      if (!atomic)
+        target.SH().Coefs() += tmp.SH().Coefs();
+      else
+        for (int j = 0; j < target.SH().Coefs().Size(); j++)
+          AtomicAdd(target.SH().Coefs()[j], tmp.SH().Coefs()[j]);
     }
     template <typename TARGET>
@@ -412,11 +604,124 @@ namespace ngsbem
   static constexpr int maxdirect = 100;
+  // View a vector of Vec<S,SCAL> entries as a matrix of fixed width S with
+  // vec.Size() rows, constructed from the pointer to the first entry's data.
+  template <typename SCAL, auto S>
+  inline auto VecVector2Matrix (FlatVector<Vec<S,SCAL>> vec)
+  {
+    auto * first_scalar = vec.Data()->Data();
+    return FlatMatrixFixWidth<S,SCAL> (vec.Size(), first_scalar);
+  }
+  // Scalar case: a vector of Complex is viewed as a matrix with a single column.
+  inline auto VecVector2Matrix (FlatVector<Complex> vec)
+  {
+    auto * first_scalar = vec.Data();
+    return FlatMatrixFixWidth<1,Complex> (vec.Size(), first_scalar);
+  }
   template <typename entry_type=Complex>
   class SingularMLMultiPole
   {
+    using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
     static Array<size_t> nodes_on_level;
+    // One recorded singular->singular translation: source and target expansion,
+    // the shift vector, and its spherical coordinates.  If the polar angle
+    // exceeds pi/2, flipz is set and theta is stored as pi - theta.
+    struct RecordingSS
+    {
+      const MultiPole<MPSingular,entry_type> * mp_source;
+      MultiPole<MPSingular,entry_type> * mp_target;
+      Vec<3> dist;
+      double len, theta, phi;
+      bool flipz;
+
+      RecordingSS() = default;
+      RecordingSS (const MultiPole<MPSingular,entry_type> * amp_source,
+                   MultiPole<MPSingular,entry_type> * amp_target,
+                   Vec<3> adist)
+        : mp_source(amp_source), mp_target(amp_target), dist(adist)
+      {
+        auto [alen, atheta, aphi] = SphericalCoordinates(adist);
+        len = alen;
+        phi = aphi;
+        flipz = atheta > M_PI/2;
+        theta = flipz ? M_PI-atheta : atheta;
+      }
+    };
+    // Applies a group of recorded S->S translations sharing the same len/theta.
+    // N = batch.Size() * VecLength<entry_type> is the number of Complex columns
+    // needed.  A single record (or N <= 1) goes through the scalar TransformAdd
+    // with atomic accumulation; otherwise the smallest instantiated width
+    // (3, 4, 6, 12, 24, 48, 96, 192) that holds N columns is used.  Larger
+    // batches are split after 192/vec_length records and handled recursively.
+    static void ProcessBatch(FlatArray<RecordingSS*> batch, double len, double theta) {
+      constexpr int vec_length = VecLength<entry_type>;
+      const int batch_size = batch.Size();
+      const int N = batch_size * vec_length;
+
+      if (N <= 1 || batch_size <= 1) {
+        for (auto* rec : batch)
+          rec->mp_source->TransformAdd(*rec->mp_target, rec->dist, true);
+        return;
+      }
+      if (N <= 3)   return ProcessVectorizedBatch<3, vec_length>(batch, len, theta);
+      if (N <= 4)   return ProcessVectorizedBatch<4, vec_length>(batch, len, theta);
+      if (N <= 6)   return ProcessVectorizedBatch<6, vec_length>(batch, len, theta);
+      if (N <= 12)  return ProcessVectorizedBatch<12, vec_length>(batch, len, theta);
+      if (N <= 24)  return ProcessVectorizedBatch<24, vec_length>(batch, len, theta);
+      if (N <= 48)  return ProcessVectorizedBatch<48, vec_length>(batch, len, theta);
+      if (N <= 96)  return ProcessVectorizedBatch<96, vec_length>(batch, len, theta);
+      if (N <= 192) return ProcessVectorizedBatch<192, vec_length>(batch, len, theta);
+
+      // more than 192 columns: peel off the first 192/vec_length records
+      const int head = 192 / vec_length;
+      ProcessBatch(batch.Range(0, head), len, theta);
+      ProcessBatch(batch.Range(head, batch_size), len, theta);
+    }
+    template<int N, int vec_length>
+    static void ProcessVectorizedBatch(FlatArray<RecordingSS*> batch, double len, double theta) {  // translates all records of 'batch' at once using multipoles with Vec<N,Complex> entries
+      // *testout << "Processing vectorized S->S batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
+      MultiPole<MPSingular, Vec<N,Complex>> vec_source(batch[0]->mp_source->Order(), batch[0]->mp_source->Kappa(), batch[0]->mp_source->RTyp());  // order/kappa/rtyp are taken from record 0 -- NOTE(review): assumes every record in the batch agrees with these, confirm against caller
+      MultiPole<MPSingular, Vec<N,Complex>> vec_target(batch[0]->mp_target->Order(), batch[0]->mp_target->Kappa(), batch[0]->mp_target->RTyp());
+      // Copy multipoles into vectorized multipole: record i fills columns [i*vec_length, (i+1)*vec_length), each row scaled by the RotateZFlip factor for (phi, flipz)
+      for (int i = 0; i < batch.Size(); i++)
+        {
+          auto source_i = VecVector2Matrix (batch[i]->mp_source->SH().Coefs());
+          auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
+          batch[i]->mp_source->SH().RotateZFlip(batch[i]->phi, batch[i]->flipz,
+                                            [source_i, source_mati] (size_t ii, Complex factor)
+                                            {
+                                              source_mati.Row(ii) = factor * source_i.Row(ii);
+                                            });
+        }
+      vec_source.SH().RotateY(theta, vec_source.SH().Order() >= 100);  // second argument is RotateY's 'parallel' flag, enabled for order >= 100
+      vec_source.ShiftZ(-len, vec_target);  // shift along z by -len, result written to vec_target
+      vec_target.SH().RotateY(-theta, vec_target.SH().Order() >= 100);  // undo the y-rotation on the shifted expansion
+      // Copy vectorized multipole into individual multipoles: columns of record i are scaled by the RotateZFlip factor for (-phi, flipz) and accumulated into its target with AtomicAdd
+      for (int i = 0; i < batch.Size(); i++)
+        {
+          auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
+          auto target_mati = VecVector2Matrix (batch[i]->mp_target->SH().Coefs());
+          batch[i]->mp_target->SH().RotateZFlip(-batch[i]->phi, batch[i]->flipz,
+                                      [source_mati, target_mati] (size_t ii, Complex factor)
+                                      {
+                                        AtomicAdd (target_mati.Row(ii), factor * source_mati.Row(ii));
+                                      });
+      }
+    }
     struct Node
     {
       Vec<3> center;
@@ -428,7 +733,14 @@ namespace ngsbem
       Array<tuple<Vec<3>, entry_type>> charges;
       Array<tuple<Vec<3>, Vec<3>, entry_type>> dipoles;
       Array<tuple<Vec<3>, Vec<3>, Complex,int>> currents;
+      using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
+      Array<tuple<Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_charges;
+      Array<tuple<Vec<3,SIMD<double,FMM_SW>>, Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_dipoles;
       int total_sources;
+      std::mutex node_mutex;
+      atomic<bool> have_childs{false};
       Node (Vec<3> acenter, double ar, int alevel, double akappa)
         : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*akappa), akappa, ar) // min(1.0, ar*akappa))
@@ -449,12 +761,13 @@ namespace ngsbem
             cc(2) += (i&4) ? r/2 : -r/2;
             childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
           }
+        have_childs = true;
       }
       void AddCharge (Vec<3> x, entry_type c)
       {
-        if (childs[0])
+        if (have_childs) // quick check without locking
           {
             // directly send to childs:
             int childnum  = 0;
@@ -465,6 +778,21 @@ namespace ngsbem
             return;
           }
+        lock_guard<mutex> guard(node_mutex);
+        if (have_childs) // test again after locking
+          {
+            // directly send to childs:
+            int childnum  = 0;
+            if (x(0) > center(0)) childnum += 1;
+            if (x(1) > center(1)) childnum += 2;
+            if (x(2) > center(2)) childnum += 4;
+            childs[childnum] -> AddCharge(x, c);
+            return;
+          }
         charges.Append( tuple{x,c} );
         // if (r*mp.Kappa() < 1e-8) return;
@@ -489,7 +817,7 @@ namespace ngsbem
       void AddDipole (Vec<3> x, Vec<3> d, entry_type c)
       {
-        if (childs[0])
+        if (have_childs)
           {
             // directly send to childs:
@@ -501,6 +829,23 @@ namespace ngsbem
             return;
           }
+        lock_guard<mutex> guard(node_mutex);
+        if (have_childs)
+          {
+            // directly send to childs:
+            int childnum  = 0;
+            if (x(0) > center(0)) childnum += 1;
+            if (x(1) > center(1)) childnum += 2;
+            if (x(2) > center(2)) childnum += 4;
+            childs[childnum] -> AddDipole(x, d, c);
+            return;
+          }
         dipoles.Append (tuple{x,d,c});
         if (dipoles.Size() < maxdirect || r < 1e-8)
@@ -520,6 +865,7 @@ namespace ngsbem
         currents.SetSize0();
       }
+      // not parallel yet
       void AddCurrent (Vec<3> sp, Vec<3> ep, Complex j, int num)
       {
         if (childs[0])
@@ -549,7 +895,7 @@ namespace ngsbem
                 }
             return;
           }
         currents.Append (tuple{sp,ep,j,num});
         // if (currents.Size() < maxdirect || r < 1e-8)
@@ -583,26 +929,74 @@ namespace ngsbem
             return sum;
           }
-        // static Timer t("fmm direct eval"); RegionTimer reg(t);
-        if (mp.Kappa() < 1e-8)
+        {
+          // static Timer t("fmm direct eval"); RegionTimer reg(t);
+          // t.AddFlops (charges.Size());
+        if (simd_charges.Size())
+          {
+            simd_entry_type vsum{0.0};
+            if (mp.Kappa() < 1e-8)
+              for (auto [x,c] : simd_charges)
+                {
+                  auto rho = L2Norm(p-x);
+                  auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW> (1,rho*mp.Kappa()) / rho;
+                  kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
+                  vsum += kernel * c;
+                }
+            else
+              for (auto [x,c] : simd_charges)
+                {
+                  auto rho = L2Norm(p-x);
+                  auto [si,co] = sincos(rho*mp.Kappa());
+                  auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) / rho;
+                  kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
+                  vsum += kernel * c;
+                }
+            sum += HSum(vsum);
+          }
+        else
+          {
+            if (mp.Kappa() < 1e-8)
+              {
+                for (auto [x,c] : charges)
+                  if (double rho = L2Norm(p-x); rho > 0)
+                    sum += (1/(4*M_PI))*Complex(1,rho*mp.Kappa()) / rho * c;
+              }
+            else
+              for (auto [x,c] : charges)
+                if (double rho = L2Norm(p-x); rho > 0)
+                  sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
+          }
+        }
+        if (simd_dipoles.Size())
+        {
+          simd_entry_type vsum{0.0};
+          for (auto [x,d,c] : simd_dipoles)
           {
-            for (auto [x,c] : charges)
-              if (double rho = L2Norm(p-x); rho > 0)
-                sum += (1/(4*M_PI))*Complex(1,rho*mp.Kappa()) / rho * c;
+            auto rho = L2Norm(p-x);
+            auto drhodp = (1.0/rho) * (p-x);
+            auto [si,co] = sincos(rho*mp.Kappa());
+            auto dGdrho = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) *
+                          (-1.0/(rho*rho) + SIMD<Complex,FMM_SW>(0, mp.Kappa())/rho);
+            auto kernel = dGdrho * InnerProduct(drhodp, d);
+            kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
+            vsum += kernel * c;
           }
+          sum += HSum(vsum);
+        }
         else
-          for (auto [x,c] : charges)
-            if (double rho = L2Norm(p-x); rho > 0)
-              sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
-        for (auto [x,d,c] : dipoles)
+        {
+          for (auto [x,d,c] : dipoles)
           if (double rho = L2Norm(p-x); rho > 0)
-            {
+          {
               Vec<3> drhodp = 1.0/rho * (p-x);
               Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
-                (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
+              (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
               sum += dGdrho * InnerProduct(drhodp, d) * c;
-            }
+          }
+        }
         for (auto [sp,ep,j,num] : currents)
           {
@@ -664,23 +1058,27 @@ namespace ngsbem
             }
       }
-      void CalcMP()
+      void CalcMP(Array<RecordingSS> * recording, Array<Node*> * nodes_to_process)
       {
-        mp.SH().Coefs() = 0.0;
+        // mp.SH().Coefs() = 0.0;
         if (childs[0])
           {
-            if (total_sources < 1000)
+            if (total_sources < 1000 || recording)
               for (auto & child : childs)
-                child->CalcMP();
+                child->CalcMP(recording, nodes_to_process);
             else
               ParallelFor (8, [&] (int nr)
                            {
-                             childs[nr] -> CalcMP();
+                             childs[nr] -> CalcMP(recording, nodes_to_process);
                            });
-            for (auto & child : childs)
-              child->mp.TransformAdd(mp, center-child->center);
+            for (auto & child : childs){
+              if (recording && child->mp.SH().Coefs().Size() > 0)
+                *recording += RecordingSS(&child->mp, &mp, center-child->center);
+              else
+                child->mp.TransformAdd(mp, center-child->center);
+            }
           }
         else
           {
@@ -690,14 +1088,54 @@ namespace ngsbem
                 return;
               }
-            for (auto [x,c] : charges)
-              mp.AddCharge (x-center,c);
+            // make simd charges, comment this block for testing ...
+            simd_charges.SetSize( (charges.Size()+FMM_SW-1)/FMM_SW);
+            size_t i = 0, ii = 0;
+            for ( ; i+FMM_SW <= charges.Size(); i+=FMM_SW, ii++)
+              {
+                std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
+                for (int j = 0; j < FMM_SW; j++) ca[j] = charges[i+j];
+                simd_charges[ii] = MakeSimd(ca);
+              }
+            if (i < charges.Size())
+              {
+                std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
+                int j = 0;
+                for ( ; i+j < charges.Size(); j++) ca[j] = charges[i+j];
+                for ( ; j < FMM_SW; j++) ca[j] = tuple( get<0>(ca[0]), entry_type{0.0} );
+                simd_charges[ii] = MakeSimd(ca);
+              }
+            simd_dipoles.SetSize( (dipoles.Size()+FMM_SW-1)/FMM_SW);
+            i = 0, ii = 0;
+            for ( ; i+FMM_SW <= dipoles.Size(); i+=FMM_SW, ii++)
+              {
+                std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
+                for (int j = 0; j < FMM_SW; j++) di[j] = dipoles[i+j];
+                simd_dipoles[ii] = MakeSimd(di);
+              }
+            if (i < dipoles.Size())
+              {
+                std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
+                int j = 0;
+                for ( ; i+j < dipoles.Size(); j++) di[j] = dipoles[i+j];
+                for ( ; j < FMM_SW; j++) di[j] = tuple( get<0>(di[0]), get<1>(di[0]), entry_type{0.0} );
+                simd_dipoles[ii] = MakeSimd(di);
+              }
-            for (auto [x,d,c] : dipoles)
-              mp.AddDipole (x-center, d, c);
+            if (nodes_to_process)
+                *nodes_to_process += this;
+            else {
+              for (auto [x,c] : charges)
+                mp.AddCharge (x-center,c);
+              for (auto [x,d,c] : dipoles)
+                mp.AddDipole (x-center, d, c);
-            for (auto [sp,ep,j,num] : currents)
-              mp.AddCurrent (sp-center, ep-center, j, num);
+              for (auto [sp,ep,j,num] : currents)
+                mp.AddCurrent (sp-center, ep-center, j, num);
+            }
           }
       }
@@ -836,6 +1274,10 @@ namespace ngsbem
     void CalcMP()
     {
       static Timer t("mptool compute singular MLMP"); RegionTimer rg(t);
+      static Timer ts2mp("mptool compute singular MLMP - source2mp");
+      static Timer tS2S("mptool compute singular MLMP - S->S");
+      static Timer trec("mptool comput singular recording");
+      static Timer tsort("mptool comput singular sort");
       /*
       int maxlevel = 0;
@@ -847,7 +1289,87 @@ namespace ngsbem
       */
       root.CalcTotalSources();
-      root.CalcMP();
+      if (false)
+        // direct evaluation of S->S
+        root.CalcMP(nullptr, nullptr);
+      else
+        {
+      Array<RecordingSS> recording;
+      Array<Node*> nodes_to_process;
+      {
+        RegionTimer reg(trec);
+      root.CalcMP(&recording, &nodes_to_process);
+      }
+      {
+        RegionTimer rs2mp(ts2mp);
+        ParallelFor(nodes_to_process.Size(), [&](int i){
+          auto node = nodes_to_process[i];
+          for (auto [x,c]: node->charges)
+            node->mp.AddCharge(x-node->center, c);
+          for (auto [x,d,c]: node->dipoles)
+            node->mp.AddDipole(x-node->center, d, c);
+          for (auto [sp,ep,j,num]: node->currents)
+            node->mp.AddCurrent(sp-node->center, ep-node->center, j, num);
+        }, TasksPerThread(4));
+      }
+      {
+      RegionTimer reg(tsort);
+      QuickSort (recording, [] (auto & a, auto & b)
+      {
+        if (a.len < (1-1e-8) * b.len) return true;
+        if (a.len > (1+1e-8) * b.len) return false;
+        return a.theta < b.theta;
+      });
+      }
+      double current_len = -1e100;
+      double current_theta = -1e100;
+      Array<RecordingSS*> current_batch;
+      Array<Array<RecordingSS*>> batch_group;
+      Array<double> group_lengths;
+      Array<double> group_thetas;
+      for (auto & record : recording)
+        {
+          bool len_changed = fabs(record.len - current_len) > 1e-8;
+          bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
+          if ((len_changed || theta_changed) && current_batch.Size() > 0) {
+            batch_group.Append(current_batch);
+            group_lengths.Append(current_len);
+            group_thetas.Append(current_theta);
+            current_batch.SetSize(0);
+            }
+          current_len = record.len;
+          current_theta = record.theta;
+          current_batch.Append(&record);
+        }
+      if (current_batch.Size() > 0) {
+        batch_group.Append(current_batch);
+        group_lengths.Append(current_len);
+        group_thetas.Append(current_theta);
+      }
+      {
+        RegionTimer rS2S(tS2S);
+      // ParallelFor(batch_group.Size(), [&](int i) {
+      for (int i = 0; i < batch_group.Size(); i++){
+          // *testout << "Processing batch " << i << " of size " << batch_group[i].Size() << ", with len = " << group_lengths[i] << ", theta = " << group_thetas[i] << endl;
+        int chunk_size = 24;
+        if (batch_group[i].Size() < chunk_size)
+            ProcessBatch(batch_group[i], group_lengths[i], group_thetas[i]);
+        else
+          ParallelForRange(IntRange(batch_group[i].Size()), [&](IntRange range) {
+              auto sub_batch = batch_group[i].Range(range.First(), range.Next());
+              ProcessBatch(sub_batch, group_lengths[i], group_thetas[i]);
+          }, TasksPerThread(4));
+      }
+      }
+        }
       havemp = true;
     }
@@ -877,6 +1399,143 @@ namespace ngsbem
   class NGS_DLL_HEADER RegularMLMultiPole
   {
     static Array<size_t> nodes_on_level;
+    // One recorded singular->regular translation: source expansion, target
+    // expansion, the shift vector, and the shift's spherical coordinates.
+    struct RecordingRS
+    {
+      const MultiPole<MPSingular,elem_type> * mpS;
+      MultiPole<MPRegular,elem_type> * mpR;
+      Vec<3> dist;
+      double len, theta, phi;
+
+      RecordingRS() = default;
+      RecordingRS (const MultiPole<MPSingular,elem_type> * ampS,
+                   MultiPole<MPRegular,elem_type> * ampR,
+                   Vec<3> adist)
+        : mpS(ampS), mpR(ampR), dist(adist)
+      {
+        auto [alen, atheta, aphi] = SphericalCoordinates(dist);
+        len = alen;
+        theta = atheta;
+        phi = aphi;
+      }
+    };
+    // Applies a group of recorded singular->regular translations that share the
+    // same len/theta.
+    //   N = batch.Size() * VecLength<elem_type> is the number of Complex columns
+    //   needed by the vectorized kernel.
+    //   - a single record (or N <= 1) is handled by the scalar TransformAdd;
+    //   - otherwise the smallest instantiated width (3, 4, 6, 12, 24, 48, 96, 192)
+    //     that holds N columns is used;
+    //   - more than 192 columns: the batch is cut into chunks of 192/vec_length
+    //     records which are processed through ParallelFor.
+    // The scalar path accumulates atomically (atomic = true), like
+    // SingularMLMultiPole::ProcessBatch does: chunks of a split batch run
+    // concurrently and the vectorized path adds into the targets with AtomicAdd,
+    // so a trailing chunk of one record must not do a plain += on a target that
+    // another chunk may be updating at the same time.
+    static void ProcessBatchRS(FlatArray<RecordingRS*> batch, double len, double theta) {
+      // static Timer t("ProcessBatchRS"); RegionTimer reg(t, batch.Size());
+      constexpr int vec_length = VecLength<elem_type>;
+      int batch_size = batch.Size();
+      int N = batch_size * vec_length;
+      if (N <= 1 || batch_size <= 1) {
+        for (auto* rec : batch) {
+          rec->mpS->TransformAdd(*rec->mpR, rec->dist, true);
+        }
+      }
+      else if (N <= 3) {
+        ProcessVectorizedBatch<3, vec_length>(batch, len, theta);
+      }
+      else if (N <= 4) {
+        ProcessVectorizedBatch<4, vec_length>(batch, len, theta);
+      }
+      else if (N <= 6) {
+        ProcessVectorizedBatch<6, vec_length>(batch, len, theta);
+      }
+      else if (N <= 12) {
+        ProcessVectorizedBatch<12, vec_length>(batch, len, theta);
+      }
+      else if (N <= 24) {
+        ProcessVectorizedBatch<24, vec_length>(batch, len, theta);
+      }
+      else if (N <= 48) {
+        ProcessVectorizedBatch<48, vec_length>(batch, len, theta);
+      }
+      else if (N <= 96) {
+        ProcessVectorizedBatch<96, vec_length>(batch, len, theta);
+      }
+      else if (N <= 192) {
+        ProcessVectorizedBatch<192, vec_length>(batch, len, theta);
+      }
+      else {
+        // Split large batches into chunks of at most 192 columns, one task per chunk
+        size_t chunksize = 192/vec_length;
+        size_t num = (batch.Size()+chunksize-1) / chunksize;
+        ParallelFor (num, [&](int i)
+        {
+          ProcessBatchRS(batch.Range(i*chunksize, min((i+1)*chunksize, batch.Size())), len, theta);
+        }, num);
+      }
+    }
+    // Vectorized singular->regular translation of all records in 'batch', which
+    // share len and theta.  Record i occupies columns [i*vec_length, (i+1)*vec_length)
+    // of multipoles with Vec<N,Complex> entries:
+    //   1. each source is copied in, row-wise scaled by the RotateZ(phi) factors;
+    //   2. the packed expansion is rotated about y by theta, shifted along z by
+    //      -len into the regular expansion, and rotated back by -theta;
+    //   3. each record's columns are scaled by the RotateZ(-phi) factors and
+    //      accumulated into its target with AtomicAdd.
+    // Order, kappa and rtyp of the packed expansions are taken from record 0.
+    // NOTE(review): this assumes all records in the batch agree with record 0
+    // in these parameters -- confirm against the code that forms the batches.
+    template<int N, int vec_length>
+    static void ProcessVectorizedBatch(FlatArray<RecordingRS*> batch, double len, double theta) {
+      // static Timer t("ProcessVectorizedBatch, N = "+ToString(N) + ", vec_len = " + ToString(vec_length));
+      // RegionTimer reg(t, batch[0]->mpS->SH().Order());
+      MultiPole<MPSingular, Vec<N,Complex>> vec_source(batch[0]->mpS->Order(), batch[0]->mpS->Kappa(), batch[0]->mpS->RTyp());
+      MultiPole<MPRegular, Vec<N,Complex>> vec_target(batch[0]->mpR->Order(), batch[0]->mpR->Kappa(), batch[0]->mpR->RTyp());
+
+      // The callback form of RotateZ is const and only enumerates (index, factor)
+      // pairs from the expansion order, so it is called on record 0's target
+      // directly.  Previously a full copy of *batch[0]->mpR was made just for
+      // this; that copied every coefficient per call and read coefficients that
+      // other threads may be updating through AtomicAdd.
+      const auto & order_sh = batch[0]->mpR->SH();
+
+      // Copy multipoles into vectorized multipole
+      for (int i = 0; i < batch.Size(); i++)
+      {
+        auto source_i = VecVector2Matrix (batch[i]->mpS->SH().Coefs());
+        auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
+        batch[i]->mpS->SH().RotateZ(batch[i]->phi,
+            [source_i, source_mati] (size_t ii, Complex factor)
+            {
+                source_mati.Row(ii) = factor * source_i.Row(ii);
+            });
+      }
+
+      vec_source.SH().RotateY(theta);
+      vec_source.ShiftZ(-len, vec_target);
+      vec_target.SH().RotateY(-theta);
+
+      // Copy vectorized multipole into individual multipoles
+      for (int i = 0; i < batch.Size(); i++) {
+        auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
+        auto targeti = VecVector2Matrix(batch[i]->mpR->SH().Coefs());
+        order_sh.RotateZ(-batch[i]->phi,
+                         [source_mati, targeti] (size_t ii, Complex factor)
+                         {
+                           AtomicAdd (VectorView(targeti.Row(ii)), factor * source_mati.Row(ii));
+                         });
+      }
+    }
     struct Node
     {
@@ -887,6 +1546,8 @@ namespace ngsbem
       MultiPole<MPRegular,elem_type> mp;
       Array<Vec<3>> targets;
       int total_targets;
+      std::mutex node_mutex;
+      atomic<bool> have_childs{false};
       Array<const typename SingularMLMultiPole<elem_type>::Node*> singnodes;
@@ -911,13 +1572,15 @@ namespace ngsbem
             cc(2) += (i&4) ? r/2 : -r/2;
             childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
           }
+        have_childs = true;
       }
-      void AddSingularNode (const typename SingularMLMultiPole<elem_type>::Node & singnode, bool allow_refine)
+      void AddSingularNode (const typename SingularMLMultiPole<elem_type>::Node & singnode, bool allow_refine,
+                            Array<RecordingRS> * recording)  // recording: if non-null, transforms are stored there instead of being applied; nullptr = apply directly
       {
         if (mp.SH().Order() < 0) return;
         if (singnode.mp.SH().Order() < 0) return;
-        if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
+        // early exit on all-zero source coefficients is disabled (NOTE(review): reason not visible here): if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
         if (level > 20)
           {
             singnodes.Append(&singnode);
@@ -936,12 +1599,15 @@ namespace ngsbem
                 singnode.childs[0]->mp.Order() < singnode.mp.Order())
               {
                 for (auto & child : singnode.childs)
-                  AddSingularNode (*child, allow_refine);
+                  AddSingularNode (*child, allow_refine, recording);
                 return;
               }
             // static Timer t("mptool transform Helmholtz-criterion"); RegionTimer r(t);
-            singnode.mp.TransformAdd(mp, dist);
+            if (recording)  // deferred mode: remember (source mp, target mp, dist) for later batch processing
+              *recording += RecordingRS(&singnode.mp, &mp, dist);
+            else
+              singnode.mp.TransformAdd(mp, dist);
             return;
           }
@@ -960,21 +1626,21 @@ namespace ngsbem
                   CreateChilds();
                 for (auto & ch : childs)
-                  ch -> AddSingularNode (singnode, allow_refine);
+                  ch -> AddSingularNode (singnode, allow_refine, recording);
               }
             else
               {
-                if (total_targets < 1000)
+                if (total_targets < 1000 || recording)  // recording always takes the serial loop — presumably because *recording is appended to without locking; confirm
                   {
                     for (auto & ch : childs)
                       if (ch)
-                        ch -> AddSingularNode (singnode, allow_refine);
+                        ch -> AddSingularNode (singnode, allow_refine, recording);
                   }
                 else
                   ParallelFor (8, [&] (int nr)
                                {
                                  if (childs[nr])
-                                   childs[nr] -> AddSingularNode (singnode, allow_refine);
+                                   childs[nr] -> AddSingularNode (singnode, allow_refine, recording);
                                });
                 if (targets.Size())
@@ -984,7 +1650,7 @@ namespace ngsbem
         else
           {
             for (auto & childsing : singnode.childs)
-              AddSingularNode (*childsing, allow_refine);
+              AddSingularNode (*childsing, allow_refine, recording);
           }
       }
@@ -996,12 +1662,22 @@ namespace ngsbem
         if (childs[0])
           {
-            for (auto & ch : childs)
+            if (total_targets < 1000)
               {
-                if (L2Norm(mp.SH().Coefs()) > 0)
-                  mp.TransformAdd (ch->mp, ch->center-center);
-                ch->LocalizeExpansion(allow_refine);
+                for (int nr = 0; nr < 8; nr++)
+                  {
+                    if (L2Norm(mp.SH().Coefs()) > 0)
+                      mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
+                    childs[nr]->LocalizeExpansion(allow_refine);
+                  }
               }
+            else
+              ParallelFor(8, [&] (int nr)
+              {
+                if (L2Norm(mp.SH().Coefs()) > 0)
+                  mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
+                childs[nr]->LocalizeExpansion(allow_refine);
+              });
             mp = MultiPole<MPRegular,elem_type>(-1, mp.Kappa(), 1.);
             //mp.SH().Coefs()=0.0;
           }
@@ -1009,18 +1685,8 @@ namespace ngsbem
       elem_type Evaluate (Vec<3> p) const
       {
-        // *testout << "eval p = " << p << ", level = " << level << ", center = " << center <<  ", r = " << r << endl;
         elem_type sum{0.0};
-        /*
-        if (childs[0])
-          {
-            int childnum = 0;
-            if (p(0) > center(0)) childnum += 1;
-            if (p(1) > center(1)) childnum += 2;
-            if (p(2) > center(2)) childnum += 4;
-            sum = childs[childnum]->Evaluate(p);
-          }
-        */
         int childnum = 0;
         if (p(0) > center(0)) childnum += 1;
         if (p(1) > center(1)) childnum += 2;
@@ -1030,8 +1696,6 @@ namespace ngsbem
         else
           sum = mp.Eval(p-center);
-        // static Timer t("mptool direct evaluate"); RegionTimer r(t);
         for (auto sn : singnodes)
           sum += sn->EvaluateMP(p);
@@ -1080,7 +1744,8 @@ namespace ngsbem
       void AddTarget (Vec<3> x)
       {
-        if (childs[0])
+        // if (childs[0])
+        if (have_childs) // quick check without locking
           {
             // directly send to childs:
             int childnum  = 0;
@@ -1091,6 +1756,20 @@ namespace ngsbem
             return;
           }
+        lock_guard<mutex> guard(node_mutex);
+        if (have_childs) // test again after locking
+        {
+          // directly send to childs:
+          int childnum  = 0;
+          if (x(0) > center(0)) childnum += 1;
+          if (x(1) > center(1)) childnum += 2;
+          if (x(2) > center(2)) childnum += 4;
+          childs[childnum] -> AddTarget(x);
+          return;
+        }
         targets.Append( x );
         // if (r*mp.Kappa() < 1e-8) return;
@@ -1158,8 +1837,8 @@ namespace ngsbem
       nodes_on_level = 0;
       nodes_on_level[0] = 1;
       {
-        static Timer t("mptool compute regular MLMP"); RegionTimer rg(t);
-        root.AddSingularNode(singmp->root, true);
+        static Timer t("mptool compute regular MLMP"); RegionTimer rg(t);
+        root.AddSingularNode(singmp->root, true, nullptr);
         // cout << "norm after S->R conversion: " << root.Norm() << endl;
       }
@@ -1195,14 +1874,69 @@ namespace ngsbem
     void CalcMP(shared_ptr<SingularMLMultiPole<elem_type>> asingmp)
     {
       static Timer t("mptool regular MLMP"); RegionTimer rg(t);
+      static Timer trec("mptool regular MLMP - recording");
+      static Timer tsort("mptool regular MLMP - sort");
       singmp = asingmp;
       root.CalcTotalTargets();
       root.RemoveEmptyTrees();
-      root.AddSingularNode(singmp->root, false);
+      // root.AddSingularNode(singmp->root, false, nullptr);  // direct (unrecorded) alternative to the batched path below
+      // /*
+      Array<RecordingRS> recording;  // transforms collected by the tree traversal; sorted, grouped and processed in batches below
+      {
+        RegionTimer rrec(trec);
+        root.AddSingularNode(singmp->root, false, &recording);  // non-null pointer: transforms are recorded, not applied
+      }
+      // cout << "recorded: " << recording.Size() << endl;
+      {
+      RegionTimer reg(tsort);
+      QuickSort (recording, [] (auto & a, auto & b)  // order by len (relative tolerance 1e-8), ties broken by theta
+      {
+        if (a.len < (1-1e-8) * b.len) return true;
+        if (a.len > (1+1e-8) * b.len) return false;
+        return a.theta < b.theta;  // NOTE(review): tolerance-based comparison is not guaranteed to be a strict weak ordering — confirm QuickSort tolerates this
+      });
+      }
+      double current_len = -1e100;  // sentinel: differs from any recorded len
+      double current_theta = -1e100;  // sentinel: differs from any recorded theta
+      Array<RecordingRS*> current_batch;
+      Array<Array<RecordingRS*>> batch_group;
+      Array<double> group_lengths;
+      Array<double> group_thetas;
+      for (auto & record : recording)
+        {
+          bool len_changed = fabs(record.len - current_len) > 1e-8;  // NOTE(review): absolute tolerance here, relative tolerance in the sort above
+          bool theta_changed = fabs(record.theta - current_theta) > 1e-8;  // compared against the previous record, not the first record of the batch
+          if ((len_changed || theta_changed) && current_batch.Size() > 0) {
+            // batch finished: store it with its len/theta; processing is deferred to the ParallelFor below (was: ProcessBatch(current_batch, current_len, current_theta);)
+            batch_group.Append(current_batch);  // presumably copies the pointer list, since current_batch is cleared and reused — confirm
+            group_lengths.Append(current_len);
+            group_thetas.Append(current_theta);
+            current_batch.SetSize(0);
+            }
+          current_len = record.len;
+          current_theta = record.theta;
+          current_batch.Append(&record);
+        }
+      if (current_batch.Size() > 0) {
+        // store the trailing batch that the loop above did not flush (was: ProcessBatch(current_batch, current_len, current_theta);)
+        batch_group.Append(current_batch);
+        group_lengths.Append(current_len);
+        group_thetas.Append(current_theta);
+      }
+      ParallelFor(batch_group.Size(), [&](int i) {  // one call per (len, theta) batch
+          ProcessBatchRS(batch_group[i], group_lengths[i], group_thetas[i]);
+      }, TasksPerThread(4));
+      // */
       /*
       int maxlevel = 0;
       for (auto [i,num] : Enumerate(RegularMLMultiPole::nodes_on_level))
@@ -1212,7 +1946,7 @@ namespace ngsbem
         cout << "reg " << i << ": " << RegularMLMultiPole::nodes_on_level[i] << endl;
       */
-      static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
+      static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
       root.LocalizeExpansion(false);
     }
@@ -1246,6 +1980,7 @@ namespace ngsbem
   };
   template <typename elem_type>
   inline ostream & operator<< (ostream & ost, const RegularMLMultiPole<elem_type> & mlmp)
   {