ngsolve 6.2.2504.post44.dev0__cp311-cp311-win_amd64.whl → 6.2.2601__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. netgen/include/analytic_integrals.hpp +10 -0
  2. netgen/include/basematrix.hpp +6 -0
  3. netgen/include/bdbequations.hpp +55 -0
  4. netgen/include/bem_diffops.hpp +475 -0
  5. netgen/include/bilinearform.hpp +4 -1
  6. netgen/include/bspline.hpp +2 -0
  7. netgen/include/code_generation.hpp +2 -2
  8. netgen/include/complex_wrapper.hpp +30 -2
  9. netgen/include/contact.hpp +8 -0
  10. netgen/include/diagonalmatrix.hpp +6 -0
  11. netgen/include/diffop_impl.hpp +3 -1
  12. netgen/include/diffopwithfactor.hpp +123 -0
  13. netgen/include/elementbyelement.hpp +9 -3
  14. netgen/include/expr.hpp +45 -7
  15. netgen/include/fespace.hpp +9 -2
  16. netgen/include/gridfunction.hpp +3 -3
  17. netgen/include/h1amg.hpp +24 -1
  18. netgen/include/h1lumping.hpp +6 -0
  19. netgen/include/hcurl_equations.hpp +29 -0
  20. netgen/include/hcurlcurlfe.hpp +20 -0
  21. netgen/include/hdivfe_utils.hpp +1 -0
  22. netgen/include/hdivhofespace.hpp +2 -0
  23. netgen/include/kernels.hpp +724 -0
  24. netgen/include/l2hofe.hpp +1 -0
  25. netgen/include/matrix.hpp +8 -3
  26. netgen/include/meshaccess.hpp +1 -1
  27. netgen/include/mp_coefficient.hpp +24 -19
  28. netgen/include/mptools.hpp +1255 -237
  29. netgen/include/mycomplex.hpp +1 -1
  30. netgen/include/ngblas.hpp +116 -7
  31. netgen/include/potentialtools.hpp +2 -2
  32. netgen/include/preconditioner.hpp +2 -2
  33. netgen/include/prolongation.hpp +6 -3
  34. netgen/include/recursive_pol.hpp +63 -11
  35. netgen/include/simd_complex.hpp +45 -0
  36. netgen/include/sparsecholesky.hpp +6 -2
  37. netgen/include/sparsefactorization_interface.hpp +159 -0
  38. netgen/include/sparsematrix.hpp +21 -7
  39. netgen/include/sparsematrix_dyn.hpp +2 -2
  40. netgen/include/sparsematrix_impl.hpp +100 -33
  41. netgen/include/statushandler.hpp +8 -8
  42. netgen/include/thdivfe_impl.hpp +66 -0
  43. netgen/include/tscalarfe.hpp +1 -1
  44. netgen/include/vector.hpp +272 -47
  45. netgen/lib/libngsolve.lib +0 -0
  46. netgen/libngsolve.dll +0 -0
  47. netgen/ngscxx.bat +1 -1
  48. netgen/ngsld.bat +1 -1
  49. ngsolve/cmake/NGSolveConfig.cmake +8 -8
  50. ngsolve/cmake/ngsolve-targets.cmake +17 -10
  51. ngsolve/config/config.py +8 -8
  52. ngsolve/demos/intro/cmagnet.py +19 -22
  53. ngsolve/directsolvers.py +9 -21
  54. ngsolve/krylovspace.py +172 -3
  55. ngsolve/ngslib.pyd +0 -0
  56. ngsolve/nonlinearsolvers.py +2 -2
  57. ngsolve/solve_implementation.py +14 -1
  58. ngsolve/{solvers.py → solvers/__init__.py} +1 -1
  59. ngsolve/solvers/cudss.py +112 -0
  60. ngsolve/webgui.py +1 -0
  61. {ngsolve-6.2.2504.post44.dev0.dist-info → ngsolve-6.2.2601.dist-info}/METADATA +2 -2
  62. {ngsolve-6.2.2504.post44.dev0.dist-info → ngsolve-6.2.2601.dist-info}/RECORD +94 -88
  63. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/Scripts/ngsolve.tcl +0 -0
  64. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/beam.geo +0 -0
  65. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/beam.vol +0 -0
  66. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/chip.in2d +0 -0
  67. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/chip.vol +0 -0
  68. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coil.geo +0 -0
  69. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coil.vol +0 -0
  70. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coilshield.geo +0 -0
  71. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coilshield.vol +0 -0
  72. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/cube.geo +0 -0
  73. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/cube.vol +0 -0
  74. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d10_DGdoubleglazing.pde +0 -0
  75. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d11_chip_nitsche.pde +0 -0
  76. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d1_square.pde +0 -0
  77. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d2_chip.pde +0 -0
  78. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d3_helmholtz.pde +0 -0
  79. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d4_cube.pde +0 -0
  80. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d5_beam.pde +0 -0
  81. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d6_shaft.pde +0 -0
  82. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d7_coil.pde +0 -0
  83. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d8_coilshield.pde +0 -0
  84. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d9_hybridDG.pde +0 -0
  85. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/doubleglazing.in2d +0 -0
  86. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/doubleglazing.vol +0 -0
  87. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/piezo2d40round4.vol.gz +0 -0
  88. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/shaft.geo +0 -0
  89. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/shaft.vol +0 -0
  90. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/square.in2d +0 -0
  91. {ngsolve-6.2.2504.post44.dev0.data → ngsolve-6.2.2601.data}/data/share/ngsolve/square.vol +0 -0
  92. {ngsolve-6.2.2504.post44.dev0.dist-info → ngsolve-6.2.2601.dist-info}/LICENSE +0 -0
  93. {ngsolve-6.2.2504.post44.dev0.dist-info → ngsolve-6.2.2601.dist-info}/WHEEL +0 -0
  94. {ngsolve-6.2.2504.post44.dev0.dist-info → ngsolve-6.2.2601.dist-info}/top_level.txt +0 -0
@@ -11,15 +11,77 @@
11
11
  #include <recursive_pol.hpp>
12
12
 
13
13
 
14
- namespace ngcomp
15
- {
16
- class Region;
17
- }
18
-
19
14
  namespace ngsbem
20
15
  {
21
16
  using namespace ngfem;
22
17
 
18
+ template<typename T>
19
+ constexpr int VecLength = 1; // Default: Complex has length 1
20
+
21
+ template<int N>
22
+ constexpr int VecLength<Vec<N, Complex>> = N; // Specialization: Vec<N,Complex> has length N
23
+
24
+
25
+
26
+ constexpr int FMM_SW = 4;
27
+
28
+
29
+
30
+ // ************************ SIMD - creation (should end up in simd.hpp) *************
31
+
32
+
33
+ template <int S, typename T, int SW>
34
+ Vec<S,T> HSum (Vec<S,SIMD<T,SW>> v)
35
+ {
36
+ Vec<S,T> res;
37
+ for (int i = 0; i < S; i++)
38
+ res(i) = HSum(v(i));
39
+ // Iterate<S> ([&](auto i) {
40
+ // res.HTData().template Elem<i.value>() = HSum(v.HTData().template Elem<i.value>());
41
+ // });
42
+ return res;
43
+ }
44
+
45
+
46
+ class NGS_DLL_HEADER PrecomputedSqrts
47
+ {
48
+ public:
49
+ Array<double> sqrt_int;
50
+ // Array<double> inv_sqrt_int;
51
+ Array<double> sqrt_n_np1; // sqrt(n*(n+1))
52
+ Array<double> inv_sqrt_2np1_2np3; // 1/sqrt( (2n+1)*(2n+3) )
53
+
54
+ PrecomputedSqrts();
55
+ };
56
+
57
+ extern NGS_DLL_HEADER PrecomputedSqrts presqrt;
58
+
59
+
60
+
61
+ class FMM_Parameters
62
+ {
63
+ public:
64
+ int maxdirect = 100;
65
+ int minorder = 20; // order = minorder + 2 kappa r
66
+ };
67
+
68
+
69
+
70
+
71
+ inline std::tuple<double, double, double> SphericalCoordinates(Vec<3> dist){
72
+ double len, theta, phi;
73
+ len = L2Norm(dist);
74
+ if (len < 1e-30)
75
+ theta = 0;
76
+ else
77
+ theta = acos (dist(2) / len);
78
+ if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
79
+ phi = 0;
80
+ else
81
+ phi = atan2(dist(1), dist(0));
82
+ return {len, theta, phi};
83
+ }
84
+
23
85
 
24
86
  template <typename entry_type = Complex>
25
87
  class NGS_DLL_HEADER SphericalHarmonics
@@ -84,23 +146,91 @@ namespace ngsbem
84
146
 
85
147
  void Calc (Vec<3> x, FlatVector<Complex> shapes);
86
148
 
87
-
149
+
150
+ void FlipZ ();
88
151
  void RotateZ (double alpha);
89
- void RotateY (double alpha);
152
+
153
+ template <typename FUNC>
154
+ void RotateZ (double alpha, FUNC func) const
155
+ {
156
+ if (order < 0) return;
157
+
158
+ Vector<Complex> exp_imalpha(order+1);
159
+ Complex exp_ialpha(cos(alpha), sin(alpha));
160
+ Complex prod = 1.0;
161
+ for (int i = 0; i <= order; i++)
162
+ {
163
+ exp_imalpha(i) = prod;
164
+ prod *= exp_ialpha;
165
+ }
166
+
167
+ int ii = 0;
168
+ for (int n = 0; n <= order; n++)
169
+ {
170
+ for (int m = -n; m < 0; m++, ii++)
171
+ func(ii, conj(exp_imalpha(-m)));
172
+ for (int m = 0; m <= n; m++, ii++)
173
+ func(ii, exp_imalpha(m));
174
+ };
175
+ };
176
+
177
+ template <typename FUNC>
178
+ void RotateZFlip (double alpha, bool flip, FUNC func) const
179
+ {
180
+ if (order < 0) return;
181
+
182
+ Vector<Complex> exp_imalpha(order+1);
183
+ Complex exp_ialpha(cos(alpha), sin(alpha));
184
+ Complex prod = 1.0;
185
+ for (int i = 0; i <= order; i++)
186
+ {
187
+ exp_imalpha(i) = prod;
188
+ prod *= exp_ialpha;
189
+ }
190
+
191
+ int ii = 0;
192
+
193
+ auto FlipFactor = [] (int n, int m, bool flip)->double
194
+ {
195
+ if (flip)
196
+ return ((n-m)%2) == 1 ? -1 : 1;
197
+ return 1.0;
198
+ };
199
+
200
+ for (int n = 0; n <= order; n++)
201
+ {
202
+ for (int m = -n; m < 0; m++, ii++)
203
+ func(ii, FlipFactor(n,m,flip)*conj(exp_imalpha(-m)));
204
+ for (int m = 0; m <= n; m++, ii++)
205
+ func(ii, FlipFactor(n,m,flip)*exp_imalpha(m));
206
+ };
207
+ };
90
208
 
91
209
 
210
+
211
+ void RotateY (double alpha, bool parallel = false);
212
+
213
+
214
+
92
215
  static double CalcAmn (int m, int n)
93
216
  {
94
217
  if (m < 0) m=-m;
95
218
  if (n < m) return 0;
96
- return sqrt( (n+1.0+m)*(n+1.0-m) / ( (2*n+1)*(2*n+3) ));
219
+
220
+ if (2*n+1 < presqrt.sqrt_int.Size())
221
+ return presqrt.sqrt_int[n+1+m]*presqrt.sqrt_int[n+1-m] * presqrt.inv_sqrt_2np1_2np3[n];
222
+ else
223
+ return sqrt( (n+1.0+m)*(n+1.0-m) / ( (2*n+1)*(2*n+3) ));
97
224
  }
98
225
 
99
226
  static double CalcBmn (int m, int n)
100
227
  {
101
228
  double sgn = (m >= 0) ? 1 : -1;
102
- if ( (m > n) || (-m > n) ) return 0;
103
- return sgn * sqrt( (n-m-1.0)*(n-m) / ( (2*n-1.0)*(2*n+1)));
229
+ if ( (m >= n) || (-m > n) ) return 0;
230
+ if (n <= presqrt.inv_sqrt_2np1_2np3.Size())
231
+ return sgn * presqrt.sqrt_n_np1[n-m-1] * presqrt.inv_sqrt_2np1_2np3[n-1];
232
+ else
233
+ return sgn * sqrt( (n-m-1.0)*(n-m) / ( (2*n-1.0)*(2*n+1)));
104
234
  }
105
235
 
106
236
  static double CalcDmn (int m, int n)
@@ -119,11 +249,11 @@ namespace ngsbem
119
249
  // https://fortran-lang.discourse.group/t/looking-for-spherical-bessel-and-hankel-functions-of-first-and-second-kind-and-arbitrary-order/2308/2
120
250
  NGS_DLL_HEADER
121
251
  void besseljs3d (int nterms, double z, double scale,
122
- FlatVector<double> fjs, FlatVector<double> fjder);
252
+ SliceVector<double> fjs, SliceVector<double> fjder = FlatVector<double>(0, nullptr));
123
253
 
124
254
  NGS_DLL_HEADER
125
255
  void besseljs3d (int nterms, Complex z, double scale,
126
- FlatVector<Complex> fjs, FlatVector<Complex> fjder);
256
+ SliceVector<Complex> fjs, SliceVector<Complex> fjder = FlatVector<Complex>(0, nullptr));
127
257
 
128
258
 
129
259
  /*
@@ -142,14 +272,17 @@ namespace ngsbem
142
272
  FlatVector<double> jp,
143
273
  FlatVector<double> yp);
144
274
 
145
-
275
+
146
276
 
147
277
  template <typename T>
148
278
  void SphericalBessel (int n, double rho, double scale, T && values)
149
279
  {
280
+ besseljs3d (n, rho, scale, values);
281
+ /*
150
282
  Vector<double> j(n+1), jp(n+1);
151
283
  besseljs3d (n, rho, scale, j, jp);
152
284
  values = j;
285
+ */
153
286
  }
154
287
 
155
288
 
@@ -173,21 +306,6 @@ namespace ngsbem
173
306
  return;
174
307
  }
175
308
  Vector j(n+1), y(n+1), jp(n+1), yp(n+1);
176
- // SBESJY (rho, n, j, y, jp, yp);
177
-
178
- /*
179
- values = j + Complex(0,1) * y;
180
- if (scale != 1.0)
181
- {
182
- double prod = 1.0;
183
- for (int i = 0; i <= n; i++)
184
- {
185
- values(i) *= prod;
186
- prod *= scale;
187
- }
188
- }
189
- */
190
-
191
309
 
192
310
  // the bessel-evaluation with scale
193
311
  besseljs3d (n, rho, 1/scale, j, jp);
@@ -215,7 +333,7 @@ namespace ngsbem
215
333
 
216
334
 
217
335
  // hn1 = jn+ i*yn
218
- class MPSingular
336
+ class Singular
219
337
  {
220
338
  public:
221
339
  template <typename T>
@@ -241,7 +359,7 @@ namespace ngsbem
241
359
 
242
360
 
243
361
  // jn
244
- class MPRegular
362
+ class Regular
245
363
  {
246
364
  public:
247
365
  template <typename T>
@@ -269,14 +387,14 @@ namespace ngsbem
269
387
 
270
388
 
271
389
  template <typename RADIAL, typename entry_type=Complex>
272
- class NGS_DLL_HEADER MultiPole
390
+ class NGS_DLL_HEADER SphericalExpansion
273
391
  {
274
392
  SphericalHarmonics<entry_type> sh;
275
393
  double kappa;
276
394
  double rtyp;
277
395
  public:
278
396
 
279
- MultiPole (int aorder, double akappa, double artyp)
397
+ SphericalExpansion (int aorder, double akappa, double artyp)
280
398
  : sh(aorder), kappa(akappa), rtyp(artyp) { }
281
399
 
282
400
 
@@ -288,15 +406,15 @@ namespace ngsbem
288
406
  double RTyp() const { return rtyp; }
289
407
  int Order() const { return sh.Order(); }
290
408
 
291
- MultiPole Truncate(int neworder) const
409
+ SphericalExpansion Truncate(int neworder) const
292
410
  {
293
411
  if (neworder > sh.Order()) neworder=sh.Order();
294
- MultiPole nmp(neworder, kappa, rtyp);
412
+ SphericalExpansion nmp(neworder, kappa, rtyp);
295
413
  nmp.sh.Coefs() = sh.Coefs().Range(sqr(neworder+1));
296
414
  return nmp;
297
415
  }
298
416
 
299
- MultiPole & operator+= (const MultiPole & mp2)
417
+ SphericalExpansion & operator+= (const SphericalExpansion & mp2)
300
418
  {
301
419
  size_t commonsize = min(SH().Coefs().Size(), mp2.SH().Coefs().Size());
302
420
  SH().Coefs().Range(commonsize) += mp2.SH().Coefs().Range(commonsize);
@@ -307,27 +425,24 @@ namespace ngsbem
307
425
  entry_type EvalDirectionalDerivative (Vec<3> x, Vec<3> d) const;
308
426
 
309
427
  void AddCharge (Vec<3> x, entry_type c);
310
- void AddDipole (Vec<3> x, Vec<3> d, entry_type c);
311
- void AddCurrent (Vec<3> ap, Vec<3> ep, Complex j, int num=100);
312
-
313
- /*
314
- void ChangeScaleTo (double newscale)
428
+ void AddDipole (Vec<3> x, Vec<3> dir, entry_type c);
429
+ void AddChargeDipole (Vec<3> x, entry_type c, Vec<3> dir, entry_type c2)
315
430
  {
316
- double fac = Scale()/newscale;
317
- double prod = 1;
318
- for (int n = 0; n <= sh.Order(); n++, prod*= fac)
319
- sh.CoefsN(n) *= prod;
320
- scale = newscale;
431
+ // TODO: add them at once
432
+ AddCharge (x, c);
433
+ AddDipole (x, dir, c2);
321
434
  }
322
- */
435
+
436
+ void AddPlaneWave (Vec<3> d, entry_type c);
437
+ void AddCurrent (Vec<3> ap, Vec<3> ep, Complex j, int num=100);
438
+
439
+
323
440
  void ChangeRTypTo (double new_rtyp)
324
441
  {
325
- // double fac = Scale()/newscale;
326
442
  double fac = RADIAL::Scale(kappa, rtyp) / RADIAL::Scale(kappa, new_rtyp);
327
443
  double prod = 1;
328
444
  for (int n = 0; n <= sh.Order(); n++, prod*= fac)
329
445
  sh.CoefsN(n) *= prod;
330
- // scale = newscale;
331
446
  rtyp = new_rtyp;
332
447
  }
333
448
 
@@ -346,7 +461,7 @@ namespace ngsbem
346
461
 
347
462
 
348
463
  template <typename TARGET>
349
- void Transform (MultiPole<TARGET,entry_type> & target, Vec<3> dist) const
464
+ void Transform (SphericalExpansion<TARGET,entry_type> & target, Vec<3> dist) const
350
465
  {
351
466
  if (target.SH().Order() < 0) return;
352
467
  if (SH().Order() < 0)
@@ -358,22 +473,11 @@ namespace ngsbem
358
473
  // static Timer t("mptool Transform "+ToString(typeid(RADIAL).name())+ToString(typeid(TARGET).name()));
359
474
  // RegionTimer reg(t);
360
475
 
361
- double len = L2Norm(dist);
362
- double theta, phi;
363
-
364
- if (len < 1e-30)
365
- theta = 0;
366
- else
367
- theta = acos (dist(2) / len);
368
-
369
- if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
370
- phi = 0;
371
- else
372
- phi = atan2(dist(1), dist(0));
476
+ auto [len, theta, phi] = SphericalCoordinates(dist);
373
477
 
374
478
 
375
- // MultiPole<RADIAL,entry_type> tmp{*this};
376
- MultiPole<RADIAL,entry_type> tmp(Order(), kappa, rtyp);
479
+ // SphericalExpansion<RADIAL,entry_type> tmp{*this};
480
+ SphericalExpansion<RADIAL,entry_type> tmp(Order(), kappa, rtyp);
377
481
  tmp.SH().Coefs() = SH().Coefs();
378
482
 
379
483
  tmp.SH().RotateZ(phi);
@@ -386,58 +490,213 @@ namespace ngsbem
386
490
  }
387
491
 
388
492
  template <typename TARGET>
389
- void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist) const
493
+ void TransformAdd (SphericalExpansion<TARGET,entry_type> & target, Vec<3> dist, bool atomic = false) const
390
494
  {
391
495
  if (SH().Order() < 0) return;
392
496
  if (target.SH().Order() < 0) return;
393
497
 
394
- MultiPole<TARGET,entry_type> tmp{target};
498
+ SphericalExpansion<TARGET,entry_type> tmp{target};
395
499
  Transform(tmp, dist);
396
- target.SH().Coefs() += tmp.SH().Coefs();
500
+ if (!atomic)
501
+ target.SH().Coefs() += tmp.SH().Coefs();
502
+ else
503
+ for (int j = 0; j < target.SH().Coefs().Size(); j++)
504
+ AtomicAdd(target.SH().Coefs()[j], tmp.SH().Coefs()[j]);
397
505
  }
398
506
 
399
507
  template <typename TARGET>
400
- void ShiftZ (double z, MultiPole<TARGET,entry_type> & target);
508
+ void ShiftZ (double z, SphericalExpansion<TARGET,entry_type> & target);
509
+
401
510
 
511
+ template <typename TARGET>
512
+ void In2Out (SphericalExpansion<TARGET,entry_type> & target, double r) const
513
+ {
514
+ Vector<Complex> rad(Order()+1);
515
+ Vector<Complex> radout(target.Order()+1);
516
+ RADIAL::Eval(Order(), kappa, r, RTyp(), rad);
517
+ TARGET::Eval(target.Order(), kappa, r, target.RTyp(), radout);
518
+ target.SH().Coefs() = 0;
519
+ for (int j = 0; j <= std::min(Order(), target.Order()); j++)
520
+ target.SH().CoefsN(j) = rad(j)/radout(j) * SH().CoefsN(j);
521
+ }
402
522
  };
403
523
 
404
524
 
405
525
 
406
526
  // ***************** parameters ****************
407
527
 
528
+ /*
408
529
  static constexpr int MPOrder (double rho_kappa)
409
530
  {
410
- return max (20, int(2*rho_kappa));
531
+ // return max (20, int(2*rho_kappa));
532
+ return 20+int(2*rho_kappa);
411
533
  }
412
534
  static constexpr int maxdirect = 100;
535
+ */
536
+
537
+
538
+ template <typename SCAL, auto S>
539
+ inline auto VecVector2Matrix (FlatVector<Vec<S,SCAL>> vec)
540
+ {
541
+ return FlatMatrixFixWidth<S,SCAL> (vec.Size(), vec.Data()->Data());
542
+ }
543
+
544
+ inline auto VecVector2Matrix (FlatVector<Complex> vec)
545
+ {
546
+ return FlatMatrixFixWidth<1,Complex> (vec.Size(), vec.Data());
547
+ }
413
548
 
414
549
 
415
550
  template <typename entry_type=Complex>
416
- class SingularMLMultiPole
551
+ class SingularMLExpansion
417
552
  {
553
+ using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
418
554
  static Array<size_t> nodes_on_level;
419
555
 
556
+ struct RecordingSS
557
+ {
558
+ const SphericalExpansion<Singular,entry_type> * mp_source;
559
+ SphericalExpansion<Singular,entry_type> * mp_target;
560
+ Vec<3> dist;
561
+ double len, theta, phi;
562
+ bool flipz;
563
+ public:
564
+ RecordingSS() = default;
565
+ RecordingSS (const SphericalExpansion<Singular,entry_type> * amp_source,
566
+ SphericalExpansion<Singular,entry_type> * amp_target,
567
+ Vec<3> adist)
568
+ : mp_source(amp_source), mp_target(amp_target), dist(adist)
569
+ {
570
+ std::tie(len, theta, phi) = SphericalCoordinates(adist);
571
+ // flipz = false;
572
+ flipz = theta > M_PI/2;
573
+ if (flipz) theta = M_PI-theta;
574
+ }
575
+ };
576
+
577
+
578
+ static void ProcessBatchSS(FlatArray<RecordingSS*> batch, double len, double theta) {
579
+ constexpr int vec_length = VecLength<entry_type>;
580
+ int batch_size = batch.Size();
581
+ int N = batch_size * vec_length;
582
+ // *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(entry_type).name() << ", len = " << len << ", theta = " << theta << endl;
583
+
584
+ if (N <= 1 || batch_size <= 1) {
585
+ for (auto* rec : batch) {
586
+ rec->mp_source->TransformAdd(*rec->mp_target, rec->dist, true);
587
+ }
588
+ }
589
+ else if (N <= 3) {
590
+ ProcessVectorizedBatchSS<3, vec_length>(batch, len, theta);
591
+ }
592
+ else if (N <= 4) {
593
+ ProcessVectorizedBatchSS<4, vec_length>(batch, len, theta);
594
+ }
595
+ else if (N <= 6) {
596
+ ProcessVectorizedBatchSS<6, vec_length>(batch, len, theta);
597
+ }
598
+ else if (N <= 12) {
599
+ ProcessVectorizedBatchSS<12, vec_length>(batch, len, theta);
600
+ }
601
+ else if (N <= 24) {
602
+ ProcessVectorizedBatchSS<24, vec_length>(batch, len, theta);
603
+ }
604
+ else if (N <= 48) {
605
+ ProcessVectorizedBatchSS<48, vec_length>(batch, len, theta);
606
+ }
607
+ else if (N <= 96) {
608
+ ProcessVectorizedBatchSS<96, vec_length>(batch, len, theta);
609
+ }
610
+ else if (N <= 192) {
611
+ ProcessVectorizedBatchSS<192, vec_length>(batch, len, theta);
612
+ }
613
+ else {
614
+ // Split large batches
615
+ ProcessBatchSS(batch.Range(0, 192 / vec_length), len, theta);
616
+ ProcessBatchSS(batch.Range(192 / vec_length, batch_size), len, theta);
617
+ }
618
+ }
619
+
620
+ template<int N, int vec_length>
621
+ static void ProcessVectorizedBatchSS(FlatArray<RecordingSS*> batch, double len, double theta) {
622
+
623
+ // *testout << "Processing vectorized S->S batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
624
+ double kappa = batch[0]->mp_source->Kappa();
625
+ int so = batch[0]->mp_source->Order();
626
+ int to = batch[0]->mp_target->Order();
627
+ SphericalExpansion<Singular, Vec<N,Complex>> vec_source(so, kappa, batch[0]->mp_source->RTyp());
628
+ SphericalExpansion<Singular, Vec<N,Complex>> vec_target(to, kappa, batch[0]->mp_target->RTyp());
629
+
630
+ // Copy multipoles into vectorized multipole
631
+ for (int i = 0; i < batch.Size(); i++)
632
+ {
633
+ auto source_i = VecVector2Matrix (batch[i]->mp_source->SH().Coefs());
634
+ auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
635
+ batch[i]->mp_source->SH().RotateZFlip(batch[i]->phi, batch[i]->flipz,
636
+ [source_i, source_mati] (size_t ii, Complex factor)
637
+ {
638
+ source_mati.Row(ii) = factor * source_i.Row(ii);
639
+ });
640
+ }
641
+
642
+ vec_source.SH().RotateY(theta, vec_source.SH().Order() >= 100);
643
+ vec_source.ShiftZ(-len, vec_target);
644
+ vec_target.SH().RotateY(-theta, vec_target.SH().Order() >= 100);
645
+
646
+ // Copy vectorized multipole into individual multipoles
647
+ for (int i = 0; i < batch.Size(); i++)
648
+ {
649
+ auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
650
+ auto target_mati = VecVector2Matrix (batch[i]->mp_target->SH().Coefs());
651
+ batch[i]->mp_target->SH().RotateZFlip(-batch[i]->phi, batch[i]->flipz,
652
+ [source_mati, target_mati] (size_t ii, Complex factor)
653
+ {
654
+ AtomicAdd (target_mati.Row(ii), factor * source_mati.Row(ii));
655
+ });
656
+ }
657
+ }
658
+
420
659
  struct Node
421
660
  {
422
661
  Vec<3> center;
423
662
  double r;
424
663
  int level;
425
664
  std::array<unique_ptr<Node>,8> childs;
426
- MultiPole<MPSingular, entry_type> mp;
665
+ SphericalExpansion<Singular, entry_type> mp;
427
666
 
428
667
  Array<tuple<Vec<3>, entry_type>> charges;
429
668
  Array<tuple<Vec<3>, Vec<3>, entry_type>> dipoles;
669
+ Array<tuple<Vec<3>, entry_type, Vec<3>, entry_type>> chargedipoles;
430
670
  Array<tuple<Vec<3>, Vec<3>, Complex,int>> currents;
671
+
672
+ using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
673
+ Array<tuple<Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_charges;
674
+ Array<tuple<Vec<3,SIMD<double,FMM_SW>>, Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_dipoles;
675
+ Array<tuple<Vec<3,SIMD<double,FMM_SW>>, simd_entry_type,
676
+ Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_chargedipoles;
677
+
431
678
  int total_sources;
679
+ const FMM_Parameters & fmm_params;
680
+ std::mutex node_mutex;
681
+ atomic<bool> have_childs{false};
432
682
 
433
- Node (Vec<3> acenter, double ar, int alevel, double akappa)
434
- : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*akappa), akappa, ar) // min(1.0, ar*akappa))
683
+ Node (Vec<3> acenter, double ar, int alevel, double akappa, const FMM_Parameters & afmm_params)
684
+ // : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*akappa), akappa, ar), fmm_params(afmm_params)
685
+ : center(acenter), r(ar), level(alevel), mp(afmm_params.minorder+2*ar*akappa, akappa, ar), fmm_params(afmm_params)
435
686
  {
436
687
  if (level < nodes_on_level.Size())
437
688
  nodes_on_level[level]++;
438
689
  }
439
690
 
440
-
691
+ int GetChildNum (Vec<3> x) const
692
+ {
693
+ int childnum = 0;
694
+ if (x(0) > center(0)) childnum += 1;
695
+ if (x(1) > center(1)) childnum += 2;
696
+ if (x(2) > center(2)) childnum += 4;
697
+ return childnum;
698
+ }
699
+
441
700
  void CreateChilds()
442
701
  {
443
702
  if (childs[0]) throw Exception("have already childs");
@@ -447,20 +706,47 @@ namespace ngsbem
447
706
  cc(0) += (i&1) ? r/2 : -r/2;
448
707
  cc(1) += (i&2) ? r/2 : -r/2;
449
708
  cc(2) += (i&4) ? r/2 : -r/2;
450
- childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
709
+ childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa(), fmm_params);
451
710
  }
711
+ have_childs = true;
452
712
  }
453
713
 
454
714
 
715
+ void SendSourcesToChilds()
716
+ {
717
+ CreateChilds();
718
+
719
+ for (auto [x,c] : charges)
720
+ AddCharge (x,c);
721
+ for (auto [x,d,c] : dipoles)
722
+ AddDipole (x,d,c);
723
+ for (auto [x,c,d,c2] : chargedipoles)
724
+ AddChargeDipole (x,c,d,c2);
725
+ for (auto [sp,ep,j,num] : currents)
726
+ AddCurrent (sp,ep,j,num);
727
+
728
+ charges.DeleteAll();
729
+ dipoles.DeleteAll();
730
+ chargedipoles.DeleteAll();
731
+ currents.DeleteAll();
732
+ }
733
+
734
+
455
735
  void AddCharge (Vec<3> x, entry_type c)
456
736
  {
457
- if (childs[0])
737
+ if (have_childs) // quick check without locking
458
738
  {
459
739
  // directly send to childs:
460
- int childnum = 0;
461
- if (x(0) > center(0)) childnum += 1;
462
- if (x(1) > center(1)) childnum += 2;
463
- if (x(2) > center(2)) childnum += 4;
740
+ int childnum = GetChildNum(x);
741
+ childs[childnum] -> AddCharge(x, c);
742
+ return;
743
+ }
744
+
745
+ lock_guard<mutex> guard(node_mutex);
746
+
747
+ if (have_childs) // test again after locking
748
+ {
749
+ int childnum = GetChildNum(x);
464
750
  childs[childnum] -> AddCharge(x, c);
465
751
  return;
466
752
  }
@@ -469,57 +755,78 @@ namespace ngsbem
469
755
 
470
756
  // if (r*mp.Kappa() < 1e-8) return;
471
757
  if (level > 20) return;
472
- if (charges.Size() < maxdirect && r*mp.Kappa() < 1)
758
+ if (charges.Size() < fmm_params.maxdirect && r*mp.Kappa() < 5)
473
759
  return;
474
-
475
- CreateChilds();
476
-
477
- for (auto [x,c] : charges)
478
- AddCharge (x,c);
479
- for (auto [x,d,c] : dipoles)
480
- AddDipole (x,d,c);
481
- for (auto [sp,ep,j,num] : currents)
482
- AddCurrent (sp,ep,j,num);
483
760
 
484
- charges.SetSize0();
485
- dipoles.SetSize0();
486
- currents.SetSize0();
761
+ SendSourcesToChilds();
487
762
  }
488
763
 
489
764
 
490
765
  void AddDipole (Vec<3> x, Vec<3> d, entry_type c)
491
766
  {
492
- if (childs[0])
767
+ if (have_childs)
493
768
  {
494
769
  // directly send to childs:
495
-
496
- int childnum = 0;
497
- if (x(0) > center(0)) childnum += 1;
498
- if (x(1) > center(1)) childnum += 2;
499
- if (x(2) > center(2)) childnum += 4;
770
+ int childnum = GetChildNum(x);
500
771
  childs[childnum] -> AddDipole(x, d, c);
501
772
  return;
502
773
  }
503
774
 
504
- dipoles.Append (tuple{x,d,c});
775
+ lock_guard<mutex> guard(node_mutex);
505
776
 
506
- if (dipoles.Size() < maxdirect || r < 1e-8)
777
+ if (have_childs)
778
+ {
779
+ // directly send to childs:
780
+ int childnum = GetChildNum(x);
781
+ childs[childnum] -> AddDipole(x, d, c);
782
+ return;
783
+ }
784
+
785
+ dipoles.Append (tuple{x,d,c});
786
+
787
+ if (level > 20) return;
788
+ if (dipoles.Size() < fmm_params.maxdirect)
507
789
  return;
790
+
791
+ SendSourcesToChilds();
792
+ }
793
+
794
+
795
+ void AddChargeDipole (Vec<3> x, entry_type c, Vec<3> dir, entry_type c2)
796
+ {
797
+ if (have_childs)
798
+ {
799
+ // directly send to childs:
800
+ int childnum = GetChildNum(x);
801
+ childs[childnum] -> AddChargeDipole(x, c, dir, c2);
802
+ return;
803
+ }
804
+
805
+ lock_guard<mutex> guard(node_mutex);
806
+
807
+ if (have_childs)
808
+ {
809
+ // directly send to childs:
810
+ int childnum = GetChildNum(x);
811
+ childs[childnum] -> AddChargeDipole(x, c, dir, c2);
812
+ return;
813
+ }
508
814
 
509
- CreateChilds();
815
+ chargedipoles.Append (tuple{x,c,dir,c2});
510
816
 
511
- for (auto [x,c] : charges)
512
- AddCharge (x,c);
513
- for (auto [x,d,c] : dipoles)
514
- AddDipole (x,d,c);
515
- for (auto [sp,ep,j,num] : currents)
516
- AddCurrent (sp,ep,j,num);
817
+ if (chargedipoles.Size() < fmm_params.maxdirect || r < 1e-8)
818
+ return;
517
819
 
518
- charges.SetSize0();
519
- dipoles.SetSize0();
520
- currents.SetSize0();
820
+ SendSourcesToChilds();
821
+
822
+ /*
823
+ AddCharge (x, c);
824
+ AddDipole (x, dir, c2);
825
+ */
521
826
  }
522
827
 
828
+
829
+ // not parallel yet
523
830
  void AddCurrent (Vec<3> sp, Vec<3> ep, Complex j, int num)
524
831
  {
525
832
  if (childs[0])
@@ -528,7 +835,7 @@ namespace ngsbem
528
835
  Array<double> split;
529
836
  split.Append(0);
530
837
  for (int i = 0; i < 3; i++)
531
- if (sp(i) < center(i) != ep(i) < center(i))
838
+ if ((sp(i) < center(i)) != (ep(i) < center(i)))
532
839
  split += (center(i)-sp(i)) / (ep(i)-sp(i)); // segment cuts i-th coordinate plane
533
840
  split.Append(1);
534
841
  BubbleSort(split);
@@ -549,9 +856,15 @@ namespace ngsbem
549
856
  }
550
857
  return;
551
858
  }
552
-
859
+
553
860
  currents.Append (tuple{sp,ep,j,num});
554
861
 
862
+ // if (currents.Size() < maxdirect || r < 1e-8)
863
+ if (currents.Size() < 4 || r < 1e-8)
864
+ return;
865
+
866
+ SendSourcesToChilds();
867
+ /*
555
868
  // if (currents.Size() < maxdirect || r < 1e-8)
556
869
  if (currents.Size() < 4 || r < 1e-8)
557
870
  return;
@@ -568,6 +881,7 @@ namespace ngsbem
568
881
  charges.SetSize0();
569
882
  dipoles.SetSize0();
570
883
  currents.SetSize0();
884
+ */
571
885
  }
572
886
 
573
887
 
@@ -583,27 +897,141 @@ namespace ngsbem
583
897
  return sum;
584
898
  }
585
899
 
586
- // static Timer t("fmm direct eval"); RegionTimer reg(t);
587
- if (mp.Kappa() < 1e-8)
588
- {
589
- for (auto [x,c] : charges)
590
- if (double rho = L2Norm(p-x); rho > 0)
591
- sum += (1/(4*M_PI))*Complex(1,rho*mp.Kappa()) / rho * c;
592
- }
593
- else
594
- for (auto [x,c] : charges)
595
- if (double rho = L2Norm(p-x); rho > 0)
596
- sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
900
+ if (simd_charges.Size())
901
+ {
902
+ // static Timer t("mptool singmp, evaluate, simd charges"); RegionTimer r(t);
903
+ // t.AddFlops (charges.Size());
904
+
905
+ simd_entry_type vsum{0.0};
906
+ if (mp.Kappa() < 1e-12)
907
+ {
908
+ for (auto [x,c] : simd_charges)
909
+ {
910
+ auto rho = L2Norm(p-x);
911
+ auto kernel = 1/(4*M_PI)/rho;
912
+ kernel = If(rho > 0.0, kernel, SIMD<double,FMM_SW>(0.0));
913
+ vsum += kernel * c;
914
+
915
+ /*
916
+ auto rho2 = L2Norm2(p-x);
917
+ auto kernel = (1/(4*M_PI)) * rsqrt(rho2);
918
+ kernel = If(rho2 > 0.0, kernel, SIMD<double,FMM_SW>(0.0));
919
+ vsum += kernel * c;
920
+ */
921
+ }
922
+ }
923
+ else if (mp.Kappa() < 1e-8)
924
+ for (auto [x,c] : simd_charges)
925
+ {
926
+ auto rho = L2Norm(p-x);
927
+ auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW> (1,rho*mp.Kappa()) / rho;
928
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
929
+ vsum += kernel * c;
930
+ }
931
+ else
932
+ for (auto [x,c] : simd_charges)
933
+ {
934
+ auto rho = L2Norm(p-x);
935
+ auto [si,co] = sincos(rho*mp.Kappa());
936
+ auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) / rho;
937
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
938
+ vsum += kernel * c;
939
+ }
940
+
941
+ sum += HSum(vsum);
942
+ }
943
+ else
944
+ {
945
+ if (mp.Kappa() < 1e-8)
946
+ {
947
+ for (auto [x,c] : charges)
948
+ if (double rho = L2Norm(p-x); rho > 0)
949
+ sum += (1/(4*M_PI))*Complex(1,rho*mp.Kappa()) / rho * c;
950
+ }
951
+ else
952
+ for (auto [x,c] : charges)
953
+ if (double rho = L2Norm(p-x); rho > 0)
954
+ sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
955
+ }
956
+
957
+ if (simd_dipoles.Size())
958
+ {
959
+ // static Timer t("mptool singmp, evaluate, simd dipoles"); RegionTimer r(t);
960
+
961
+ simd_entry_type vsum{0.0};
962
+ for (auto [x,d,c] : simd_dipoles)
963
+ {
964
+ auto rho = L2Norm(p-x);
965
+ auto drhodp = (1.0/rho) * (p-x);
966
+ auto [si,co] = sincos(rho*mp.Kappa());
967
+ auto dGdrho = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) *
968
+ (-1.0/(rho*rho) + SIMD<Complex,FMM_SW>(0, mp.Kappa())/rho);
969
+ auto kernel = dGdrho * InnerProduct(drhodp, d);
970
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
971
+ vsum += kernel * c;
972
+ }
973
+ sum += HSum(vsum);
974
+ }
975
+ else
976
+ {
977
+ for (auto [x,d,c] : dipoles)
978
+ if (double rho = L2Norm(p-x); rho > 0)
979
+ {
980
+ Vec<3> drhodp = 1.0/rho * (p-x);
981
+ Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
982
+ (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
983
+ sum += dGdrho * InnerProduct(drhodp, d) * c;
984
+ }
985
+ }
986
+
987
+
988
+
989
+ if (simd_chargedipoles.Size())
990
+ {
991
+ // static Timer t("mptool singmp, evaluate, simd chargedipoles"); RegionTimer r(t);
992
+ // t.AddFlops (simd_chargedipoles.Size()*FMM_SW);
993
+
994
+ simd_entry_type vsum{0.0};
995
+ for (auto [x,c,d,c2] : simd_chargedipoles)
996
+ {
997
+ auto rho = L2Norm(p-x);
998
+ auto rhokappa = rho*mp.Kappa();
999
+ auto invrho = If(rho>0.0, 1.0/rho, SIMD<double,FMM_SW>(0.0));
1000
+ auto [si,co] = sincos(rhokappa);
1001
+
1002
+ auto kernelc = (1/(4*M_PI))*invrho*SIMD<Complex,FMM_SW>(co,si);
1003
+ vsum += kernelc * c;
1004
+
1005
+ auto kernel =
1006
+ invrho*invrho * InnerProduct(p-x, d) *
1007
+ kernelc * SIMD<Complex,FMM_SW>(-1.0, rhokappa);
1008
+
1009
+ vsum += kernel * c2;
1010
+ }
1011
+ sum += HSum(vsum);
1012
+ }
1013
+ else
1014
+ {
1015
+ // static Timer t("mptool singmp, evaluate, chargedipoles"); RegionTimer r(t);
1016
+ // t.AddFlops (chargedipoles.Size());
1017
+
1018
+ for (auto [x,c,d,c2] : chargedipoles)
1019
+ if (double rho = L2Norm(p-x); rho > 0)
1020
+ {
1021
+ sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
1022
+
1023
+ Vec<3> drhodp = 1.0/rho * (p-x);
1024
+ Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
1025
+ (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
1026
+
1027
+ sum += dGdrho * InnerProduct(drhodp, d) * c2;
1028
+ }
1029
+ }
1030
+
1031
+
1032
+
1033
+
597
1034
 
598
- for (auto [x,d,c] : dipoles)
599
- if (double rho = L2Norm(p-x); rho > 0)
600
- {
601
- Vec<3> drhodp = 1.0/rho * (p-x);
602
- Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
603
- (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
604
- sum += dGdrho * InnerProduct(drhodp, d) * c;
605
- }
606
-
607
1035
  for (auto [sp,ep,j,num] : currents)
608
1036
  {
609
1037
  // should use explizit formula instead ...
@@ -640,7 +1068,16 @@ namespace ngsbem
640
1068
  }
641
1069
 
642
1070
  if (dipoles.Size())
643
- throw Exception("EvaluateDeriv not implemented for dipoles in SingularMLMultiPole");
1071
+ {
1072
+ static int cnt = 0;
1073
+ cnt++;
1074
+ if (cnt < 3)
1075
+ cout << "we know what we do - evaluateDeriv not implemented for dipoles in SingularMLExpansion" << endl;
1076
+ // return sum;
1077
+ // throw Exception("EvaluateDeriv not implemented for dipoles in SingularMLExpansion");
1078
+ }
1079
+ if (chargedipoles.Size())
1080
+ throw Exception("EvaluateDeriv not implemented for dipoles in SingularMLExpansion");
644
1081
 
645
1082
  for (auto [x,c] : charges)
646
1083
  if (double rho = L2Norm(p-x); rho > 0)
@@ -655,7 +1092,7 @@ namespace ngsbem
655
1092
 
656
1093
  void CalcTotalSources()
657
1094
  {
658
- total_sources = charges.Size() + dipoles.Size();
1095
+ total_sources = charges.Size() + dipoles.Size() + chargedipoles.Size();
659
1096
  for (auto & child : childs)
660
1097
  if (child)
661
1098
  {
@@ -664,46 +1101,111 @@ namespace ngsbem
664
1101
  }
665
1102
  }
666
1103
 
667
- void CalcMP()
1104
+ void CalcMP(Array<RecordingSS> * recording, Array<Node*> * nodes_to_process)
668
1105
  {
669
- mp.SH().Coefs() = 0.0;
1106
+ // mp.SH().Coefs() = 0.0;
670
1107
  if (childs[0])
671
1108
  {
672
- if (total_sources < 1000)
1109
+ if (total_sources < 1000 || recording)
673
1110
  for (auto & child : childs)
674
- child->CalcMP();
1111
+ child->CalcMP(recording, nodes_to_process);
675
1112
  else
676
1113
  ParallelFor (8, [&] (int nr)
677
1114
  {
678
- childs[nr] -> CalcMP();
1115
+ childs[nr] -> CalcMP(recording, nodes_to_process);
679
1116
  });
680
1117
 
681
1118
 
682
- for (auto & child : childs)
683
- child->mp.TransformAdd(mp, center-child->center);
1119
+ for (auto & child : childs){
1120
+ if (recording && child->mp.SH().Coefs().Size() > 0)
1121
+ *recording += RecordingSS(&child->mp, &mp, center-child->center);
1122
+ else
1123
+ child->mp.TransformAdd(mp, center-child->center);
1124
+ }
684
1125
  }
685
1126
  else
686
1127
  {
687
- if (charges.Size()+dipoles.Size()+currents.Size() == 0)
1128
+ if (charges.Size()+dipoles.Size()+chargedipoles.Size()+currents.Size() == 0)
688
1129
  {
689
- mp = MultiPole<MPSingular,entry_type> (-1, mp.Kappa(), 1.);
1130
+ mp = SphericalExpansion<Singular,entry_type> (-1, mp.Kappa(), 1.);
690
1131
  return;
691
1132
  }
692
1133
 
693
- for (auto [x,c] : charges)
694
- mp.AddCharge (x-center,c);
695
-
696
- for (auto [x,d,c] : dipoles)
697
- mp.AddDipole (x-center, d, c);
1134
+ // make simd charges, comment this block for testing ...
1135
+ simd_charges.SetSize( (charges.Size()+FMM_SW-1)/FMM_SW);
1136
+ size_t i = 0, ii = 0;
1137
+ for ( ; i+FMM_SW <= charges.Size(); i+=FMM_SW, ii++)
1138
+ {
1139
+ std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
1140
+ for (int j = 0; j < FMM_SW; j++) ca[j] = charges[i+j];
1141
+ simd_charges[ii] = MakeSimd(ca);
1142
+ }
1143
+ if (i < charges.Size())
1144
+ {
1145
+ std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
1146
+ int j = 0;
1147
+ for ( ; i+j < charges.Size(); j++) ca[j] = charges[i+j];
1148
+ for ( ; j < FMM_SW; j++) ca[j] = tuple( get<0>(ca[0]), entry_type{0.0} );
1149
+ simd_charges[ii] = MakeSimd(ca);
1150
+ }
1151
+
1152
+ simd_dipoles.SetSize( (dipoles.Size()+FMM_SW-1)/FMM_SW);
1153
+ i = 0, ii = 0;
1154
+ for ( ; i+FMM_SW <= dipoles.Size(); i+=FMM_SW, ii++)
1155
+ {
1156
+ std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
1157
+ for (int j = 0; j < FMM_SW; j++) di[j] = dipoles[i+j];
1158
+ simd_dipoles[ii] = MakeSimd(di);
1159
+ }
1160
+ if (i < dipoles.Size())
1161
+ {
1162
+ std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
1163
+ int j = 0;
1164
+ for ( ; i+j < dipoles.Size(); j++) di[j] = dipoles[i+j];
1165
+ for ( ; j < FMM_SW; j++) di[j] = tuple( get<0>(di[0]), get<1>(di[0]), entry_type{0.0} );
1166
+ simd_dipoles[ii] = MakeSimd(di);
1167
+ }
698
1168
 
699
- for (auto [sp,ep,j,num] : currents)
700
- mp.AddCurrent (sp-center, ep-center, j, num);
1169
+
1170
+ simd_chargedipoles.SetSize( (chargedipoles.Size()+FMM_SW-1)/FMM_SW);
1171
+ i = 0, ii = 0;
1172
+ for ( ; i+FMM_SW <= chargedipoles.Size(); i+=FMM_SW, ii++)
1173
+ {
1174
+ std::array<tuple<Vec<3>,entry_type,Vec<3>,entry_type>, FMM_SW> di;
1175
+ for (int j = 0; j < FMM_SW; j++) di[j] = chargedipoles[i+j];
1176
+ simd_chargedipoles[ii] = MakeSimd(di);
1177
+ }
1178
+ if (i < chargedipoles.Size())
1179
+ {
1180
+ std::array<tuple<Vec<3>,entry_type,Vec<3>,entry_type>, FMM_SW> di;
1181
+ int j = 0;
1182
+ for ( ; i+j < chargedipoles.Size(); j++) di[j] = chargedipoles[i+j];
1183
+ for ( ; j < FMM_SW; j++) di[j] = tuple( get<0>(di[0]), entry_type{0.0}, get<2>(di[0]), entry_type{0.0} );
1184
+ simd_chargedipoles[ii] = MakeSimd(di);
1185
+ }
1186
+
1187
+
1188
+ if (nodes_to_process)
1189
+ *nodes_to_process += this;
1190
+ else {
1191
+ for (auto [x,c] : charges)
1192
+ mp.AddCharge (x-center,c);
1193
+
1194
+ for (auto [x,d,c] : dipoles)
1195
+ mp.AddDipole (x-center, d, c);
1196
+
1197
+ for (auto [x,c,d,c2] : chargedipoles)
1198
+ mp.AddChargeDipole (x-center, c, d, c2);
1199
+
1200
+ for (auto [sp,ep,j,num] : currents)
1201
+ mp.AddCurrent (sp-center, ep-center, j, num);
1202
+ }
701
1203
  }
702
1204
  }
703
1205
 
704
1206
  entry_type EvaluateMP(Vec<3> p) const
705
1207
  {
706
- if (charges.Size() || dipoles.Size())
1208
+ if (charges.Size() || dipoles.Size() || chargedipoles.Size())
707
1209
  return Evaluate(p);
708
1210
 
709
1211
  if (L2Norm(p-center) > 3*r)
@@ -723,7 +1225,7 @@ namespace ngsbem
723
1225
  // cout << "EvaluateMPDeriv Singular, p = " << p << ", d = " << d << ", r = " << r << ", center = " << center << endl;
724
1226
  // cout << "Norm: " << L2Norm(p-center) << " > " << 3*r << endl;
725
1227
  // cout << "charges.Size() = " << charges.Size() << ", dipoles.Size() = " << dipoles.Size() << endl;
726
- if (charges.Size() || dipoles.Size() || !childs[0])
1228
+ if (charges.Size() || dipoles.Size() || chargedipoles.Size() || !childs[0])
727
1229
  return EvaluateDeriv(p, d);
728
1230
 
729
1231
  if (L2Norm(p-center) > 3*r)
@@ -746,6 +1248,8 @@ namespace ngsbem
746
1248
  ost << "xi = " << x << ", ci = " << c << endl;
747
1249
  for (auto [x,d,c] : dipoles)
748
1250
  ost << "xi = " << x << ", di = " << d << ", ci = " << c << endl;
1251
+ for (auto [x,c,d,c2] : chargedipoles)
1252
+ ost << "xi = " << x << ", c = " << c << ", di = " << d << ", ci = " << c2 << endl;
749
1253
 
750
1254
  for (int i = 0; i < 8; i++)
751
1255
  if (childs[i]) childs[i] -> Print (ost, i);
@@ -768,14 +1272,23 @@ namespace ngsbem
768
1272
  num += ch->NumCoefficients();
769
1273
  return num;
770
1274
  }
1275
+
1276
+ void TraverseTree (const std::function<void(Node&)> & func)
1277
+ {
1278
+ func(*this);
1279
+ for (auto & child : childs)
1280
+ if (child)
1281
+ child->TraverseTree(func);
1282
+ }
771
1283
  };
772
1284
 
773
- Node root;
1285
+ FMM_Parameters fmm_params;
1286
+ Node root;
774
1287
  bool havemp = false;
775
1288
 
776
1289
  public:
777
- SingularMLMultiPole (Vec<3> center, double r, double kappa)
778
- : root(center, r, 0, kappa)
1290
+ SingularMLExpansion (Vec<3> center, double r, double kappa, FMM_Parameters _params = FMM_Parameters())
1291
+ : fmm_params(_params), root(center, r, 0, kappa, fmm_params)
779
1292
  {
780
1293
  nodes_on_level = 0;
781
1294
  nodes_on_level[0] = 1;
@@ -793,6 +1306,11 @@ namespace ngsbem
793
1306
  root.AddDipole(x, d, c);
794
1307
  }
795
1308
 
1309
+ void AddChargeDipole(Vec<3> x, entry_type c, Vec<3> dir, entry_type c2)
1310
+ {
1311
+ root.AddChargeDipole(x, c, dir, c2);
1312
+ }
1313
+
796
1314
  void AddCurrent (Vec<3> sp, Vec<3> ep, Complex j, int num)
797
1315
  {
798
1316
  if constexpr (!std::is_same<entry_type, Vec<3,Complex>>())
@@ -836,6 +1354,10 @@ namespace ngsbem
836
1354
  void CalcMP()
837
1355
  {
838
1356
  static Timer t("mptool compute singular MLMP"); RegionTimer rg(t);
1357
+ static Timer ts2mp("mptool compute singular MLMP - source2mp");
1358
+ static Timer tS2S("mptool compute singular MLMP - S->S");
1359
+ static Timer trec("mptool comput singular recording");
1360
+ static Timer tsort("mptool comput singular sort");
839
1361
 
840
1362
  /*
841
1363
  int maxlevel = 0;
@@ -847,8 +1369,94 @@ namespace ngsbem
847
1369
  */
848
1370
 
849
1371
  root.CalcTotalSources();
850
- root.CalcMP();
1372
+
1373
+ if constexpr (false)
1374
+ // direct evaluation of S->S
1375
+ root.CalcMP(nullptr, nullptr);
1376
+ else
1377
+ {
1378
+
1379
+ Array<RecordingSS> recording;
1380
+ Array<Node*> nodes_to_process;
1381
+
1382
+ {
1383
+ RegionTimer reg(trec);
1384
+ root.CalcMP(&recording, &nodes_to_process);
1385
+ }
851
1386
 
1387
+ {
1388
+ RegionTimer rs2mp(ts2mp);
1389
+ ParallelFor(nodes_to_process.Size(), [&](int i)
1390
+ {
1391
+ auto node = nodes_to_process[i];
1392
+ for (auto [x,c]: node->charges)
1393
+ node->mp.AddCharge(x-node->center, c);
1394
+ for (auto [x,d,c]: node->dipoles)
1395
+ node->mp.AddDipole(x-node->center, d, c);
1396
+ for (auto [x,c,d,c2]: node->chargedipoles)
1397
+ node->mp.AddChargeDipole(x-node->center, c, d, c2);
1398
+ for (auto [sp,ep,j,num]: node->currents)
1399
+ node->mp.AddCurrent(sp-node->center, ep-node->center, j, num);
1400
+ }, TasksPerThread(4));
1401
+ }
1402
+
1403
+ {
1404
+ RegionTimer reg(tsort);
1405
+ QuickSort (recording, [] (auto & a, auto & b)
1406
+ {
1407
+ if (a.len < (1-1e-8) * b.len) return true;
1408
+ if (a.len > (1+1e-8) * b.len) return false;
1409
+ return a.theta < b.theta;
1410
+ });
1411
+ }
1412
+
1413
+ double current_len = -1e100;
1414
+ double current_theta = -1e100;
1415
+ Array<RecordingSS*> current_batch;
1416
+ Array<Array<RecordingSS*>> batch_group;
1417
+ Array<double> group_lengths;
1418
+ Array<double> group_thetas;
1419
+ for (auto & record : recording)
1420
+ {
1421
+ bool len_changed = fabs(record.len - current_len) > 1e-8;
1422
+ bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
1423
+ if ((len_changed || theta_changed) && current_batch.Size() > 0) {
1424
+ batch_group.Append(current_batch);
1425
+ group_lengths.Append(current_len);
1426
+ group_thetas.Append(current_theta);
1427
+ current_batch.SetSize(0);
1428
+ }
1429
+
1430
+ current_len = record.len;
1431
+ current_theta = record.theta;
1432
+ current_batch.Append(&record);
1433
+ }
1434
+
1435
+ if (current_batch.Size() > 0) {
1436
+ batch_group.Append(current_batch);
1437
+ group_lengths.Append(current_len);
1438
+ group_thetas.Append(current_theta);
1439
+ }
1440
+
1441
+ {
1442
+ RegionTimer rS2S(tS2S);
1443
+ // ParallelFor(batch_group.Size(), [&](int i) {
1444
+ for (int i = 0; i < batch_group.Size(); i++){
1445
+ // *testout << "Processing batch " << i << " of size " << batch_group[i].Size() << ", with len = " << group_lengths[i] << ", theta = " << group_thetas[i] << endl;
1446
+ int chunk_size = 24;
1447
+ if (batch_group[i].Size() < chunk_size)
1448
+ ProcessBatchSS(batch_group[i], group_lengths[i], group_thetas[i]);
1449
+ else
1450
+ ParallelForRange(IntRange(batch_group[i].Size()), [&](IntRange range) {
1451
+ auto sub_batch = batch_group[i].Range(range.First(), range.Next());
1452
+ ProcessBatchSS(sub_batch, group_lengths[i], group_thetas[i]);
1453
+ }, TasksPerThread(4));
1454
+ }
1455
+ }
1456
+ }
1457
+
1458
+ // cout << "have singular:" << endl;
1459
+ // PrintStatistics (cout);
852
1460
  havemp = true;
853
1461
  }
854
1462
 
@@ -860,23 +1468,198 @@ namespace ngsbem
860
1468
  return root.Evaluate(p);
861
1469
  }
862
1470
 
1471
+
1472
+ void PrintStatistics (ostream & ost)
1473
+ {
1474
+ int levels = 0;
1475
+ int cnt = 0;
1476
+ root.TraverseTree( [&](Node & node) {
1477
+ levels = max(levels, node.level);
1478
+ cnt++;
1479
+ });
1480
+ ost << "levels: " << levels << endl;
1481
+ ost << "nodes: " << cnt << endl;
1482
+
1483
+ Array<int> num_on_level(levels+1);
1484
+ Array<int> order_on_level(levels+1);
1485
+ Array<size_t> coefs_on_level(levels+1);
1486
+ num_on_level = 0;
1487
+ order_on_level = 0;
1488
+ root.TraverseTree( [&](Node & node) {
1489
+ num_on_level[node.level]++;
1490
+ order_on_level[node.level] = max(order_on_level[node.level],node.mp.Order());
1491
+ coefs_on_level[node.level] += node.mp.SH().Coefs().Size();
1492
+ });
1493
+
1494
+ cout << "num on level" << endl;
1495
+ for (int i = 0; i < num_on_level.Size(); i++)
1496
+ cout << i << ": " << num_on_level[i] << ", order = " << order_on_level[i] << ", coefs " << coefs_on_level[i] << endl;
1497
+
1498
+ size_t totcoefs = 0;
1499
+ for (auto n : coefs_on_level)
1500
+ totcoefs += n;
1501
+ cout << "total mem in coefs: " << sizeof(entry_type)*totcoefs / sqr(1024) << " MB" << endl;
1502
+ }
1503
+
1504
+
1505
+
863
1506
  template <typename entry_type2>
864
- friend class RegularMLMultiPole;
1507
+ friend class RegularMLExpansion;
865
1508
  };
866
1509
 
867
1510
 
868
1511
  template <typename entry_type>
869
- inline ostream & operator<< (ostream & ost, const SingularMLMultiPole<entry_type> & mlmp)
1512
+ inline ostream & operator<< (ostream & ost, const SingularMLExpansion<entry_type> & mlmp)
870
1513
  {
871
1514
  mlmp.Print(ost);
872
1515
  return ost;
873
1516
  }
874
1517
 
875
1518
 
1519
+ // *********************************** Regular multilevel Expansion
1520
+
1521
+
876
1522
  template <typename elem_type=Complex>
877
- class NGS_DLL_HEADER RegularMLMultiPole
1523
+ class NGS_DLL_HEADER RegularMLExpansion
878
1524
  {
879
1525
  static Array<size_t> nodes_on_level;
1526
+
1527
+
1528
+ struct RecordingRS
1529
+ {
1530
+ const SphericalExpansion<Singular,elem_type> * mpS;
1531
+ SphericalExpansion<Regular,elem_type> * mpR;
1532
+ Vec<3> dist;
1533
+ double len, theta, phi;
1534
+ public:
1535
+ RecordingRS() = default;
1536
+ RecordingRS (const SphericalExpansion<Singular,elem_type> * ampS,
1537
+ SphericalExpansion<Regular,elem_type> * ampR,
1538
+ Vec<3> adist)
1539
+ : mpS(ampS), mpR(ampR), dist(adist)
1540
+ {
1541
+ std::tie(len, theta, phi) = SphericalCoordinates(dist);
1542
+ }
1543
+ };
1544
+
1545
+ static void ProcessBatchRS(FlatArray<RecordingRS*> batch, double len, double theta) {
1546
+ // static Timer t("ProcessBatchRS"); RegionTimer reg(t, batch.Size());
1547
+ constexpr int vec_length = VecLength<elem_type>;
1548
+ int batch_size = batch.Size();
1549
+ int N = batch_size * vec_length;
1550
+ // *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(elem_type).name() << ", len = " << len << ", theta = " << theta << endl;
1551
+
1552
+ if (N <= 1 || batch_size <= 1) {
1553
+ for (auto* rec : batch) {
1554
+ rec->mpS->TransformAdd(*rec->mpR, rec->dist);
1555
+ }
1556
+ }
1557
+ else if (N <= 3) {
1558
+ ProcessVectorizedBatchRS<3, vec_length>(batch, len, theta);
1559
+ }
1560
+ else if (N <= 4) {
1561
+ ProcessVectorizedBatchRS<4, vec_length>(batch, len, theta);
1562
+ }
1563
+ else if (N <= 6) {
1564
+ ProcessVectorizedBatchRS<6, vec_length>(batch, len, theta);
1565
+ }
1566
+ else if (N <= 12) {
1567
+ ProcessVectorizedBatchRS<12, vec_length>(batch, len, theta);
1568
+ }
1569
+ else if (N <= 24) {
1570
+ ProcessVectorizedBatchRS<24, vec_length>(batch, len, theta);
1571
+ }
1572
+ else if (N <= 48) {
1573
+ ProcessVectorizedBatchRS<48, vec_length>(batch, len, theta);
1574
+ }
1575
+ else if (N <= 96) {
1576
+ ProcessVectorizedBatchRS<96, vec_length>(batch, len, theta);
1577
+ }
1578
+ else if (N <= 192) {
1579
+ ProcessVectorizedBatchRS<192, vec_length>(batch, len, theta);
1580
+ }
1581
+ else {
1582
+ // Split large batches
1583
+ /*
1584
+ ProcessBatch(batch.Range(0, 192 / vec_length), len, theta);
1585
+ ProcessBatch(batch.Range(192 / vec_length, batch_size), len, theta);
1586
+ */
1587
+
1588
+ /*
1589
+ ParallelFor (2, [&] (int i)
1590
+ {
1591
+ if (i == 0)
1592
+ ProcessBatchRS(batch.Range(0, 192 / vec_length), len, theta);
1593
+ else
1594
+ ProcessBatchRS(batch.Range(192 / vec_length, batch_size), len, theta);
1595
+ }, 2);
1596
+ */
1597
+
1598
+
1599
+ size_t chunksize = 192/vec_length;
1600
+ size_t num = (batch.Size()+chunksize-1) / chunksize;
1601
+ ParallelFor (num, [&](int i)
1602
+ {
1603
+ ProcessBatchRS(batch.Range(i*chunksize, min((i+1)*chunksize, batch.Size())), len, theta);
1604
+ }, num);
1605
+
1606
+ }
1607
+ }
1608
+
1609
+
1610
+ template<int N, int vec_length>
1611
+ static void ProcessVectorizedBatchRS(FlatArray<RecordingRS*> batch, double len, double theta) {
1612
+
1613
+ // static Timer t("ProcessVectorizedBatch, N = "+ToString(N) + ", vec_len = " + ToString(vec_length));
1614
+ // RegionTimer reg(t, batch[0]->mpS->SH().Order());
1615
+ // static Timer ttobatch("mptools - copy to batch 2");
1616
+ // static Timer tfrombatch("mptools - copy from batch 2");
1617
+
1618
+ // *testout << "Processing vectorized batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
1619
+ SphericalExpansion<Singular, Vec<N,Complex>> vec_source(batch[0]->mpS->Order(), batch[0]->mpS->Kappa(), batch[0]->mpS->RTyp());
1620
+ // SphericalExpansion<Singular, elem_type> tmp_source{*batch[0]->mpS};
1621
+ SphericalExpansion<Regular, elem_type> tmp_target{*batch[0]->mpR};
1622
+ SphericalExpansion<Regular, Vec<N,Complex>> vec_target(batch[0]->mpR->Order(), batch[0]->mpR->Kappa(), batch[0]->mpR->RTyp());
1623
+
1624
+ // Copy multipoles into vectorized multipole
1625
+ // ttobatch.Start();
1626
+ for (int i = 0; i < batch.Size(); i++)
1627
+ {
1628
+ auto source_i = VecVector2Matrix (batch[i]->mpS->SH().Coefs());
1629
+ auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
1630
+ batch[i]->mpS->SH().RotateZ(batch[i]->phi,
1631
+ [source_i, source_mati] (size_t ii, Complex factor)
1632
+ {
1633
+ source_mati.Row(ii) = factor * source_i.Row(ii);
1634
+ });
1635
+ }
1636
+
1637
+ // ttobatch.Stop();
1638
+
1639
+ vec_source.SH().RotateY(theta);
1640
+ vec_source.ShiftZ(-len, vec_target);
1641
+ vec_target.SH().RotateY(-theta);
1642
+
1643
+ // Copy vectorized multipole into individual multipoles
1644
+ // tfrombatch.Start();
1645
+ for (int i = 0; i < batch.Size(); i++) {
1646
+ // auto source_i = VecVector2Matrix (tmp_target.SH().Coefs());
1647
+ auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
1648
+ auto targeti = VecVector2Matrix(batch[i]->mpR->SH().Coefs());
1649
+
1650
+ tmp_target.SH().RotateZ(-batch[i]->phi,
1651
+ [source_mati, targeti] (size_t ii, Complex factor)
1652
+ {
1653
+ // source_i.Row(ii) = factor * source_mati.Row(ii);
1654
+ AtomicAdd (VectorView(targeti.Row(ii)), factor * source_mati.Row(ii));
1655
+ });
1656
+ // for (int j = 0; j < tmp_target.SH().Coefs().Size(); j++)
1657
+ // AtomicAdd(batch[i]->mpR->SH().Coefs()[j], tmp_target.SH().Coefs()[j]);
1658
+ }
1659
+ // tfrombatch.Stop();
1660
+
1661
+ }
1662
+
880
1663
 
881
1664
  struct Node
882
1665
  {
@@ -884,22 +1667,35 @@ namespace ngsbem
884
1667
  double r;
885
1668
  int level;
886
1669
  std::array<unique_ptr<Node>,8> childs;
887
- MultiPole<MPRegular,elem_type> mp;
1670
+ SphericalExpansion<Regular,elem_type> mp;
888
1671
  Array<Vec<3>> targets;
1672
+ Array<tuple<Vec<3>,double>> vol_targets;
889
1673
  int total_targets;
1674
+ std::mutex node_mutex;
1675
+ atomic<bool> have_childs{false};
890
1676
 
891
- Array<const typename SingularMLMultiPole<elem_type>::Node*> singnodes;
1677
+ Array<const typename SingularMLExpansion<elem_type>::Node*> singnodes;
1678
+ const FMM_Parameters & params;
892
1679
 
893
- Node (Vec<3> acenter, double ar, int alevel, double kappa)
894
- : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*kappa), kappa, ar) // 1.0/min(1.0, 0.25*r*kappa))
1680
+
1681
+ Node (Vec<3> acenter, double ar, int alevel, double kappa, const FMM_Parameters & _params)
1682
+ : center(acenter), r(ar), level(alevel),
1683
+ // mp(MPOrder(ar*kappa), kappa, ar) // 1.0/min(1.0, 0.25*r*kappa))
1684
+ mp(-1, kappa, ar), params(_params)
895
1685
  // : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*kappa), kappa, 1.0)
896
1686
  {
897
1687
  if (level < nodes_on_level.Size())
898
1688
  nodes_on_level[level]++;
899
1689
  }
900
1690
 
901
-
902
- void CreateChilds()
1691
+ void Allocate()
1692
+ {
1693
+ // mp = SphericalExpansion<Regular,elem_type>(MPOrder(r*mp.Kappa()), mp.Kappa(), r);
1694
+ mp = SphericalExpansion<Regular,elem_type>(params.minorder+2*r*mp.Kappa(), mp.Kappa(), r);
1695
+ }
1696
+
1697
+
1698
+ void CreateChilds(bool allocate = false)
903
1699
  {
904
1700
  if (childs[0]) throw Exception("have already childs");
905
1701
  // create children nodes:
@@ -909,15 +1705,19 @@ namespace ngsbem
909
1705
  cc(0) += (i&1) ? r/2 : -r/2;
910
1706
  cc(1) += (i&2) ? r/2 : -r/2;
911
1707
  cc(2) += (i&4) ? r/2 : -r/2;
912
- childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
1708
+ childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa(), params);
1709
+ if (allocate)
1710
+ childs[i] -> Allocate();
913
1711
  }
1712
+ have_childs = true;
914
1713
  }
915
-
916
- void AddSingularNode (const typename SingularMLMultiPole<elem_type>::Node & singnode, bool allow_refine)
1714
+
1715
+ void AddSingularNode (const typename SingularMLExpansion<elem_type>::Node & singnode, bool allow_refine,
1716
+ Array<RecordingRS> * recording)
917
1717
  {
918
1718
  if (mp.SH().Order() < 0) return;
919
1719
  if (singnode.mp.SH().Order() < 0) return;
920
- if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
1720
+ // if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
921
1721
  if (level > 20)
922
1722
  {
923
1723
  singnodes.Append(&singnode);
@@ -936,12 +1736,15 @@ namespace ngsbem
936
1736
  singnode.childs[0]->mp.Order() < singnode.mp.Order())
937
1737
  {
938
1738
  for (auto & child : singnode.childs)
939
- AddSingularNode (*child, allow_refine);
1739
+ AddSingularNode (*child, allow_refine, recording);
940
1740
  return;
941
1741
  }
942
1742
 
943
1743
  // static Timer t("mptool transform Helmholtz-criterion"); RegionTimer r(t);
944
- singnode.mp.TransformAdd(mp, dist);
1744
+ if (recording)
1745
+ *recording += RecordingRS(&singnode.mp, &mp, dist);
1746
+ else
1747
+ singnode.mp.TransformAdd(mp, dist);
945
1748
  return;
946
1749
  }
947
1750
 
@@ -957,70 +1760,70 @@ namespace ngsbem
957
1760
  if (allow_refine)
958
1761
  {
959
1762
  if (!childs[0])
960
- CreateChilds();
1763
+ CreateChilds(true);
961
1764
 
962
1765
  for (auto & ch : childs)
963
- ch -> AddSingularNode (singnode, allow_refine);
1766
+ ch -> AddSingularNode (singnode, allow_refine, recording);
964
1767
  }
965
1768
  else
966
1769
  {
967
- if (total_targets < 1000)
1770
+ if (total_targets < 1000 || recording)
968
1771
  {
969
1772
  for (auto & ch : childs)
970
1773
  if (ch)
971
- ch -> AddSingularNode (singnode, allow_refine);
1774
+ ch -> AddSingularNode (singnode, allow_refine, recording);
972
1775
  }
973
1776
  else
974
1777
  ParallelFor (8, [&] (int nr)
975
1778
  {
976
1779
  if (childs[nr])
977
- childs[nr] -> AddSingularNode (singnode, allow_refine);
1780
+ childs[nr] -> AddSingularNode (singnode, allow_refine, recording);
978
1781
  });
979
1782
 
980
- if (targets.Size())
1783
+ if (targets.Size()+vol_targets.Size())
981
1784
  singnodes.Append(&singnode);
982
1785
  }
983
1786
  }
984
1787
  else
985
1788
  {
986
1789
  for (auto & childsing : singnode.childs)
987
- AddSingularNode (*childsing, allow_refine);
1790
+ AddSingularNode (*childsing, allow_refine, recording);
988
1791
  }
989
1792
  }
990
1793
 
991
1794
  void LocalizeExpansion(bool allow_refine)
992
1795
  {
993
1796
  if (allow_refine)
994
- if (mp.Order() > 20 && !childs[0])
995
- CreateChilds();
1797
+ if (mp.Order() > 30 && !childs[0])
1798
+ CreateChilds(allow_refine);
996
1799
 
997
1800
  if (childs[0])
998
1801
  {
999
- for (auto & ch : childs)
1802
+ if (total_targets < 1000)
1000
1803
  {
1001
- if (L2Norm(mp.SH().Coefs()) > 0)
1002
- mp.TransformAdd (ch->mp, ch->center-center);
1003
- ch->LocalizeExpansion(allow_refine);
1804
+ for (int nr = 0; nr < 8; nr++)
1805
+ {
1806
+ if (L2Norm(mp.SH().Coefs()) > 0)
1807
+ mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
1808
+ childs[nr]->LocalizeExpansion(allow_refine);
1809
+ }
1004
1810
  }
1005
- mp = MultiPole<MPRegular,elem_type>(-1, mp.Kappa(), 1.);
1811
+ else
1812
+ ParallelFor(8, [&] (int nr)
1813
+ {
1814
+ if (L2Norm(mp.SH().Coefs()) > 0)
1815
+ mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
1816
+ childs[nr]->LocalizeExpansion(allow_refine);
1817
+ });
1818
+ mp = SphericalExpansion<Regular,elem_type>(-1, mp.Kappa(), 1.);
1006
1819
  //mp.SH().Coefs()=0.0;
1007
1820
  }
1008
1821
  }
1009
1822
 
1010
1823
  elem_type Evaluate (Vec<3> p) const
1011
1824
  {
1012
- // *testout << "eval p = " << p << ", level = " << level << ", center = " << center << ", r = " << r << endl;
1013
1825
  elem_type sum{0.0};
1014
- /*
1015
- if (childs[0])
1016
- {
1017
- int childnum = 0;
1018
- if (p(0) > center(0)) childnum += 1;
1019
- if (p(1) > center(1)) childnum += 2;
1020
- if (p(2) > center(2)) childnum += 4;
1021
- sum = childs[childnum]->Evaluate(p);
1022
- }
1023
- */
1826
+
1024
1827
  int childnum = 0;
1025
1828
  if (p(0) > center(0)) childnum += 1;
1026
1829
  if (p(1) > center(1)) childnum += 2;
@@ -1028,13 +1831,16 @@ namespace ngsbem
1028
1831
  if (childs[childnum])
1029
1832
  sum = childs[childnum]->Evaluate(p);
1030
1833
  else
1031
- sum = mp.Eval(p-center);
1032
-
1033
-
1034
- // static Timer t("mptool direct evaluate"); RegionTimer r(t);
1035
- for (auto sn : singnodes)
1036
- sum += sn->EvaluateMP(p);
1834
+ {
1835
+ // static Timer t("mptool regmp, evaluate reg"); RegionTimer r(t);
1836
+ sum = mp.Eval(p-center);
1837
+ }
1037
1838
 
1839
+ {
1840
+ // static Timer t("mptool regmp, evaluate, singnode"); RegionTimer r(t);
1841
+ for (auto sn : singnodes)
1842
+ sum += sn->EvaluateMP(p);
1843
+ }
1038
1844
  return sum;
1039
1845
  }
1040
1846
 
@@ -1060,6 +1866,14 @@ namespace ngsbem
1060
1866
  return sum;
1061
1867
  }
1062
1868
 
1869
+ void TraverseTree (const std::function<void(Node&)> & func)
1870
+ {
1871
+ func(*this);
1872
+ for (auto & child : childs)
1873
+ if (child)
1874
+ child->TraverseTree(func);
1875
+ }
1876
+
1063
1877
  double Norm() const
1064
1878
  {
1065
1879
  double norm = L2Norm(mp.SH().Coefs());
@@ -1077,37 +1891,100 @@ namespace ngsbem
1077
1891
  num += ch->NumCoefficients();
1078
1892
  return num;
1079
1893
  }
1080
-
1894
+
1895
+ int GetChildNum (Vec<3> x) const
1896
+ {
1897
+ int childnum = 0;
1898
+ if (x(0) > center(0)) childnum += 1;
1899
+ if (x(1) > center(1)) childnum += 2;
1900
+ if (x(2) > center(2)) childnum += 4;
1901
+ return childnum;
1902
+ }
1903
+
1081
1904
  void AddTarget (Vec<3> x)
1082
1905
  {
1083
- if (childs[0])
1906
+ // if (childs[0])
1907
+ if (have_childs) // quick check without locking
1084
1908
  {
1085
1909
  // directly send to childs:
1086
- int childnum = 0;
1087
- if (x(0) > center(0)) childnum += 1;
1088
- if (x(1) > center(1)) childnum += 2;
1089
- if (x(2) > center(2)) childnum += 4;
1910
+ int childnum = GetChildNum(x);
1090
1911
  childs[childnum] -> AddTarget( x );
1091
1912
  return;
1092
1913
  }
1093
1914
 
1915
+ lock_guard<mutex> guard(node_mutex);
1916
+
1917
+ if (have_childs) // test again after locking
1918
+ {
1919
+ // directly send to childs:
1920
+ int childnum = GetChildNum(x);
1921
+ childs[childnum] -> AddTarget(x);
1922
+ return;
1923
+ }
1924
+
1094
1925
  targets.Append( x );
1095
1926
 
1096
1927
  // if (r*mp.Kappa() < 1e-8) return;
1097
1928
  if (level > 20) return;
1098
- if (targets.Size() < maxdirect && r*mp.Kappa() < 1)
1929
+ if (targets.Size() < params.maxdirect && r*mp.Kappa() < 5)
1930
+ return;
1931
+
1932
+ CreateChilds();
1933
+
1934
+ for (auto t : targets)
1935
+ AddTarget (t);
1936
+ for (auto [x,r] : vol_targets)
1937
+ AddVolumeTarget (x,r);
1938
+
1939
+ targets.SetSize0();
1940
+ vol_targets.SetSize0();
1941
+ }
1942
+
1943
+
1944
+ void AddVolumeTarget (Vec<3> x, double tr)
1945
+ {
1946
+ if (MaxNorm(x-center) > r+tr) return;
1947
+
1948
+ if (have_childs)
1949
+ {
1950
+ for (auto & child : childs)
1951
+ child->AddVolumeTarget(x, tr);
1952
+ return;
1953
+ }
1954
+
1955
+
1956
+ lock_guard<mutex> guard(node_mutex);
1957
+
1958
+ if (have_childs)
1959
+ {
1960
+ for (auto & child : childs)
1961
+ child->AddVolumeTarget(x, tr);
1962
+ return;
1963
+ }
1964
+
1965
+
1966
+ vol_targets.Append (tuple(x,tr));
1967
+
1968
+ if (level > 20) return;
1969
+ if (vol_targets.Size() < params.maxdirect && (r*mp.Kappa() < 5))
1099
1970
  return;
1100
1971
 
1101
1972
  CreateChilds();
1102
1973
 
1103
1974
  for (auto t : targets)
1104
1975
  AddTarget (t);
1976
+ for (auto [x,r] : vol_targets)
1977
+ AddVolumeTarget (x,r);
1978
+
1105
1979
  targets.SetSize0();
1980
+ vol_targets.SetSize0();
1106
1981
  }
1107
1982
 
1983
+
1984
+
1108
1985
  void CalcTotalTargets()
1109
1986
  {
1110
- total_targets = targets.Size();
1987
+ total_targets = targets.Size() + vol_targets.Size();
1111
1988
  for (auto & child : childs)
1112
1989
  if (child)
1113
1990
  {
@@ -1127,8 +2004,21 @@ namespace ngsbem
1127
2004
  }
1128
2005
 
1129
2006
  if (total_targets == 0)
1130
- mp = MultiPole<MPRegular,elem_type>(-1, mp.Kappa(),1.);
2007
+ mp = SphericalExpansion<Regular,elem_type>(-1, mp.Kappa(),1.);
2008
+ }
2009
+
2010
+ void AllocateMemory()
2011
+ {
2012
+ for (auto & child : childs)
2013
+ if (child)
2014
+ child->AllocateMemory();
2015
+
2016
+ if (total_targets > 0)
2017
+ Allocate();
2018
+ // mp = SphericalExpansion<Regular,elem_type>(MPOrder(r*mp.Kappa()), mp.Kappa(), r); // -1, mp.Kappa(),1.);
1131
2019
  }
2020
+
2021
+
1132
2022
 
1133
2023
 
1134
2024
  void Print (ostream & ost, size_t childnr = -1) const
@@ -1145,21 +2035,24 @@ namespace ngsbem
1145
2035
  }
1146
2036
 
1147
2037
  };
1148
-
2038
+
2039
  // FMM parameters; declared before root because both constructors pass
  // fmm_params on to root's constructor in their initializer lists.
  FMM_Parameters fmm_params;
  // root node of the tree of regular expansions
  Node root;
  // singular multilevel expansion; set by the first constructor and by CalcMP,
  // left default-initialized by the (center, r, kappa, params) constructor
  shared_ptr<SingularMLExpansion<elem_type>> singmp;
1151
2042
 
1152
2043
  public:
1153
- RegularMLMultiPole (shared_ptr<SingularMLMultiPole<elem_type>> asingmp, Vec<3> center, double r)
1154
- : root(center, r, 0, asingmp->Kappa()), singmp(asingmp)
1155
- {
2044
+ RegularMLExpansion (shared_ptr<SingularMLExpansion<elem_type>> asingmp, Vec<3> center, double r,
2045
+ const FMM_Parameters & _params)
2046
+ : fmm_params(_params), root(center, r, 0, asingmp->Kappa(), fmm_params), singmp(asingmp)
2047
+ {
1156
2048
  if (!singmp->havemp) throw Exception("first call Calc for singular MP");
1157
-
2049
+ root.Allocate();
2050
+
1158
2051
  nodes_on_level = 0;
1159
2052
  nodes_on_level[0] = 1;
1160
2053
  {
1161
- static Timer t("mptool compute regular MLMP"); RegionTimer rg(t);
1162
- root.AddSingularNode(singmp->root, true);
2054
+ static Timer t("mptool compute regular MLMP"); RegionTimer rg(t);
2055
+ root.AddSingularNode(singmp->root, true, nullptr);
1163
2056
  // cout << "norm after S->R conversion: " << root.Norm() << endl;
1164
2057
  }
1165
2058
 
@@ -1180,42 +2073,163 @@ namespace ngsbem
1180
2073
  }
1181
2074
  }
1182
2075
 
1183
- RegularMLMultiPole (Vec<3> center, double r, double kappa)
1184
- : root(center, r, 0, kappa)
1185
- {
1186
- nodes_on_level = 0;
1187
- nodes_on_level[0] = 1;
1188
- }
1189
-
2076
  /// Create an expansion tree consisting only of a root node with the given
  /// center, radius r and wave number kappa. No singular expansion is
  /// attached here (singmp stays default-initialized); it is supplied later
  /// through CalcMP.
  RegularMLExpansion (Vec<3> center, double r, double kappa, const FMM_Parameters & _params)
    : fmm_params(_params), root(center, r, 0, kappa, fmm_params)
  {
    // reset the per-level node counters; only the root (level 0) exists so far
    nodes_on_level = 0;
    nodes_on_level[0] = 1;
  }
2082
+
1190
2083
  /// Register the point t as evaluation target; forwards to the root node.
  void AddTarget (Vec<3> t)
  {
    root.AddTarget (t);
  }
1194
2087
 
1195
- void CalcMP(shared_ptr<SingularMLMultiPole<elem_type>> asingmp)
2088
  /// Register a volume target (point t with radius r); forwards to the root node.
  void AddVolumeTarget (Vec<3> t, double r)
  {
    root.AddVolumeTarget (t, r);
  }
2092
+
2093
  /// Compute the regular expansions of the tree from the singular
  /// expansion asingmp.
  ///
  /// Steps, in order:
  ///  1. store asingmp in singmp,
  ///  2. count targets per subtree (CalcTotalTargets) and allocate
  ///     expansions (AllocateMemory),
  ///  3. let AddSingularNode record the required transformations in
  ///     `recording` (the direct, non-recording variant is compiled out via
  ///     `if constexpr (false)`),
  ///  4. sort the records by (len, theta), group consecutive records with
  ///     equal len and theta into batches, and run ProcessBatchRS on the
  ///     batches in parallel,
  ///  5. call LocalizeExpansion on the root.
  ///
  /// onlytargets is passed negated as second argument to AddSingularNode
  /// and to LocalizeExpansion (presumably an allow-refine flag - confirm).
  void CalcMP(shared_ptr<SingularMLExpansion<elem_type>> asingmp, bool onlytargets = true)
  {
    static Timer t("mptool regular MLMP"); RegionTimer rg(t);
    // tremove is currently only referenced from the commented-out block below
    static Timer tremove("removeempty");
    static Timer trec("mptool regular MLMP - recording");
    static Timer tsort("mptool regular MLMP - sort");

    singmp = asingmp;


    root.CalcTotalTargets();
    // cout << "before remove empty trees:" << endl;
    // PrintStatistics(cout);

    /*
    tremove.Start();
    if (onlytargets)
      root.RemoveEmptyTrees();
    tremove.Stop();
    */

    root.AllocateMemory();

    // cout << "after allocating regular:" << endl;
    // PrintStatistics(cout);

    // cout << "starting S-R conversion" << endl;
    // PrintStatistics(cout);


    if constexpr (false)
      {
        root.AddSingularNode(singmp->root, !onlytargets, nullptr);
      }
    else
      { // use recording
        Array<RecordingRS> recording;
        {
          RegionTimer rrec(trec);
          root.AddSingularNode(singmp->root, !onlytargets, &recording);
        }

        // cout << "recorded: " << recording.Size() << endl;
        {
          RegionTimer reg(tsort);
          // order by len first (with a relative tolerance of 1e-8), then by theta
          // NOTE(review): the grouping loop below compares with an absolute
          // tolerance of 1e-8 instead - confirm that the two criteria are
          // meant to differ.
          QuickSort (recording, [] (auto & a, auto & b)
          {
            if (a.len < (1-1e-8) * b.len) return true;
            if (a.len > (1+1e-8) * b.len) return false;
            return a.theta < b.theta;
          });
        }

        // collect runs of consecutive records sharing len and theta
        double current_len = -1e100;
        double current_theta = -1e100;
        Array<RecordingRS*> current_batch;
        Array<Array<RecordingRS*>> batch_group;
        Array<double> group_lengths;
        Array<double> group_thetas;
        for (auto & record : recording)
          {
            bool len_changed = fabs(record.len - current_len) > 1e-8;
            bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
            if ((len_changed || theta_changed) && current_batch.Size() > 0) {
              // ProcessBatch(current_batch, current_len, current_theta);
              batch_group.Append(current_batch);
              group_lengths.Append(current_len);
              group_thetas.Append(current_theta);
              current_batch.SetSize(0);
            }

            current_len = record.len;
            current_theta = record.theta;
            current_batch.Append(&record);
          }
        // flush the last, still open batch
        if (current_batch.Size() > 0) {
          // ProcessBatch(current_batch, current_len, current_theta);
          batch_group.Append(current_batch);
          group_lengths.Append(current_len);
          group_thetas.Append(current_theta);
        }

        // one task per batch
        ParallelFor(batch_group.Size(), [&](int i) {
          ProcessBatchRS(batch_group[i], group_lengths[i], group_thetas[i]);
        }, TasksPerThread(4));
      }


    /*
    int maxlevel = 0;
    for (auto [i,num] : Enumerate(RegularMLExpansion::nodes_on_level))
      if (num > 0) maxlevel = i;

    for (int i = 0; i <= maxlevel; i++)
      cout << "reg " << i << ": " << RegularMLExpansion::nodes_on_level[i] << endl;
    */

    // cout << "starting R-R conversion" << endl;
    // PrintStatistics(cout);

    static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
    root.LocalizeExpansion(!onlytargets);


    // cout << "R-R conversion done" << endl;
    // PrintStatistics(cout);
  }
1218
2200
 
2201
+ void PrintStatistics (ostream & ost)
2202
+ {
2203
+ int levels = 0;
2204
+ int cnt = 0;
2205
+ root.TraverseTree( [&](Node & node) {
2206
+ levels = max(levels, node.level);
2207
+ cnt++;
2208
+ });
2209
+ ost << "levels: " << levels << endl;
2210
+ ost << "nodes: " << cnt << endl;
2211
+
2212
+ Array<int> num_on_level(levels+1);
2213
+ Array<int> order_on_level(levels+1);
2214
+ Array<size_t> coefs_on_level(levels+1);
2215
+ num_on_level = 0;
2216
+ order_on_level = 0;
2217
+ root.TraverseTree( [&](Node & node) {
2218
+ num_on_level[node.level]++;
2219
+ order_on_level[node.level] = max(order_on_level[node.level],node.mp.Order());
2220
+ coefs_on_level[node.level] += node.mp.SH().Coefs().Size();
2221
+ });
2222
+
2223
+ cout << "num on level" << endl;
2224
+ for (int i = 0; i < num_on_level.Size(); i++)
2225
+ cout << i << ": " << num_on_level[i] << ", order = " << order_on_level[i] << ", coefs " << coefs_on_level[i] << endl;
2226
+
2227
+ size_t totcoefs = 0;
2228
+ for (auto n : coefs_on_level)
2229
+ totcoefs += n;
2230
+ cout << "total mem in coefs: " << sizeof(elem_type)*totcoefs / sqr(1024) << " MB" << endl;
2231
+ }
2232
+
1219
2233
  void Print (ostream & ost) const
1220
2234
  {
1221
2235
  root.Print(ost);
@@ -1234,7 +2248,10 @@ namespace ngsbem
1234
2248
  elem_type Evaluate (Vec<3> p) const
1235
2249
  {
1236
2250
  // static Timer t("mptool Eval MLMP regular"); RegionTimer r(t);
1237
- if (L2Norm(p-root.center) > root.r) return elem_type{0.0};
2251
+ // if (L2Norm(p-root.center) > root.r) return elem_type{0.0};
2252
+
2253
+ if (MaxNorm(p-root.center) > root.r)
2254
+ return singmp->Evaluate(p);
1238
2255
  return root.Evaluate(p);
1239
2256
  }
1240
2257
 
@@ -1246,11 +2263,12 @@ namespace ngsbem
1246
2263
 
1247
2264
  };
1248
2265
 
2266
+
1249
2267
  /// Stream output of a RegularMLExpansion: delegates to its Print method
  /// and returns the stream to allow chaining.
  template <typename elem_type>
  inline ostream & operator<< (ostream & ost, const RegularMLExpansion<elem_type> & mlmp)
  {
    mlmp.Print(ost);
    // ost << "RegularMLExpansion" << endl;
    return ost;
  }
1256
2274