ngsolve 6.2.2502__cp311-cp311-win_amd64.whl → 6.2.2601__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. netgen/include/analytic_integrals.hpp +10 -0
  2. netgen/include/basematrix.hpp +6 -0
  3. netgen/include/bdbequations.hpp +55 -0
  4. netgen/include/bem_diffops.hpp +475 -0
  5. netgen/include/bilinearform.hpp +4 -1
  6. netgen/include/blockjacobi.hpp +17 -5
  7. netgen/include/bspline.hpp +2 -0
  8. netgen/include/cholesky.hpp +2 -2
  9. netgen/include/code_generation.hpp +2 -2
  10. netgen/include/complex_wrapper.hpp +30 -2
  11. netgen/include/contact.hpp +8 -0
  12. netgen/include/diagonalmatrix.hpp +6 -0
  13. netgen/include/diffop_impl.hpp +3 -1
  14. netgen/include/diffopwithfactor.hpp +123 -0
  15. netgen/include/elementbyelement.hpp +9 -3
  16. netgen/include/expr.hpp +45 -7
  17. netgen/include/fespace.hpp +12 -4
  18. netgen/include/gridfunction.hpp +3 -3
  19. netgen/include/h1amg.hpp +85 -2
  20. netgen/include/h1lumping.hpp +6 -0
  21. netgen/include/hcurl_equations.hpp +29 -0
  22. netgen/include/hcurlcurlfe.hpp +20 -0
  23. netgen/include/hdiv_equations.hpp +15 -0
  24. netgen/include/hdivfe_utils.hpp +1 -0
  25. netgen/include/hdivhofespace.hpp +2 -0
  26. netgen/include/integrator.hpp +4 -16
  27. netgen/include/intrule.hpp +2 -1
  28. netgen/include/intrules_SauterSchwab.hpp +25 -0
  29. netgen/include/jacobi.hpp +35 -18
  30. netgen/include/kernels.hpp +724 -0
  31. netgen/include/l2hofe.hpp +1 -0
  32. netgen/include/matrix.hpp +8 -3
  33. netgen/include/meshaccess.hpp +4 -3
  34. netgen/include/mp_coefficient.hpp +145 -0
  35. netgen/include/mptools.hpp +1331 -368
  36. netgen/include/mycomplex.hpp +1 -1
  37. netgen/include/ngblas.hpp +116 -7
  38. netgen/include/potentialtools.hpp +22 -0
  39. netgen/include/preconditioner.hpp +23 -23
  40. netgen/include/prolongation.hpp +132 -6
  41. netgen/include/recursive_pol.hpp +63 -11
  42. netgen/include/simd_complex.hpp +45 -0
  43. netgen/include/sparsecholesky.hpp +6 -2
  44. netgen/include/sparsefactorization_interface.hpp +159 -0
  45. netgen/include/sparsematrix.hpp +21 -7
  46. netgen/include/sparsematrix_dyn.hpp +6 -7
  47. netgen/include/sparsematrix_impl.hpp +175 -40
  48. netgen/include/special_matrix.hpp +2 -0
  49. netgen/include/statushandler.hpp +8 -8
  50. netgen/include/symbolicintegrator.hpp +2 -1
  51. netgen/include/tangentialfacetfespace.hpp +7 -22
  52. netgen/include/thdivfe_impl.hpp +66 -0
  53. netgen/include/tscalarfe.hpp +1 -1
  54. netgen/include/vector.hpp +272 -47
  55. netgen/lib/libngsolve.lib +0 -0
  56. netgen/libngsolve.dll +0 -0
  57. netgen/ngscxx.bat +1 -1
  58. netgen/ngsld.bat +1 -1
  59. ngsolve/__init__.py +1 -0
  60. ngsolve/cmake/NGSolveConfig.cmake +8 -8
  61. ngsolve/cmake/ngsolve-targets.cmake +24 -18
  62. ngsolve/config/config.py +7 -7
  63. ngsolve/demos/intro/cmagnet.py +19 -22
  64. ngsolve/directsolvers.py +9 -21
  65. ngsolve/krylovspace.py +172 -3
  66. ngsolve/ngslib.lib +0 -0
  67. ngsolve/ngslib.pyd +0 -0
  68. ngsolve/nonlinearsolvers.py +2 -2
  69. ngsolve/preconditioners.py +1 -0
  70. ngsolve/solve_implementation.py +168 -0
  71. ngsolve/{solvers.py → solvers/__init__.py} +1 -1
  72. ngsolve/solvers/cudss.py +112 -0
  73. ngsolve/webgui.py +2 -0
  74. {ngsolve-6.2.2502.dist-info → ngsolve-6.2.2601.dist-info}/METADATA +2 -2
  75. {ngsolve-6.2.2502.dist-info → ngsolve-6.2.2601.dist-info}/RECORD +107 -97
  76. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/Scripts/ngsolve.tcl +0 -0
  77. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/beam.geo +0 -0
  78. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/beam.vol +0 -0
  79. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/chip.in2d +0 -0
  80. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/chip.vol +0 -0
  81. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coil.geo +0 -0
  82. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coil.vol +0 -0
  83. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coilshield.geo +0 -0
  84. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/coilshield.vol +0 -0
  85. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/cube.geo +0 -0
  86. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/cube.vol +0 -0
  87. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d10_DGdoubleglazing.pde +0 -0
  88. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d11_chip_nitsche.pde +0 -0
  89. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d1_square.pde +0 -0
  90. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d2_chip.pde +0 -0
  91. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d3_helmholtz.pde +0 -0
  92. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d4_cube.pde +0 -0
  93. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d5_beam.pde +0 -0
  94. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d6_shaft.pde +0 -0
  95. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d7_coil.pde +0 -0
  96. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d8_coilshield.pde +0 -0
  97. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/d9_hybridDG.pde +0 -0
  98. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/doubleglazing.in2d +0 -0
  99. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/doubleglazing.vol +0 -0
  100. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/piezo2d40round4.vol.gz +0 -0
  101. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/shaft.geo +0 -0
  102. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/shaft.vol +0 -0
  103. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/square.in2d +0 -0
  104. {ngsolve-6.2.2502.data → ngsolve-6.2.2601.data}/data/share/ngsolve/square.vol +0 -0
  105. {ngsolve-6.2.2502.dist-info → ngsolve-6.2.2601.dist-info}/LICENSE +0 -0
  106. {ngsolve-6.2.2502.dist-info → ngsolve-6.2.2601.dist-info}/WHEEL +0 -0
  107. {ngsolve-6.2.2502.dist-info → ngsolve-6.2.2601.dist-info}/top_level.txt +0 -0
@@ -10,8 +10,77 @@
10
10
  #include <coefficient.hpp>
11
11
  #include <recursive_pol.hpp>
12
12
 
13
- namespace ngfem
13
+
14
+ namespace ngsbem
14
15
  {
16
+ using namespace ngfem;
17
+
18
+ template<typename T>
19
+ constexpr int VecLength = 1; // Default: Complex has length 1
20
+
21
+ template<int N>
22
+ constexpr int VecLength<Vec<N, Complex>> = N; // Specialization: Vec<N,Complex> has length N
23
+
24
+
25
+
26
+ constexpr int FMM_SW = 4;
27
+
28
+
29
+
30
+ // ************************ SIMD - creation (should end up in simd.hpp) *************
31
+
32
+
33
+ template <int S, typename T, int SW>
34
+ Vec<S,T> HSum (Vec<S,SIMD<T,SW>> v)
35
+ {
36
+ Vec<S,T> res;
37
+ for (int i = 0; i < S; i++)
38
+ res(i) = HSum(v(i));
39
+ // Iterate<S> ([&](auto i) {
40
+ // res.HTData().template Elem<i.value>() = HSum(v.HTData().template Elem<i.value>());
41
+ // });
42
+ return res;
43
+ }
44
+
45
+
46
+ class NGS_DLL_HEADER PrecomputedSqrts
47
+ {
48
+ public:
49
+ Array<double> sqrt_int;
50
+ // Array<double> inv_sqrt_int;
51
+ Array<double> sqrt_n_np1; // sqrt(n*(n+1))
52
+ Array<double> inv_sqrt_2np1_2np3; // 1/sqrt( (2n+1)*(2n+3) )
53
+
54
+ PrecomputedSqrts();
55
+ };
56
+
57
+ extern NGS_DLL_HEADER PrecomputedSqrts presqrt;
58
+
59
+
60
+
61
+ class FMM_Parameters
62
+ {
63
+ public:
64
+ int maxdirect = 100;
65
+ int minorder = 20; // order = minorder + 2 kappa r
66
+ };
67
+
68
+
69
+
70
+
71
+ inline std::tuple<double, double, double> SphericalCoordinates(Vec<3> dist){
72
+ double len, theta, phi;
73
+ len = L2Norm(dist);
74
+ if (len < 1e-30)
75
+ theta = 0;
76
+ else
77
+ theta = acos (dist(2) / len);
78
+ if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
79
+ phi = 0;
80
+ else
81
+ phi = atan2(dist(1), dist(0));
82
+ return {len, theta, phi};
83
+ }
15
84
 
16
85
 
17
86
  template <typename entry_type = Complex>
@@ -77,23 +146,91 @@ namespace ngfem
77
146
 
78
147
  void Calc (Vec<3> x, FlatVector<Complex> shapes);
79
148
 
80
-
149
+
150
+ void FlipZ ();
81
151
  void RotateZ (double alpha);
82
- void RotateY (double alpha);
83
152
 
153
+ template <typename FUNC>
154
+ void RotateZ (double alpha, FUNC func) const
155
+ {
156
+ if (order < 0) return;
157
+
158
+ Vector<Complex> exp_imalpha(order+1);
159
+ Complex exp_ialpha(cos(alpha), sin(alpha));
160
+ Complex prod = 1.0;
161
+ for (int i = 0; i <= order; i++)
162
+ {
163
+ exp_imalpha(i) = prod;
164
+ prod *= exp_ialpha;
165
+ }
166
+
167
+ int ii = 0;
168
+ for (int n = 0; n <= order; n++)
169
+ {
170
+ for (int m = -n; m < 0; m++, ii++)
171
+ func(ii, conj(exp_imalpha(-m)));
172
+ for (int m = 0; m <= n; m++, ii++)
173
+ func(ii, exp_imalpha(m));
174
+ };
175
+ };
176
+
177
+ template <typename FUNC>
178
+ void RotateZFlip (double alpha, bool flip, FUNC func) const
179
+ {
180
+ if (order < 0) return;
181
+
182
+ Vector<Complex> exp_imalpha(order+1);
183
+ Complex exp_ialpha(cos(alpha), sin(alpha));
184
+ Complex prod = 1.0;
185
+ for (int i = 0; i <= order; i++)
186
+ {
187
+ exp_imalpha(i) = prod;
188
+ prod *= exp_ialpha;
189
+ }
190
+
191
+ int ii = 0;
192
+
193
+ auto FlipFactor = [] (int n, int m, bool flip)->double
194
+ {
195
+ if (flip)
196
+ return ((n-m)%2) == 1 ? -1 : 1;
197
+ return 1.0;
198
+ };
199
+
200
+ for (int n = 0; n <= order; n++)
201
+ {
202
+ for (int m = -n; m < 0; m++, ii++)
203
+ func(ii, FlipFactor(n,m,flip)*conj(exp_imalpha(-m)));
204
+ for (int m = 0; m <= n; m++, ii++)
205
+ func(ii, FlipFactor(n,m,flip)*exp_imalpha(m));
206
+ };
207
+ };
208
+
209
+
210
+
211
+ void RotateY (double alpha, bool parallel = false);
212
+
213
+
84
214
 
85
215
  static double CalcAmn (int m, int n)
86
216
  {
87
217
  if (m < 0) m=-m;
88
218
  if (n < m) return 0;
89
- return sqrt( (n+1.0+m)*(n+1.0-m) / ( (2*n+1)*(2*n+3) ));
219
+
220
+ if (2*n+1 < presqrt.sqrt_int.Size())
221
+ return presqrt.sqrt_int[n+1+m]*presqrt.sqrt_int[n+1-m] * presqrt.inv_sqrt_2np1_2np3[n];
222
+ else
223
+ return sqrt( (n+1.0+m)*(n+1.0-m) / ( (2*n+1)*(2*n+3) ));
90
224
  }
91
225
 
92
226
  static double CalcBmn (int m, int n)
93
227
  {
94
228
  double sgn = (m >= 0) ? 1 : -1;
95
- if ( (m > n) || (-m > n) ) return 0;
96
- return sgn * sqrt( (n-m-1.0)*(n-m) / ( (2*n-1.0)*(2*n+1)));
229
+ if ( (m >= n) || (-m > n) ) return 0;
230
+ if (n <= presqrt.inv_sqrt_2np1_2np3.Size())
231
+ return sgn * presqrt.sqrt_n_np1[n-m-1] * presqrt.inv_sqrt_2np1_2np3[n-1];
232
+ else
233
+ return sgn * sqrt( (n-m-1.0)*(n-m) / ( (2*n-1.0)*(2*n+1)));
97
234
  }
98
235
 
99
236
  static double CalcDmn (int m, int n)
@@ -112,11 +249,11 @@ namespace ngfem
112
249
  // https://fortran-lang.discourse.group/t/looking-for-spherical-bessel-and-hankel-functions-of-first-and-second-kind-and-arbitrary-order/2308/2
113
250
  NGS_DLL_HEADER
114
251
  void besseljs3d (int nterms, double z, double scale,
115
- FlatVector<double> fjs, FlatVector<double> fjder);
252
+ SliceVector<double> fjs, SliceVector<double> fjder = FlatVector<double>(0, nullptr));
116
253
 
117
254
  NGS_DLL_HEADER
118
255
  void besseljs3d (int nterms, Complex z, double scale,
119
- FlatVector<Complex> fjs, FlatVector<Complex> fjder);
256
+ SliceVector<Complex> fjs, SliceVector<Complex> fjder = FlatVector<Complex>(0, nullptr));
120
257
 
121
258
 
122
259
  /*
@@ -135,14 +272,17 @@ namespace ngfem
135
272
  FlatVector<double> jp,
136
273
  FlatVector<double> yp);
137
274
 
138
-
275
+
139
276
 
140
277
  template <typename T>
141
278
  void SphericalBessel (int n, double rho, double scale, T && values)
142
279
  {
280
+ besseljs3d (n, rho, scale, values);
281
+ /*
143
282
  Vector<double> j(n+1), jp(n+1);
144
283
  besseljs3d (n, rho, scale, j, jp);
145
284
  values = j;
285
+ */
146
286
  }
147
287
 
148
288
 
@@ -166,21 +306,6 @@ namespace ngfem
166
306
  return;
167
307
  }
168
308
  Vector j(n+1), y(n+1), jp(n+1), yp(n+1);
169
- // SBESJY (rho, n, j, y, jp, yp);
170
-
171
- /*
172
- values = j + Complex(0,1) * y;
173
- if (scale != 1.0)
174
- {
175
- double prod = 1.0;
176
- for (int i = 0; i <= n; i++)
177
- {
178
- values(i) *= prod;
179
- prod *= scale;
180
- }
181
- }
182
- */
183
-
184
309
 
185
310
  // the bessel-evaluation with scale
186
311
  besseljs3d (n, rho, 1/scale, j, jp);
@@ -208,7 +333,7 @@ namespace ngfem
208
333
 
209
334
 
210
335
  // hn1 = jn+ i*yn
211
- class MPSingular
336
+ class Singular
212
337
  {
213
338
  public:
214
339
  template <typename T>
@@ -216,48 +341,80 @@ namespace ngfem
216
341
  {
217
342
  SphericalHankel1(order, r, scale, values);
218
343
  }
344
+
345
+ template <typename T>
346
+ static void Eval (int order, double kappa, double r, double rtyp, T && values)
347
+ {
348
+ double scale = Scale(kappa, rtyp);
349
+ SphericalHankel1(order, r*kappa, scale, values);
350
+ }
351
+
352
+ static double Scale (double kappa, double rtyp)
353
+ {
354
+ // return min(1.0, rtyp*kappa);
355
+ return min(1.0, 0.5*rtyp*kappa);
356
+ }
219
357
  };
358
+
359
+
220
360
 
221
361
  // jn
222
- class MPRegular
362
+ class Regular
223
363
  {
224
- public:
364
+ public:
225
365
  template <typename T>
226
366
  static void Eval (int order, double r, double scale, T && values)
227
367
  {
228
368
  SphericalBessel (order, r, 1.0/scale, values);
229
369
  }
370
+
371
+ template <typename T>
372
+ static void Eval (int order, double kappa, double r, double rtyp, T && values)
373
+ {
374
+ double scale = Scale(kappa, rtyp);
375
+ SphericalBessel (order, r*kappa, 1.0/scale, values);
376
+ }
377
+
378
+ static double Scale (double kappa, double rtyp)
379
+ {
380
+ // return 1.0/ min(1.0, 0.25*rtyp*kappa);
381
+ return 1.0/ min(1.0, 0.5*rtyp*kappa);
382
+ }
383
+
230
384
  };
231
385
 
232
386
 
233
387
 
234
388
 
235
389
  template <typename RADIAL, typename entry_type=Complex>
236
- class NGS_DLL_HEADER MultiPole
390
+ class NGS_DLL_HEADER SphericalExpansion
237
391
  {
238
392
  SphericalHarmonics<entry_type> sh;
239
393
  double kappa;
240
- double scale;
394
+ double rtyp;
241
395
  public:
242
- MultiPole (int aorder, double akappa, double ascale = 1)
243
- : sh(aorder), kappa(akappa), scale(ascale) { }
244
396
 
397
+ SphericalExpansion (int aorder, double akappa, double artyp)
398
+ : sh(aorder), kappa(akappa), rtyp(artyp) { }
399
+
400
+
245
401
  entry_type & Coef(int n, int m) { return sh.Coef(n,m); }
246
402
  auto & SH() { return sh; }
247
403
  const auto & SH() const { return sh; }
248
404
  double Kappa() const { return kappa; }
249
- double Scale() const { return scale; }
405
+ double Scale() const { return RADIAL::Scale(kappa, rtyp); }
406
+ double RTyp() const { return rtyp; }
250
407
  int Order() const { return sh.Order(); }
251
408
 
252
- MultiPole Truncate(int neworder) const
409
+ SphericalExpansion Truncate(int neworder) const
253
410
  {
254
411
  if (neworder > sh.Order()) neworder=sh.Order();
255
- MultiPole nmp(neworder, kappa);
412
+ SphericalExpansion nmp(neworder, kappa, rtyp);
256
413
  nmp.sh.Coefs() = sh.Coefs().Range(sqr(neworder+1));
257
414
  return nmp;
258
415
  }
259
416
 
260
- MultiPole & operator+= (const MultiPole & mp2)
417
+ SphericalExpansion & operator+= (const SphericalExpansion & mp2)
261
418
  {
262
419
  size_t commonsize = min(SH().Coefs().Size(), mp2.SH().Coefs().Size());
263
420
  SH().Coefs().Range(commonsize) += mp2.SH().Coefs().Range(commonsize);
@@ -268,18 +425,27 @@ namespace ngfem
268
425
  entry_type EvalDirectionalDerivative (Vec<3> x, Vec<3> d) const;
269
426
 
270
427
  void AddCharge (Vec<3> x, entry_type c);
271
- void AddDipole (Vec<3> x, Vec<3> d, entry_type c);
272
- void AddCurrent (Vec<3> ap, Vec<3> ep, Complex j, int num=100);
428
+ void AddDipole (Vec<3> x, Vec<3> dir, entry_type c);
429
+ void AddChargeDipole (Vec<3> x, entry_type c, Vec<3> dir, entry_type c2)
430
+ {
431
+ // TODO: add them at once
432
+ AddCharge (x, c);
433
+ AddDipole (x, dir, c2);
434
+ }
273
435
 
436
+ void AddPlaneWave (Vec<3> d, entry_type c);
437
+ void AddCurrent (Vec<3> ap, Vec<3> ep, Complex j, int num=100);
274
438
 
275
- void ChangeScaleTo (double newscale)
439
+
440
+ void ChangeRTypTo (double new_rtyp)
276
441
  {
277
- double fac = scale/newscale;
442
+ double fac = RADIAL::Scale(kappa, rtyp) / RADIAL::Scale(kappa, new_rtyp);
278
443
  double prod = 1;
279
444
  for (int n = 0; n <= sh.Order(); n++, prod*= fac)
280
445
  sh.CoefsN(n) *= prod;
281
- scale = newscale;
446
+ rtyp = new_rtyp;
282
447
  }
448
+
283
449
 
284
450
  Vector<double> Spectrum (bool scaled) const
285
451
  {
@@ -288,14 +454,14 @@ namespace ngfem
288
454
  for (int n = 0; n <= Order(); n++)
289
455
  {
290
456
  spec(n) = fac * L2Norm2(sh.CoefsN(n));
291
- if (!scaled) fac *= sqr(scale);
457
+ if (!scaled) fac *= sqr(Scale());
292
458
  }
293
459
  return spec;
294
460
  }
295
461
 
296
462
 
297
463
  template <typename TARGET>
298
- void Transform (MultiPole<TARGET,entry_type> & target, Vec<3> dist) const
464
+ void Transform (SphericalExpansion<TARGET,entry_type> & target, Vec<3> dist) const
299
465
  {
300
466
  if (target.SH().Order() < 0) return;
301
467
  if (SH().Order() < 0)
@@ -304,24 +470,16 @@ namespace ngfem
304
470
  return;
305
471
  }
306
472
 
307
- static Timer t("mptool Transform "+ToString(typeid(RADIAL).name())+ToString(typeid(TARGET).name()));
308
- RegionTimer reg(t);
473
+ // static Timer t("mptool Transform "+ToString(typeid(RADIAL).name())+ToString(typeid(TARGET).name()));
474
+ // RegionTimer reg(t);
309
475
 
310
- double len = L2Norm(dist);
311
- double theta, phi;
312
-
313
- if (len < 1e-30)
314
- theta = 0;
315
- else
316
- theta = acos (dist(2) / len);
317
-
318
- if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
319
- phi = 0;
320
- else
321
- phi = atan2(dist(1), dist(0));
476
+ auto [len, theta, phi] = SphericalCoordinates(dist);
322
477
 
323
478
 
324
- MultiPole<RADIAL,entry_type> tmp(*this);
479
+ // SphericalExpansion<RADIAL,entry_type> tmp{*this};
480
+ SphericalExpansion<RADIAL,entry_type> tmp(Order(), kappa, rtyp);
481
+ tmp.SH().Coefs() = SH().Coefs();
482
+
325
483
  tmp.SH().RotateZ(phi);
326
484
  tmp.SH().RotateY(theta);
327
485
 
@@ -332,60 +490,213 @@ namespace ngfem
332
490
  }
333
491
 
334
492
  template <typename TARGET>
335
- void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist) const
493
+ void TransformAdd (SphericalExpansion<TARGET,entry_type> & target, Vec<3> dist, bool atomic = false) const
336
494
  {
337
495
  if (SH().Order() < 0) return;
338
496
  if (target.SH().Order() < 0) return;
339
497
 
340
- MultiPole<TARGET,entry_type> tmp{target};
498
+ SphericalExpansion<TARGET,entry_type> tmp{target};
341
499
  Transform(tmp, dist);
342
- target.SH().Coefs() += tmp.SH().Coefs();
500
+ if (!atomic)
501
+ target.SH().Coefs() += tmp.SH().Coefs();
502
+ else
503
+ for (int j = 0; j < target.SH().Coefs().Size(); j++)
504
+ AtomicAdd(target.SH().Coefs()[j], tmp.SH().Coefs()[j]);
343
505
  }
344
506
 
345
507
  template <typename TARGET>
346
- void ShiftZ (double z, MultiPole<TARGET,entry_type> & target);
508
+ void ShiftZ (double z, SphericalExpansion<TARGET,entry_type> & target);
509
+
347
510
 
511
+ template <typename TARGET>
512
+ void In2Out (SphericalExpansion<TARGET,entry_type> & target, double r) const
513
+ {
514
+ Vector<Complex> rad(Order()+1);
515
+ Vector<Complex> radout(target.Order()+1);
516
+ RADIAL::Eval(Order(), kappa, r, RTyp(), rad);
517
+ TARGET::Eval(target.Order(), kappa, r, target.RTyp(), radout);
518
+ target.SH().Coefs() = 0;
519
+ for (int j = 0; j <= std::min(Order(), target.Order()); j++)
520
+ target.SH().CoefsN(j) = rad(j)/radout(j) * SH().CoefsN(j);
521
+ }
348
522
  };
349
523
 
350
524
 
351
525
 
352
526
  // ***************** parameters ****************
353
527
 
528
+ /*
354
529
  static constexpr int MPOrder (double rho_kappa)
355
530
  {
356
- return max (20, int(2*rho_kappa));
531
+ // return max (20, int(2*rho_kappa));
532
+ return 20+int(2*rho_kappa);
357
533
  }
358
534
  static constexpr int maxdirect = 100;
535
+ */
536
+
537
+
538
+ template <typename SCAL, auto S>
539
+ inline auto VecVector2Matrix (FlatVector<Vec<S,SCAL>> vec)
540
+ {
541
+ return FlatMatrixFixWidth<S,SCAL> (vec.Size(), vec.Data()->Data());
542
+ }
543
+
544
+ inline auto VecVector2Matrix (FlatVector<Complex> vec)
545
+ {
546
+ return FlatMatrixFixWidth<1,Complex> (vec.Size(), vec.Data());
547
+ }
359
548
 
360
549
 
361
550
  template <typename entry_type=Complex>
362
- class SingularMLMultiPole
551
+ class SingularMLExpansion
363
552
  {
553
+ using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
364
554
  static Array<size_t> nodes_on_level;
365
555
 
556
+ struct RecordingSS
557
+ {
558
+ const SphericalExpansion<Singular,entry_type> * mp_source;
559
+ SphericalExpansion<Singular,entry_type> * mp_target;
560
+ Vec<3> dist;
561
+ double len, theta, phi;
562
+ bool flipz;
563
+ public:
564
+ RecordingSS() = default;
565
+ RecordingSS (const SphericalExpansion<Singular,entry_type> * amp_source,
566
+ SphericalExpansion<Singular,entry_type> * amp_target,
567
+ Vec<3> adist)
568
+ : mp_source(amp_source), mp_target(amp_target), dist(adist)
569
+ {
570
+ std::tie(len, theta, phi) = SphericalCoordinates(adist);
571
+ // flipz = false;
572
+ flipz = theta > M_PI/2;
573
+ if (flipz) theta = M_PI-theta;
574
+ }
575
+ };
576
+
577
+
578
+ static void ProcessBatchSS(FlatArray<RecordingSS*> batch, double len, double theta) {
579
+ constexpr int vec_length = VecLength<entry_type>;
580
+ int batch_size = batch.Size();
581
+ int N = batch_size * vec_length;
582
+ // *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(entry_type).name() << ", len = " << len << ", theta = " << theta << endl;
583
+
584
+ if (N <= 1 || batch_size <= 1) {
585
+ for (auto* rec : batch) {
586
+ rec->mp_source->TransformAdd(*rec->mp_target, rec->dist, true);
587
+ }
588
+ }
589
+ else if (N <= 3) {
590
+ ProcessVectorizedBatchSS<3, vec_length>(batch, len, theta);
591
+ }
592
+ else if (N <= 4) {
593
+ ProcessVectorizedBatchSS<4, vec_length>(batch, len, theta);
594
+ }
595
+ else if (N <= 6) {
596
+ ProcessVectorizedBatchSS<6, vec_length>(batch, len, theta);
597
+ }
598
+ else if (N <= 12) {
599
+ ProcessVectorizedBatchSS<12, vec_length>(batch, len, theta);
600
+ }
601
+ else if (N <= 24) {
602
+ ProcessVectorizedBatchSS<24, vec_length>(batch, len, theta);
603
+ }
604
+ else if (N <= 48) {
605
+ ProcessVectorizedBatchSS<48, vec_length>(batch, len, theta);
606
+ }
607
+ else if (N <= 96) {
608
+ ProcessVectorizedBatchSS<96, vec_length>(batch, len, theta);
609
+ }
610
+ else if (N <= 192) {
611
+ ProcessVectorizedBatchSS<192, vec_length>(batch, len, theta);
612
+ }
613
+ else {
614
+ // Split large batches
615
+ ProcessBatchSS(batch.Range(0, 192 / vec_length), len, theta);
616
+ ProcessBatchSS(batch.Range(192 / vec_length, batch_size), len, theta);
617
+ }
618
+ }
619
+
620
+ template<int N, int vec_length>
621
+ static void ProcessVectorizedBatchSS(FlatArray<RecordingSS*> batch, double len, double theta) {
622
+
623
+ // *testout << "Processing vectorized S->S batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
624
+ double kappa = batch[0]->mp_source->Kappa();
625
+ int so = batch[0]->mp_source->Order();
626
+ int to = batch[0]->mp_target->Order();
627
+ SphericalExpansion<Singular, Vec<N,Complex>> vec_source(so, kappa, batch[0]->mp_source->RTyp());
628
+ SphericalExpansion<Singular, Vec<N,Complex>> vec_target(to, kappa, batch[0]->mp_target->RTyp());
629
+
630
+ // Copy multipoles into vectorized multipole
631
+ for (int i = 0; i < batch.Size(); i++)
632
+ {
633
+ auto source_i = VecVector2Matrix (batch[i]->mp_source->SH().Coefs());
634
+ auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
635
+ batch[i]->mp_source->SH().RotateZFlip(batch[i]->phi, batch[i]->flipz,
636
+ [source_i, source_mati] (size_t ii, Complex factor)
637
+ {
638
+ source_mati.Row(ii) = factor * source_i.Row(ii);
639
+ });
640
+ }
641
+
642
+ vec_source.SH().RotateY(theta, vec_source.SH().Order() >= 100);
643
+ vec_source.ShiftZ(-len, vec_target);
644
+ vec_target.SH().RotateY(-theta, vec_target.SH().Order() >= 100);
645
+
646
+ // Copy vectorized multipole into individual multipoles
647
+ for (int i = 0; i < batch.Size(); i++)
648
+ {
649
+ auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
650
+ auto target_mati = VecVector2Matrix (batch[i]->mp_target->SH().Coefs());
651
+ batch[i]->mp_target->SH().RotateZFlip(-batch[i]->phi, batch[i]->flipz,
652
+ [source_mati, target_mati] (size_t ii, Complex factor)
653
+ {
654
+ AtomicAdd (target_mati.Row(ii), factor * source_mati.Row(ii));
655
+ });
656
+ }
657
+ }
658
+
366
659
  struct Node
367
660
  {
368
661
  Vec<3> center;
369
662
  double r;
370
663
  int level;
371
664
  std::array<unique_ptr<Node>,8> childs;
372
- MultiPole<MPSingular, entry_type> mp;
665
+ SphericalExpansion<Singular, entry_type> mp;
373
666
 
374
667
  Array<tuple<Vec<3>, entry_type>> charges;
375
668
  Array<tuple<Vec<3>, Vec<3>, entry_type>> dipoles;
669
+ Array<tuple<Vec<3>, entry_type, Vec<3>, entry_type>> chargedipoles;
376
670
  Array<tuple<Vec<3>, Vec<3>, Complex,int>> currents;
671
+
672
+ using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
673
+ Array<tuple<Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_charges;
674
+ Array<tuple<Vec<3,SIMD<double,FMM_SW>>, Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_dipoles;
675
+ Array<tuple<Vec<3,SIMD<double,FMM_SW>>, simd_entry_type,
676
+ Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_chargedipoles;
677
+
377
678
  int total_sources;
679
+ const FMM_Parameters & fmm_params;
680
+ std::mutex node_mutex;
681
+ atomic<bool> have_childs{false};
378
682
 
379
- Node (Vec<3> acenter, double ar, int alevel, int order, double kappa)
380
- : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*kappa), kappa, min(1.0, r*kappa))
381
- // : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*kappa), kappa, 1.0)
683
+ Node (Vec<3> acenter, double ar, int alevel, double akappa, const FMM_Parameters & afmm_params)
684
+ // : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*akappa), akappa, ar), fmm_params(afmm_params)
685
+ : center(acenter), r(ar), level(alevel), mp(afmm_params.minorder+2*ar*akappa, akappa, ar), fmm_params(afmm_params)
382
686
  {
383
- // cout << "singml, add node, level = " << level << endl;
384
687
  if (level < nodes_on_level.Size())
385
688
  nodes_on_level[level]++;
386
689
  }
387
690
 
388
-
691
+ int GetChildNum (Vec<3> x) const
692
+ {
693
+ int childnum = 0;
694
+ if (x(0) > center(0)) childnum += 1;
695
+ if (x(1) > center(1)) childnum += 2;
696
+ if (x(2) > center(2)) childnum += 4;
697
+ return childnum;
698
+ }
699
+
389
700
  void CreateChilds()
390
701
  {
391
702
  if (childs[0]) throw Exception("have already childs");
@@ -395,78 +706,127 @@ namespace ngfem
395
706
  cc(0) += (i&1) ? r/2 : -r/2;
396
707
  cc(1) += (i&2) ? r/2 : -r/2;
397
708
  cc(2) += (i&4) ? r/2 : -r/2;
398
- childs[i] = make_unique<Node> (cc, r/2, level+1, max(mp.SH().Order()/2, 8), mp.Kappa());
709
+ childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa(), fmm_params);
399
710
  }
711
+ have_childs = true;
400
712
  }
401
713
 
402
714
 
715
+ void SendSourcesToChilds()
716
+ {
717
+ CreateChilds();
718
+
719
+ for (auto [x,c] : charges)
720
+ AddCharge (x,c);
721
+ for (auto [x,d,c] : dipoles)
722
+ AddDipole (x,d,c);
723
+ for (auto [x,c,d,c2] : chargedipoles)
724
+ AddChargeDipole (x,c,d,c2);
725
+ for (auto [sp,ep,j,num] : currents)
726
+ AddCurrent (sp,ep,j,num);
727
+
728
+ charges.DeleteAll();
729
+ dipoles.DeleteAll();
730
+ chargedipoles.DeleteAll();
731
+ currents.DeleteAll();
732
+ }
733
+
734
+
403
735
  void AddCharge (Vec<3> x, entry_type c)
404
736
  {
405
- if (childs[0])
737
+ if (have_childs) // quick check without locking
406
738
  {
407
739
  // directly send to childs:
408
- int childnum = 0;
409
- if (x(0) > center(0)) childnum += 1;
410
- if (x(1) > center(1)) childnum += 2;
411
- if (x(2) > center(2)) childnum += 4;
740
+ int childnum = GetChildNum(x);
412
741
  childs[childnum] -> AddCharge(x, c);
413
742
  return;
414
743
  }
415
744
 
416
- charges.Append( tuple{x,c} );
745
+ lock_guard<mutex> guard(node_mutex);
417
746
 
418
- if (r*mp.Kappa() < 1e-8) return;
419
- if (charges.Size() < maxdirect && r*mp.Kappa() < 1)
420
- return;
747
+ if (have_childs) // test again after locking
748
+ {
749
+ int childnum = GetChildNum(x);
750
+ childs[childnum] -> AddCharge(x, c);
751
+ return;
752
+ }
421
753
 
422
- CreateChilds();
754
+ charges.Append( tuple{x,c} );
423
755
 
424
- for (auto [x,c] : charges)
425
- AddCharge (x,c);
426
- for (auto [x,d,c] : dipoles)
427
- AddDipole (x,d,c);
428
- for (auto [sp,ep,j,num] : currents)
429
- AddCurrent (sp,ep,j,num);
756
+ // if (r*mp.Kappa() < 1e-8) return;
757
+ if (level > 20) return;
758
+ if (charges.Size() < fmm_params.maxdirect && r*mp.Kappa() < 5)
759
+ return;
430
760
 
431
- charges.SetSize0();
432
- dipoles.SetSize0();
433
- currents.SetSize0();
761
+ SendSourcesToChilds();
434
762
  }
435
763
 
436
764
 
437
765
  void AddDipole (Vec<3> x, Vec<3> d, entry_type c)
438
766
  {
439
- if (childs[0])
767
+ if (have_childs)
440
768
  {
441
769
  // directly send to childs:
442
-
443
- int childnum = 0;
444
- if (x(0) > center(0)) childnum += 1;
445
- if (x(1) > center(1)) childnum += 2;
446
- if (x(2) > center(2)) childnum += 4;
770
+ int childnum = GetChildNum(x);
447
771
  childs[childnum] -> AddDipole(x, d, c);
448
772
  return;
449
773
  }
450
774
 
451
- dipoles.Append (tuple{x,d,c});
775
+ lock_guard<mutex> guard(node_mutex);
452
776
 
453
- if (dipoles.Size() < maxdirect || r < 1e-8)
777
+ if (have_childs)
778
+ {
779
+ // directly send to childs:
780
+ int childnum = GetChildNum(x);
781
+ childs[childnum] -> AddDipole(x, d, c);
782
+ return;
783
+ }
784
+
785
+ dipoles.Append (tuple{x,d,c});
786
+
787
+ if (level > 20) return;
788
+ if (dipoles.Size() < fmm_params.maxdirect)
454
789
  return;
790
+
791
+ SendSourcesToChilds();
792
+ }
793
+
794
+
795
+ void AddChargeDipole (Vec<3> x, entry_type c, Vec<3> dir, entry_type c2)
796
+ {
797
+ if (have_childs)
798
+ {
799
+ // directly send to childs:
800
+ int childnum = GetChildNum(x);
801
+ childs[childnum] -> AddChargeDipole(x, c, dir, c2);
802
+ return;
803
+ }
804
+
805
+ lock_guard<mutex> guard(node_mutex);
806
+
807
+ if (have_childs)
808
+ {
809
+ // directly send to childs:
810
+ int childnum = GetChildNum(x);
811
+ childs[childnum] -> AddChargeDipole(x, c, dir, c2);
812
+ return;
813
+ }
455
814
 
456
- CreateChilds();
815
+ chargedipoles.Append (tuple{x,c,dir,c2});
457
816
 
458
- for (auto [x,c] : charges)
459
- AddCharge (x,c);
460
- for (auto [x,d,c] : dipoles)
461
- AddDipole (x,d,c);
462
- for (auto [sp,ep,j,num] : currents)
463
- AddCurrent (sp,ep,j,num);
817
+ if (chargedipoles.Size() < fmm_params.maxdirect || r < 1e-8)
818
+ return;
464
819
 
465
- charges.SetSize0();
466
- dipoles.SetSize0();
467
- currents.SetSize0();
820
+ SendSourcesToChilds();
821
+
822
+ /*
823
+ AddCharge (x, c);
824
+ AddDipole (x, dir, c2);
825
+ */
468
826
  }
469
827
 
828
+
829
+ // not parallel yet
470
830
  void AddCurrent (Vec<3> sp, Vec<3> ep, Complex j, int num)
471
831
  {
472
832
  if (childs[0])
@@ -475,7 +835,7 @@ namespace ngfem
475
835
  Array<double> split;
476
836
  split.Append(0);
477
837
  for (int i = 0; i < 3; i++)
478
- if (sp(i) < center(i) != ep(i) < center(i))
838
+ if ((sp(i) < center(i)) != (ep(i) < center(i)))
479
839
  split += (center(i)-sp(i)) / (ep(i)-sp(i)); // segment cuts i-th coordinate plane
480
840
  split.Append(1);
481
841
  BubbleSort(split);
@@ -496,9 +856,15 @@ namespace ngfem
496
856
  }
497
857
  return;
498
858
  }
499
-
859
+
500
860
  currents.Append (tuple{sp,ep,j,num});
501
861
 
862
+ // if (currents.Size() < maxdirect || r < 1e-8)
863
+ if (currents.Size() < 4 || r < 1e-8)
864
+ return;
865
+
866
+ SendSourcesToChilds();
867
+ /*
502
868
  // if (currents.Size() < maxdirect || r < 1e-8)
503
869
  if (currents.Size() < 4 || r < 1e-8)
504
870
  return;
@@ -515,6 +881,7 @@ namespace ngfem
515
881
  charges.SetSize0();
516
882
  dipoles.SetSize0();
517
883
  currents.SetSize0();
884
+ */
518
885
  }
519
886
 
520
887
 
@@ -530,47 +897,169 @@ namespace ngfem
530
897
  return sum;
531
898
  }
532
899
 
533
- for (auto [x,c] : charges)
534
- if (double rho = L2Norm(p-x); rho > 0)
535
- sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
536
-
537
- for (auto [x,d,c] : dipoles)
538
- if (double rho = L2Norm(p-x); rho > 0)
539
- {
540
- Vec<3> drhodp = 1.0/rho * (p-x);
541
- Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
542
- (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
543
- sum += dGdrho * InnerProduct(drhodp, d) * c;
544
- }
545
-
546
- for (auto [sp,ep,j,num] : currents)
900
+ if (simd_charges.Size())
547
901
  {
548
- // should use explizit formula instead ...
902
+ // static Timer t("mptool singmp, evaluate, simd charges"); RegionTimer r(t);
903
+ // t.AddFlops (charges.Size());
549
904
 
550
- Vec<3> tau = ep-sp;
551
- Vec<3> tau_num = 1.0/num * tau;
552
- for (int i = 0; i < num; i++)
905
+ simd_entry_type vsum{0.0};
906
+ if (mp.Kappa() < 1e-12)
553
907
  {
554
- Vec<3> x = sp+(i+0.5)*tau_num;
555
-
556
- if (double rho = L2Norm(p-x); rho > 0)
908
+ for (auto [x,c] : simd_charges)
557
909
  {
558
- Vec<3> drhodp = 1.0/rho * (p-x);
559
- Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
560
- (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
561
-
562
- if constexpr (std::is_same<entry_type, Vec<3,Complex>>())
563
- sum += j*dGdrho * Cross(drhodp, tau_num);
910
+ auto rho = L2Norm(p-x);
911
+ auto kernel = 1/(4*M_PI)/rho;
912
+ kernel = If(rho > 0.0, kernel, SIMD<double,FMM_SW>(0.0));
913
+ vsum += kernel * c;
914
+
915
+ /*
916
+ auto rho2 = L2Norm2(p-x);
917
+ auto kernel = (1/(4*M_PI)) * rsqrt(rho2);
918
+ kernel = If(rho2 > 0.0, kernel, SIMD<double,FMM_SW>(0.0));
919
+ vsum += kernel * c;
920
+ */
564
921
  }
565
922
  }
566
- }
567
-
923
+ else if (mp.Kappa() < 1e-8)
924
+ for (auto [x,c] : simd_charges)
925
+ {
926
+ auto rho = L2Norm(p-x);
927
+ auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW> (1,rho*mp.Kappa()) / rho;
928
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
929
+ vsum += kernel * c;
930
+ }
931
+ else
932
+ for (auto [x,c] : simd_charges)
933
+ {
934
+ auto rho = L2Norm(p-x);
935
+ auto [si,co] = sincos(rho*mp.Kappa());
936
+ auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) / rho;
937
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
938
+ vsum += kernel * c;
939
+ }
940
+
941
+ sum += HSum(vsum);
942
+ }
943
+ else
944
+ {
945
+ if (mp.Kappa() < 1e-8)
946
+ {
947
+ for (auto [x,c] : charges)
948
+ if (double rho = L2Norm(p-x); rho > 0)
949
+ sum += (1/(4*M_PI))*Complex(1,rho*mp.Kappa()) / rho * c;
950
+ }
951
+ else
952
+ for (auto [x,c] : charges)
953
+ if (double rho = L2Norm(p-x); rho > 0)
954
+ sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
955
+ }
956
+
957
+ if (simd_dipoles.Size())
958
+ {
959
+ // static Timer t("mptool singmp, evaluate, simd dipoles"); RegionTimer r(t);
960
+
961
+ simd_entry_type vsum{0.0};
962
+ for (auto [x,d,c] : simd_dipoles)
963
+ {
964
+ auto rho = L2Norm(p-x);
965
+ auto drhodp = (1.0/rho) * (p-x);
966
+ auto [si,co] = sincos(rho*mp.Kappa());
967
+ auto dGdrho = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) *
968
+ (-1.0/(rho*rho) + SIMD<Complex,FMM_SW>(0, mp.Kappa())/rho);
969
+ auto kernel = dGdrho * InnerProduct(drhodp, d);
970
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
971
+ vsum += kernel * c;
972
+ }
973
+ sum += HSum(vsum);
974
+ }
975
+ else
976
+ {
977
+ for (auto [x,d,c] : dipoles)
978
+ if (double rho = L2Norm(p-x); rho > 0)
979
+ {
980
+ Vec<3> drhodp = 1.0/rho * (p-x);
981
+ Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
982
+ (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
983
+ sum += dGdrho * InnerProduct(drhodp, d) * c;
984
+ }
985
+ }
986
+
987
+
988
+
989
+ if (simd_chargedipoles.Size())
990
+ {
991
+ // static Timer t("mptool singmp, evaluate, simd chargedipoles"); RegionTimer r(t);
992
+ // t.AddFlops (simd_chargedipoles.Size()*FMM_SW);
993
+
994
+ simd_entry_type vsum{0.0};
995
+ for (auto [x,c,d,c2] : simd_chargedipoles)
996
+ {
997
+ auto rho = L2Norm(p-x);
998
+ auto rhokappa = rho*mp.Kappa();
999
+ auto invrho = If(rho>0.0, 1.0/rho, SIMD<double,FMM_SW>(0.0));
1000
+ auto [si,co] = sincos(rhokappa);
1001
+
1002
+ auto kernelc = (1/(4*M_PI))*invrho*SIMD<Complex,FMM_SW>(co,si);
1003
+ vsum += kernelc * c;
1004
+
1005
+ auto kernel =
1006
+ invrho*invrho * InnerProduct(p-x, d) *
1007
+ kernelc * SIMD<Complex,FMM_SW>(-1.0, rhokappa);
1008
+
1009
+ vsum += kernel * c2;
1010
+ }
1011
+ sum += HSum(vsum);
1012
+ }
1013
+ else
1014
+ {
1015
+ // static Timer t("mptool singmp, evaluate, chargedipoles"); RegionTimer r(t);
1016
+ // t.AddFlops (chargedipoles.Size());
1017
+
1018
+ for (auto [x,c,d,c2] : chargedipoles)
1019
+ if (double rho = L2Norm(p-x); rho > 0)
1020
+ {
1021
+ sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
1022
+
1023
+ Vec<3> drhodp = 1.0/rho * (p-x);
1024
+ Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
1025
+ (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
1026
+
1027
+ sum += dGdrho * InnerProduct(drhodp, d) * c2;
1028
+ }
1029
+ }
1030
+
1031
+
1032
+
1033
+
1034
+
1035
+ for (auto [sp,ep,j,num] : currents)
1036
+ {
1037
+ // should use explizit formula instead ...
1038
+
1039
+ Vec<3> tau = ep-sp;
1040
+ Vec<3> tau_num = 1.0/num * tau;
1041
+ for (int i = 0; i < num; i++)
1042
+ {
1043
+ Vec<3> x = sp+(i+0.5)*tau_num;
1044
+
1045
+ if (double rho = L2Norm(p-x); rho > 0)
1046
+ {
1047
+ Vec<3> drhodp = 1.0/rho * (p-x);
1048
+ Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
1049
+ (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
1050
+
1051
+ if constexpr (std::is_same<entry_type, Vec<3,Complex>>())
1052
+ sum += j*dGdrho * Cross(drhodp, tau_num);
1053
+ }
1054
+ }
1055
+ }
1056
+
568
1057
  return sum;
569
1058
  }
570
1059
 
571
1060
  entry_type EvaluateDeriv(Vec<3> p, Vec<3> d) const
572
1061
  {
573
- entry_type sum = 0;
1062
+ entry_type sum{0.0};
574
1063
  if (childs[0])
575
1064
  {
576
1065
  for (auto & child : childs)
@@ -579,7 +1068,16 @@ namespace ngfem
579
1068
  }
580
1069
 
581
1070
  if (dipoles.Size())
582
- throw Exception("EvaluateDeriv not implemented for dipoles in SingularMLMultiPole");
1071
+ {
1072
+ static int cnt = 0;
1073
+ cnt++;
1074
+ if (cnt < 3)
1075
+ cout << "we know what we do - evaluateDeriv not implemented for dipoles in SingularMLExpansion" << endl;
1076
+ // return sum;
1077
+ // throw Exception("EvaluateDeriv not implemented for dipoles in SingularMLExpansion");
1078
+ }
1079
+ if (chargedipoles.Size())
1080
+ throw Exception("EvaluateDeriv not implemented for dipoles in SingularMLExpansion");
583
1081
 
584
1082
  for (auto [x,c] : charges)
585
1083
  if (double rho = L2Norm(p-x); rho > 0)
@@ -594,7 +1092,7 @@ namespace ngfem
594
1092
 
595
1093
  void CalcTotalSources()
596
1094
  {
597
- total_sources = charges.Size() + dipoles.Size();
1095
+ total_sources = charges.Size() + dipoles.Size() + chargedipoles.Size();
598
1096
  for (auto & child : childs)
599
1097
  if (child)
600
1098
  {
@@ -603,46 +1101,111 @@ namespace ngfem
603
1101
  }
604
1102
  }
605
1103
 
606
- void CalcMP()
1104
+ void CalcMP(Array<RecordingSS> * recording, Array<Node*> * nodes_to_process)
607
1105
  {
608
- mp.SH().Coefs() = 0.0;
1106
+ // mp.SH().Coefs() = 0.0;
609
1107
  if (childs[0])
610
1108
  {
611
- if (total_sources < 1000)
1109
+ if (total_sources < 1000 || recording)
612
1110
  for (auto & child : childs)
613
- child->CalcMP();
1111
+ child->CalcMP(recording, nodes_to_process);
614
1112
  else
615
1113
  ParallelFor (8, [&] (int nr)
616
1114
  {
617
- childs[nr] -> CalcMP();
1115
+ childs[nr] -> CalcMP(recording, nodes_to_process);
618
1116
  });
619
1117
 
620
1118
 
621
- for (auto & child : childs)
622
- child->mp.TransformAdd(mp, center-child->center);
1119
+ for (auto & child : childs){
1120
+ if (recording && child->mp.SH().Coefs().Size() > 0)
1121
+ *recording += RecordingSS(&child->mp, &mp, center-child->center);
1122
+ else
1123
+ child->mp.TransformAdd(mp, center-child->center);
1124
+ }
623
1125
  }
624
1126
  else
625
1127
  {
626
- if (charges.Size()+dipoles.Size()+currents.Size() == 0)
1128
+ if (charges.Size()+dipoles.Size()+chargedipoles.Size()+currents.Size() == 0)
627
1129
  {
628
- mp = MultiPole<MPSingular,entry_type> (-1, mp.Kappa());
1130
+ mp = SphericalExpansion<Singular,entry_type> (-1, mp.Kappa(), 1.);
629
1131
  return;
630
1132
  }
631
1133
 
632
- for (auto [x,c] : charges)
633
- mp.AddCharge (x-center,c);
634
-
635
- for (auto [x,d,c] : dipoles)
636
- mp.AddDipole (x-center, d, c);
1134
+ // make simd charges, comment this block for testing ...
1135
+ simd_charges.SetSize( (charges.Size()+FMM_SW-1)/FMM_SW);
1136
+ size_t i = 0, ii = 0;
1137
+ for ( ; i+FMM_SW <= charges.Size(); i+=FMM_SW, ii++)
1138
+ {
1139
+ std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
1140
+ for (int j = 0; j < FMM_SW; j++) ca[j] = charges[i+j];
1141
+ simd_charges[ii] = MakeSimd(ca);
1142
+ }
1143
+ if (i < charges.Size())
1144
+ {
1145
+ std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
1146
+ int j = 0;
1147
+ for ( ; i+j < charges.Size(); j++) ca[j] = charges[i+j];
1148
+ for ( ; j < FMM_SW; j++) ca[j] = tuple( get<0>(ca[0]), entry_type{0.0} );
1149
+ simd_charges[ii] = MakeSimd(ca);
1150
+ }
1151
+
1152
+ simd_dipoles.SetSize( (dipoles.Size()+FMM_SW-1)/FMM_SW);
1153
+ i = 0, ii = 0;
1154
+ for ( ; i+FMM_SW <= dipoles.Size(); i+=FMM_SW, ii++)
1155
+ {
1156
+ std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
1157
+ for (int j = 0; j < FMM_SW; j++) di[j] = dipoles[i+j];
1158
+ simd_dipoles[ii] = MakeSimd(di);
1159
+ }
1160
+ if (i < dipoles.Size())
1161
+ {
1162
+ std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
1163
+ int j = 0;
1164
+ for ( ; i+j < dipoles.Size(); j++) di[j] = dipoles[i+j];
1165
+ for ( ; j < FMM_SW; j++) di[j] = tuple( get<0>(di[0]), get<1>(di[0]), entry_type{0.0} );
1166
+ simd_dipoles[ii] = MakeSimd(di);
1167
+ }
637
1168
 
638
- for (auto [sp,ep,j,num] : currents)
639
- mp.AddCurrent (sp-center, ep-center, j, num);
1169
+
1170
+ simd_chargedipoles.SetSize( (chargedipoles.Size()+FMM_SW-1)/FMM_SW);
1171
+ i = 0, ii = 0;
1172
+ for ( ; i+FMM_SW <= chargedipoles.Size(); i+=FMM_SW, ii++)
1173
+ {
1174
+ std::array<tuple<Vec<3>,entry_type,Vec<3>,entry_type>, FMM_SW> di;
1175
+ for (int j = 0; j < FMM_SW; j++) di[j] = chargedipoles[i+j];
1176
+ simd_chargedipoles[ii] = MakeSimd(di);
1177
+ }
1178
+ if (i < chargedipoles.Size())
1179
+ {
1180
+ std::array<tuple<Vec<3>,entry_type,Vec<3>,entry_type>, FMM_SW> di;
1181
+ int j = 0;
1182
+ for ( ; i+j < chargedipoles.Size(); j++) di[j] = chargedipoles[i+j];
1183
+ for ( ; j < FMM_SW; j++) di[j] = tuple( get<0>(di[0]), entry_type{0.0}, get<2>(di[0]), entry_type{0.0} );
1184
+ simd_chargedipoles[ii] = MakeSimd(di);
1185
+ }
1186
+
1187
+
1188
+ if (nodes_to_process)
1189
+ *nodes_to_process += this;
1190
+ else {
1191
+ for (auto [x,c] : charges)
1192
+ mp.AddCharge (x-center,c);
1193
+
1194
+ for (auto [x,d,c] : dipoles)
1195
+ mp.AddDipole (x-center, d, c);
1196
+
1197
+ for (auto [x,c,d,c2] : chargedipoles)
1198
+ mp.AddChargeDipole (x-center, c, d, c2);
1199
+
1200
+ for (auto [sp,ep,j,num] : currents)
1201
+ mp.AddCurrent (sp-center, ep-center, j, num);
1202
+ }
640
1203
  }
641
1204
  }
642
1205
 
643
1206
  entry_type EvaluateMP(Vec<3> p) const
644
1207
  {
645
- if (charges.Size() || dipoles.Size())
1208
+ if (charges.Size() || dipoles.Size() || chargedipoles.Size())
646
1209
  return Evaluate(p);
647
1210
 
648
1211
  if (L2Norm(p-center) > 3*r)
@@ -662,7 +1225,7 @@ namespace ngfem
662
1225
  // cout << "EvaluateMPDeriv Singular, p = " << p << ", d = " << d << ", r = " << r << ", center = " << center << endl;
663
1226
  // cout << "Norm: " << L2Norm(p-center) << " > " << 3*r << endl;
664
1227
  // cout << "charges.Size() = " << charges.Size() << ", dipoles.Size() = " << dipoles.Size() << endl;
665
- if (charges.Size() || dipoles.Size() || !childs[0])
1228
+ if (charges.Size() || dipoles.Size() || chargedipoles.Size() || !childs[0])
666
1229
  return EvaluateDeriv(p, d);
667
1230
 
668
1231
  if (L2Norm(p-center) > 3*r)
@@ -685,6 +1248,8 @@ namespace ngfem
685
1248
  ost << "xi = " << x << ", ci = " << c << endl;
686
1249
  for (auto [x,d,c] : dipoles)
687
1250
  ost << "xi = " << x << ", di = " << d << ", ci = " << c << endl;
1251
+ for (auto [x,c,d,c2] : chargedipoles)
1252
+ ost << "xi = " << x << ", c = " << c << ", di = " << d << ", ci = " << c2 << endl;
688
1253
 
689
1254
  for (int i = 0; i < 8; i++)
690
1255
  if (childs[i]) childs[i] -> Print (ost, i);
@@ -707,14 +1272,23 @@ namespace ngfem
707
1272
  num += ch->NumCoefficients();
708
1273
  return num;
709
1274
  }
1275
+
1276
+ void TraverseTree (const std::function<void(Node&)> & func)
1277
+ {
1278
+ func(*this);
1279
+ for (auto & child : childs)
1280
+ if (child)
1281
+ child->TraverseTree(func);
1282
+ }
710
1283
  };
711
1284
 
712
- Node root;
1285
+ FMM_Parameters fmm_params;
1286
+ Node root;
713
1287
  bool havemp = false;
714
1288
 
715
1289
  public:
716
- SingularMLMultiPole (Vec<3> center, double r, int order, double kappa)
717
- : root(center, r, 0, order, kappa)
1290
+ SingularMLExpansion (Vec<3> center, double r, double kappa, FMM_Parameters _params = FMM_Parameters())
1291
+ : fmm_params(_params), root(center, r, 0, kappa, fmm_params)
718
1292
  {
719
1293
  nodes_on_level = 0;
720
1294
  nodes_on_level[0] = 1;
@@ -722,16 +1296,21 @@ namespace ngfem
722
1296
 
723
1297
  double Kappa() const { return root.mp.Kappa(); }
724
1298
 
725
- void AddCharge(Vec<3> x, Complex c)
1299
+ void AddCharge(Vec<3> x, entry_type c)
726
1300
  {
727
1301
  root.AddCharge(x, c);
728
1302
  }
729
1303
 
730
- void AddDipole(Vec<3> x, Vec<3> d, Complex c)
1304
+ void AddDipole(Vec<3> x, Vec<3> d, entry_type c)
731
1305
  {
732
1306
  root.AddDipole(x, d, c);
733
1307
  }
734
1308
 
1309
+ void AddChargeDipole(Vec<3> x, entry_type c, Vec<3> dir, entry_type c2)
1310
+ {
1311
+ root.AddChargeDipole(x, c, dir, c2);
1312
+ }
1313
+
735
1314
  void AddCurrent (Vec<3> sp, Vec<3> ep, Complex j, int num)
736
1315
  {
737
1316
  if constexpr (!std::is_same<entry_type, Vec<3,Complex>>())
@@ -756,7 +1335,7 @@ namespace ngfem
756
1335
  }
757
1336
  */
758
1337
  }
759
-
1338
+
760
1339
  void Print (ostream & ost) const
761
1340
  {
762
1341
  root.Print(ost);
@@ -775,6 +1354,10 @@ namespace ngfem
775
1354
  void CalcMP()
776
1355
  {
777
1356
  static Timer t("mptool compute singular MLMP"); RegionTimer rg(t);
1357
+ static Timer ts2mp("mptool compute singular MLMP - source2mp");
1358
+ static Timer tS2S("mptool compute singular MLMP - S->S");
1359
+ static Timer trec("mptool comput singular recording");
1360
+ static Timer tsort("mptool comput singular sort");
778
1361
 
779
1362
  /*
780
1363
  int maxlevel = 0;
@@ -784,9 +1367,96 @@ namespace ngfem
784
1367
  for (int i = 0; i <= maxlevel; i++)
785
1368
  cout << "sing " << i << ": " << nodes_on_level[i] << endl;
786
1369
  */
1370
+
787
1371
  root.CalcTotalSources();
788
- root.CalcMP();
1372
+
1373
+ if constexpr (false)
1374
+ // direct evaluation of S->S
1375
+ root.CalcMP(nullptr, nullptr);
1376
+ else
1377
+ {
1378
+
1379
+ Array<RecordingSS> recording;
1380
+ Array<Node*> nodes_to_process;
1381
+
1382
+ {
1383
+ RegionTimer reg(trec);
1384
+ root.CalcMP(&recording, &nodes_to_process);
1385
+ }
1386
+
1387
+ {
1388
+ RegionTimer rs2mp(ts2mp);
1389
+ ParallelFor(nodes_to_process.Size(), [&](int i)
1390
+ {
1391
+ auto node = nodes_to_process[i];
1392
+ for (auto [x,c]: node->charges)
1393
+ node->mp.AddCharge(x-node->center, c);
1394
+ for (auto [x,d,c]: node->dipoles)
1395
+ node->mp.AddDipole(x-node->center, d, c);
1396
+ for (auto [x,c,d,c2]: node->chargedipoles)
1397
+ node->mp.AddChargeDipole(x-node->center, c, d, c2);
1398
+ for (auto [sp,ep,j,num]: node->currents)
1399
+ node->mp.AddCurrent(sp-node->center, ep-node->center, j, num);
1400
+ }, TasksPerThread(4));
1401
+ }
1402
+
1403
+ {
1404
+ RegionTimer reg(tsort);
1405
+ QuickSort (recording, [] (auto & a, auto & b)
1406
+ {
1407
+ if (a.len < (1-1e-8) * b.len) return true;
1408
+ if (a.len > (1+1e-8) * b.len) return false;
1409
+ return a.theta < b.theta;
1410
+ });
1411
+ }
789
1412
 
1413
+ double current_len = -1e100;
1414
+ double current_theta = -1e100;
1415
+ Array<RecordingSS*> current_batch;
1416
+ Array<Array<RecordingSS*>> batch_group;
1417
+ Array<double> group_lengths;
1418
+ Array<double> group_thetas;
1419
+ for (auto & record : recording)
1420
+ {
1421
+ bool len_changed = fabs(record.len - current_len) > 1e-8;
1422
+ bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
1423
+ if ((len_changed || theta_changed) && current_batch.Size() > 0) {
1424
+ batch_group.Append(current_batch);
1425
+ group_lengths.Append(current_len);
1426
+ group_thetas.Append(current_theta);
1427
+ current_batch.SetSize(0);
1428
+ }
1429
+
1430
+ current_len = record.len;
1431
+ current_theta = record.theta;
1432
+ current_batch.Append(&record);
1433
+ }
1434
+
1435
+ if (current_batch.Size() > 0) {
1436
+ batch_group.Append(current_batch);
1437
+ group_lengths.Append(current_len);
1438
+ group_thetas.Append(current_theta);
1439
+ }
1440
+
1441
+ {
1442
+ RegionTimer rS2S(tS2S);
1443
+ // ParallelFor(batch_group.Size(), [&](int i) {
1444
+ for (int i = 0; i < batch_group.Size(); i++){
1445
+ // *testout << "Processing batch " << i << " of size " << batch_group[i].Size() << ", with len = " << group_lengths[i] << ", theta = " << group_thetas[i] << endl;
1446
+ int chunk_size = 24;
1447
+ if (batch_group[i].Size() < chunk_size)
1448
+ ProcessBatchSS(batch_group[i], group_lengths[i], group_thetas[i]);
1449
+ else
1450
+ ParallelForRange(IntRange(batch_group[i].Size()), [&](IntRange range) {
1451
+ auto sub_batch = batch_group[i].Range(range.First(), range.Next());
1452
+ ProcessBatchSS(sub_batch, group_lengths[i], group_thetas[i]);
1453
+ }, TasksPerThread(4));
1454
+ }
1455
+ }
1456
+ }
1457
+
1458
+ // cout << "have singular:" << endl;
1459
+ // PrintStatistics (cout);
790
1460
  havemp = true;
791
1461
  }
792
1462
 
@@ -798,23 +1468,198 @@ namespace ngfem
798
1468
  return root.Evaluate(p);
799
1469
  }
800
1470
 
1471
+
1472
+ void PrintStatistics (ostream & ost)
1473
+ {
1474
+ int levels = 0;
1475
+ int cnt = 0;
1476
+ root.TraverseTree( [&](Node & node) {
1477
+ levels = max(levels, node.level);
1478
+ cnt++;
1479
+ });
1480
+ ost << "levels: " << levels << endl;
1481
+ ost << "nodes: " << cnt << endl;
1482
+
1483
+ Array<int> num_on_level(levels+1);
1484
+ Array<int> order_on_level(levels+1);
1485
+ Array<size_t> coefs_on_level(levels+1);
1486
+ num_on_level = 0;
1487
+ order_on_level = 0;
1488
+ root.TraverseTree( [&](Node & node) {
1489
+ num_on_level[node.level]++;
1490
+ order_on_level[node.level] = max(order_on_level[node.level],node.mp.Order());
1491
+ coefs_on_level[node.level] += node.mp.SH().Coefs().Size();
1492
+ });
1493
+
1494
+ cout << "num on level" << endl;
1495
+ for (int i = 0; i < num_on_level.Size(); i++)
1496
+ cout << i << ": " << num_on_level[i] << ", order = " << order_on_level[i] << ", coefs " << coefs_on_level[i] << endl;
1497
+
1498
+ size_t totcoefs = 0;
1499
+ for (auto n : coefs_on_level)
1500
+ totcoefs += n;
1501
+ cout << "total mem in coefs: " << sizeof(entry_type)*totcoefs / sqr(1024) << " MB" << endl;
1502
+ }
1503
+
1504
+
1505
+
801
1506
  template <typename entry_type2>
802
- friend class RegularMLMultiPole;
1507
+ friend class RegularMLExpansion;
803
1508
  };
804
1509
 
805
1510
 
806
1511
  template <typename entry_type>
807
- inline ostream & operator<< (ostream & ost, const SingularMLMultiPole<entry_type> & mlmp)
1512
+ inline ostream & operator<< (ostream & ost, const SingularMLExpansion<entry_type> & mlmp)
808
1513
  {
809
1514
  mlmp.Print(ost);
810
1515
  return ost;
811
1516
  }
812
1517
 
813
1518
 
1519
+ // *********************************** Regular multilevel Expansion
1520
+
1521
+
814
1522
  template <typename elem_type=Complex>
815
- class NGS_DLL_HEADER RegularMLMultiPole
1523
+ class NGS_DLL_HEADER RegularMLExpansion
816
1524
  {
817
1525
  static Array<size_t> nodes_on_level;
1526
+
1527
+
1528
+ struct RecordingRS
1529
+ {
1530
+ const SphericalExpansion<Singular,elem_type> * mpS;
1531
+ SphericalExpansion<Regular,elem_type> * mpR;
1532
+ Vec<3> dist;
1533
+ double len, theta, phi;
1534
+ public:
1535
+ RecordingRS() = default;
1536
+ RecordingRS (const SphericalExpansion<Singular,elem_type> * ampS,
1537
+ SphericalExpansion<Regular,elem_type> * ampR,
1538
+ Vec<3> adist)
1539
+ : mpS(ampS), mpR(ampR), dist(adist)
1540
+ {
1541
+ std::tie(len, theta, phi) = SphericalCoordinates(dist);
1542
+ }
1543
+ };
1544
+
1545
+ static void ProcessBatchRS(FlatArray<RecordingRS*> batch, double len, double theta) {
1546
+ // static Timer t("ProcessBatchRS"); RegionTimer reg(t, batch.Size());
1547
+ constexpr int vec_length = VecLength<elem_type>;
1548
+ int batch_size = batch.Size();
1549
+ int N = batch_size * vec_length;
1550
+ // *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(elem_type).name() << ", len = " << len << ", theta = " << theta << endl;
1551
+
1552
+ if (N <= 1 || batch_size <= 1) {
1553
+ for (auto* rec : batch) {
1554
+ rec->mpS->TransformAdd(*rec->mpR, rec->dist);
1555
+ }
1556
+ }
1557
+ else if (N <= 3) {
1558
+ ProcessVectorizedBatchRS<3, vec_length>(batch, len, theta);
1559
+ }
1560
+ else if (N <= 4) {
1561
+ ProcessVectorizedBatchRS<4, vec_length>(batch, len, theta);
1562
+ }
1563
+ else if (N <= 6) {
1564
+ ProcessVectorizedBatchRS<6, vec_length>(batch, len, theta);
1565
+ }
1566
+ else if (N <= 12) {
1567
+ ProcessVectorizedBatchRS<12, vec_length>(batch, len, theta);
1568
+ }
1569
+ else if (N <= 24) {
1570
+ ProcessVectorizedBatchRS<24, vec_length>(batch, len, theta);
1571
+ }
1572
+ else if (N <= 48) {
1573
+ ProcessVectorizedBatchRS<48, vec_length>(batch, len, theta);
1574
+ }
1575
+ else if (N <= 96) {
1576
+ ProcessVectorizedBatchRS<96, vec_length>(batch, len, theta);
1577
+ }
1578
+ else if (N <= 192) {
1579
+ ProcessVectorizedBatchRS<192, vec_length>(batch, len, theta);
1580
+ }
1581
+ else {
1582
+ // Split large batches
1583
+ /*
1584
+ ProcessBatch(batch.Range(0, 192 / vec_length), len, theta);
1585
+ ProcessBatch(batch.Range(192 / vec_length, batch_size), len, theta);
1586
+ */
1587
+
1588
+ /*
1589
+ ParallelFor (2, [&] (int i)
1590
+ {
1591
+ if (i == 0)
1592
+ ProcessBatchRS(batch.Range(0, 192 / vec_length), len, theta);
1593
+ else
1594
+ ProcessBatchRS(batch.Range(192 / vec_length, batch_size), len, theta);
1595
+ }, 2);
1596
+ */
1597
+
1598
+
1599
+ size_t chunksize = 192/vec_length;
1600
+ size_t num = (batch.Size()+chunksize-1) / chunksize;
1601
+ ParallelFor (num, [&](int i)
1602
+ {
1603
+ ProcessBatchRS(batch.Range(i*chunksize, min((i+1)*chunksize, batch.Size())), len, theta);
1604
+ }, num);
1605
+
1606
+ }
1607
+ }
1608
+
1609
+
1610
+ template<int N, int vec_length>
1611
+ static void ProcessVectorizedBatchRS(FlatArray<RecordingRS*> batch, double len, double theta) {
1612
+
1613
+ // static Timer t("ProcessVectorizedBatch, N = "+ToString(N) + ", vec_len = " + ToString(vec_length));
1614
+ // RegionTimer reg(t, batch[0]->mpS->SH().Order());
1615
+ // static Timer ttobatch("mptools - copy to batch 2");
1616
+ // static Timer tfrombatch("mptools - copy from batch 2");
1617
+
1618
+ // *testout << "Processing vectorized batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
1619
+ SphericalExpansion<Singular, Vec<N,Complex>> vec_source(batch[0]->mpS->Order(), batch[0]->mpS->Kappa(), batch[0]->mpS->RTyp());
1620
+ // SphericalExpansion<Singular, elem_type> tmp_source{*batch[0]->mpS};
1621
+ SphericalExpansion<Regular, elem_type> tmp_target{*batch[0]->mpR};
1622
+ SphericalExpansion<Regular, Vec<N,Complex>> vec_target(batch[0]->mpR->Order(), batch[0]->mpR->Kappa(), batch[0]->mpR->RTyp());
1623
+
1624
+ // Copy multipoles into vectorized multipole
1625
+ // ttobatch.Start();
1626
+ for (int i = 0; i < batch.Size(); i++)
1627
+ {
1628
+ auto source_i = VecVector2Matrix (batch[i]->mpS->SH().Coefs());
1629
+ auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
1630
+ batch[i]->mpS->SH().RotateZ(batch[i]->phi,
1631
+ [source_i, source_mati] (size_t ii, Complex factor)
1632
+ {
1633
+ source_mati.Row(ii) = factor * source_i.Row(ii);
1634
+ });
1635
+ }
1636
+
1637
+ // ttobatch.Stop();
1638
+
1639
+ vec_source.SH().RotateY(theta);
1640
+ vec_source.ShiftZ(-len, vec_target);
1641
+ vec_target.SH().RotateY(-theta);
1642
+
1643
+ // Copy vectorized multipole into individual multipoles
1644
+ // tfrombatch.Start();
1645
+ for (int i = 0; i < batch.Size(); i++) {
1646
+ // auto source_i = VecVector2Matrix (tmp_target.SH().Coefs());
1647
+ auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
1648
+ auto targeti = VecVector2Matrix(batch[i]->mpR->SH().Coefs());
1649
+
1650
+ tmp_target.SH().RotateZ(-batch[i]->phi,
1651
+ [source_mati, targeti] (size_t ii, Complex factor)
1652
+ {
1653
+ // source_i.Row(ii) = factor * source_mati.Row(ii);
1654
+ AtomicAdd (VectorView(targeti.Row(ii)), factor * source_mati.Row(ii));
1655
+ });
1656
+ // for (int j = 0; j < tmp_target.SH().Coefs().Size(); j++)
1657
+ // AtomicAdd(batch[i]->mpR->SH().Coefs()[j], tmp_target.SH().Coefs()[j]);
1658
+ }
1659
+ // tfrombatch.Stop();
1660
+
1661
+ }
1662
+
818
1663
 
819
1664
  struct Node
820
1665
  {
@@ -822,22 +1667,35 @@ namespace ngfem
822
1667
  double r;
823
1668
  int level;
824
1669
  std::array<unique_ptr<Node>,8> childs;
825
- MultiPole<MPRegular,elem_type> mp;
1670
+ SphericalExpansion<Regular,elem_type> mp;
826
1671
  Array<Vec<3>> targets;
1672
+ Array<tuple<Vec<3>,double>> vol_targets;
827
1673
  int total_targets;
1674
+ std::mutex node_mutex;
1675
+ atomic<bool> have_childs{false};
828
1676
 
829
- Array<const typename SingularMLMultiPole<elem_type>::Node*> singnodes;
1677
+ Array<const typename SingularMLExpansion<elem_type>::Node*> singnodes;
1678
+ const FMM_Parameters & params;
830
1679
 
831
- Node (Vec<3> acenter, double ar, int alevel, int order, double kappa)
832
- : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*kappa), kappa, 1.0/min(1.0, 0.25*r*kappa))
1680
+
1681
+ Node (Vec<3> acenter, double ar, int alevel, double kappa, const FMM_Parameters & _params)
1682
+ : center(acenter), r(ar), level(alevel),
1683
+ // mp(MPOrder(ar*kappa), kappa, ar) // 1.0/min(1.0, 0.25*r*kappa))
1684
+ mp(-1, kappa, ar), params(_params)
833
1685
  // : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*kappa), kappa, 1.0)
834
1686
  {
835
1687
  if (level < nodes_on_level.Size())
836
1688
  nodes_on_level[level]++;
837
1689
  }
838
1690
 
839
-
840
- void CreateChilds()
1691
+ void Allocate()
1692
+ {
1693
+ // mp = SphericalExpansion<Regular,elem_type>(MPOrder(r*mp.Kappa()), mp.Kappa(), r);
1694
+ mp = SphericalExpansion<Regular,elem_type>(params.minorder+2*r*mp.Kappa(), mp.Kappa(), r);
1695
+ }
1696
+
1697
+
1698
+ void CreateChilds(bool allocate = false)
841
1699
  {
842
1700
  if (childs[0]) throw Exception("have already childs");
843
1701
  // create children nodes:
@@ -847,15 +1705,19 @@ namespace ngfem
847
1705
  cc(0) += (i&1) ? r/2 : -r/2;
848
1706
  cc(1) += (i&2) ? r/2 : -r/2;
849
1707
  cc(2) += (i&4) ? r/2 : -r/2;
850
- childs[i] = make_unique<Node> (cc, r/2, level+1, max(mp.SH().Order()/2, 8), mp.Kappa());
1708
+ childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa(), params);
1709
+ if (allocate)
1710
+ childs[i] -> Allocate();
851
1711
  }
1712
+ have_childs = true;
852
1713
  }
853
-
854
- void AddSingularNode (const typename SingularMLMultiPole<elem_type>::Node & singnode, bool allow_refine)
1714
+
1715
+ void AddSingularNode (const typename SingularMLExpansion<elem_type>::Node & singnode, bool allow_refine,
1716
+ Array<RecordingRS> * recording)
855
1717
  {
856
1718
  if (mp.SH().Order() < 0) return;
857
1719
  if (singnode.mp.SH().Order() < 0) return;
858
- if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
1720
+ // if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
859
1721
  if (level > 20)
860
1722
  {
861
1723
  singnodes.Append(&singnode);
@@ -874,12 +1736,15 @@ namespace ngfem
874
1736
  singnode.childs[0]->mp.Order() < singnode.mp.Order())
875
1737
  {
876
1738
  for (auto & child : singnode.childs)
877
- AddSingularNode (*child, allow_refine);
1739
+ AddSingularNode (*child, allow_refine, recording);
878
1740
  return;
879
1741
  }
880
1742
 
881
1743
  // static Timer t("mptool transform Helmholtz-criterion"); RegionTimer r(t);
882
- singnode.mp.TransformAdd(mp, dist);
1744
+ if (recording)
1745
+ *recording += RecordingRS(&singnode.mp, &mp, dist);
1746
+ else
1747
+ singnode.mp.TransformAdd(mp, dist);
883
1748
  return;
884
1749
  }
885
1750
 
@@ -895,70 +1760,70 @@ namespace ngfem
895
1760
  if (allow_refine)
896
1761
  {
897
1762
  if (!childs[0])
898
- CreateChilds();
1763
+ CreateChilds(true);
899
1764
 
900
1765
  for (auto & ch : childs)
901
- ch -> AddSingularNode (singnode, allow_refine);
1766
+ ch -> AddSingularNode (singnode, allow_refine, recording);
902
1767
  }
903
1768
  else
904
1769
  {
905
- if (total_targets < 1000)
1770
+ if (total_targets < 1000 || recording)
906
1771
  {
907
1772
  for (auto & ch : childs)
908
1773
  if (ch)
909
- ch -> AddSingularNode (singnode, allow_refine);
1774
+ ch -> AddSingularNode (singnode, allow_refine, recording);
910
1775
  }
911
1776
  else
912
1777
  ParallelFor (8, [&] (int nr)
913
1778
  {
914
1779
  if (childs[nr])
915
- childs[nr] -> AddSingularNode (singnode, allow_refine);
1780
+ childs[nr] -> AddSingularNode (singnode, allow_refine, recording);
916
1781
  });
917
1782
 
918
- if (targets.Size())
1783
+ if (targets.Size()+vol_targets.Size())
919
1784
  singnodes.Append(&singnode);
920
1785
  }
921
1786
  }
922
1787
  else
923
1788
  {
924
1789
  for (auto & childsing : singnode.childs)
925
- AddSingularNode (*childsing, allow_refine);
1790
+ AddSingularNode (*childsing, allow_refine, recording);
926
1791
  }
927
1792
  }
928
1793
 
929
1794
  void LocalizeExpansion(bool allow_refine)
930
1795
  {
931
1796
  if (allow_refine)
932
- if (mp.Order() > 20 && !childs[0])
933
- CreateChilds();
1797
+ if (mp.Order() > 30 && !childs[0])
1798
+ CreateChilds(allow_refine);
934
1799
 
935
1800
  if (childs[0])
936
1801
  {
937
- for (auto & ch : childs)
1802
+ if (total_targets < 1000)
938
1803
  {
939
- if (L2Norm(mp.SH().Coefs()) > 0)
940
- mp.TransformAdd (ch->mp, ch->center-center);
941
- ch->LocalizeExpansion(allow_refine);
1804
+ for (int nr = 0; nr < 8; nr++)
1805
+ {
1806
+ if (L2Norm(mp.SH().Coefs()) > 0)
1807
+ mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
1808
+ childs[nr]->LocalizeExpansion(allow_refine);
1809
+ }
942
1810
  }
943
- mp = MultiPole<MPRegular,elem_type>(-1, mp.Kappa());
1811
+ else
1812
+ ParallelFor(8, [&] (int nr)
1813
+ {
1814
+ if (L2Norm(mp.SH().Coefs()) > 0)
1815
+ mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
1816
+ childs[nr]->LocalizeExpansion(allow_refine);
1817
+ });
1818
+ mp = SphericalExpansion<Regular,elem_type>(-1, mp.Kappa(), 1.);
944
1819
  //mp.SH().Coefs()=0.0;
945
1820
  }
946
1821
  }
947
1822
 
948
1823
  elem_type Evaluate (Vec<3> p) const
949
1824
  {
950
- // *testout << "eval p = " << p << ", level = " << level << ", center = " << center << ", r = " << r << endl;
951
1825
  elem_type sum{0.0};
952
- /*
953
- if (childs[0])
954
- {
955
- int childnum = 0;
956
- if (p(0) > center(0)) childnum += 1;
957
- if (p(1) > center(1)) childnum += 2;
958
- if (p(2) > center(2)) childnum += 4;
959
- sum = childs[childnum]->Evaluate(p);
960
- }
961
- */
1826
+
962
1827
  int childnum = 0;
963
1828
  if (p(0) > center(0)) childnum += 1;
964
1829
  if (p(1) > center(1)) childnum += 2;
@@ -966,13 +1831,16 @@ namespace ngfem
966
1831
  if (childs[childnum])
967
1832
  sum = childs[childnum]->Evaluate(p);
968
1833
  else
969
- sum = mp.Eval(p-center);
970
-
971
-
972
- static Timer t("mptool direct evaluate"); RegionTimer r(t);
973
- for (auto sn : singnodes)
974
- sum += sn->EvaluateMP(p);
1834
+ {
1835
+ // static Timer t("mptool regmp, evaluate reg"); RegionTimer r(t);
1836
+ sum = mp.Eval(p-center);
1837
+ }
975
1838
 
1839
+ {
1840
+ // static Timer t("mptool regmp, evaluate, singnode"); RegionTimer r(t);
1841
+ for (auto sn : singnodes)
1842
+ sum += sn->EvaluateMP(p);
1843
+ }
976
1844
  return sum;
977
1845
  }
978
1846
 
@@ -998,6 +1866,14 @@ namespace ngfem
998
1866
  return sum;
999
1867
  }
1000
1868
 
1869
+ void TraverseTree (const std::function<void(Node&)> & func)
1870
+ {
1871
+ func(*this);
1872
+ for (auto & child : childs)
1873
+ if (child)
1874
+ child->TraverseTree(func);
1875
+ }
1876
+
1001
1877
  double Norm() const
1002
1878
  {
1003
1879
  double norm = L2Norm(mp.SH().Coefs());
@@ -1015,36 +1891,100 @@ namespace ngfem
1015
1891
  num += ch->NumCoefficients();
1016
1892
  return num;
1017
1893
  }
1018
-
1894
+
1895
+ int GetChildNum (Vec<3> x) const
1896
+ {
1897
+ int childnum = 0;
1898
+ if (x(0) > center(0)) childnum += 1;
1899
+ if (x(1) > center(1)) childnum += 2;
1900
+ if (x(2) > center(2)) childnum += 4;
1901
+ return childnum;
1902
+ }
1903
+
1019
1904
  void AddTarget (Vec<3> x)
1020
1905
  {
1021
- if (childs[0])
1906
+ // if (childs[0])
1907
+ if (have_childs) // quick check without locking
1022
1908
  {
1023
1909
  // directly send to childs:
1024
- int childnum = 0;
1025
- if (x(0) > center(0)) childnum += 1;
1026
- if (x(1) > center(1)) childnum += 2;
1027
- if (x(2) > center(2)) childnum += 4;
1910
+ int childnum = GetChildNum(x);
1028
1911
  childs[childnum] -> AddTarget( x );
1029
1912
  return;
1030
1913
  }
1031
1914
 
1915
+ lock_guard<mutex> guard(node_mutex);
1916
+
1917
+ if (have_childs) // test again after locking
1918
+ {
1919
+ // directly send to childs:
1920
+ int childnum = GetChildNum(x);
1921
+ childs[childnum] -> AddTarget(x);
1922
+ return;
1923
+ }
1924
+
1032
1925
  targets.Append( x );
1033
1926
 
1034
- if (r*mp.Kappa() < 1e-8) return;
1035
- if (targets.Size() < maxdirect && r*mp.Kappa() < 1)
1927
+ // if (r*mp.Kappa() < 1e-8) return;
1928
+ if (level > 20) return;
1929
+ if (targets.Size() < params.maxdirect && r*mp.Kappa() < 5)
1930
+ return;
1931
+
1932
+ CreateChilds();
1933
+
1934
+ for (auto t : targets)
1935
+ AddTarget (t);
1936
+ for (auto [x,r] : vol_targets)
1937
+ AddVolumeTarget (x,r);
1938
+
1939
+ targets.SetSize0();
1940
+ vol_targets.SetSize0();
1941
+ }
1942
+
1943
+
1944
+ void AddVolumeTarget (Vec<3> x, double tr)
1945
+ {
1946
+ if (MaxNorm(x-center) > r+tr) return;
1947
+
1948
+ if (have_childs)
1949
+ {
1950
+ for (auto & child : childs)
1951
+ child->AddVolumeTarget(x, tr);
1952
+ return;
1953
+ }
1954
+
1955
+
1956
+ lock_guard<mutex> guard(node_mutex);
1957
+
1958
+ if (have_childs)
1959
+ {
1960
+ for (auto & child : childs)
1961
+ child->AddVolumeTarget(x, tr);
1962
+ return;
1963
+ }
1964
+
1965
+
1966
+ vol_targets.Append (tuple(x,tr));
1967
+
1968
+ if (level > 20) return;
1969
+ if (vol_targets.Size() < params.maxdirect && (r*mp.Kappa() < 5))
1036
1970
  return;
1037
1971
 
1038
1972
  CreateChilds();
1039
1973
 
1040
1974
  for (auto t : targets)
1041
1975
  AddTarget (t);
1976
+ for (auto [x,r] : vol_targets)
1977
+ AddVolumeTarget (x,r);
1978
+
1042
1979
  targets.SetSize0();
1980
+ vol_targets.SetSize0();
1043
1981
  }
1044
1982
 
1983
+
1984
+
1045
1985
  void CalcTotalTargets()
1046
1986
  {
1047
- total_targets = targets.Size();
1987
+ total_targets = targets.Size() + vol_targets.Size();
1048
1988
  for (auto & child : childs)
1049
1989
  if (child)
1050
1990
  {
@@ -1064,8 +2004,21 @@ namespace ngfem
1064
2004
  }
1065
2005
 
1066
2006
  if (total_targets == 0)
1067
- mp = MultiPole<MPRegular>(-1, mp.Kappa());
2007
+ mp = SphericalExpansion<Regular,elem_type>(-1, mp.Kappa(),1.);
2008
+ }
2009
+
2010
+ void AllocateMemory()
2011
+ {
2012
+ for (auto & child : childs)
2013
+ if (child)
2014
+ child->AllocateMemory();
2015
+
2016
+ if (total_targets > 0)
2017
+ Allocate();
2018
+ // mp = SphericalExpansion<Regular,elem_type>(MPOrder(r*mp.Kappa()), mp.Kappa(), r); // -1, mp.Kappa(),1.);
1068
2019
  }
2020
+
2021
+
1069
2022
 
1070
2023
 
1071
2024
  void Print (ostream & ost, size_t childnr = -1) const
@@ -1082,21 +2035,24 @@ namespace ngfem
1082
2035
  }
1083
2036
 
1084
2037
  };
1085
-
2038
+
2039
+ FMM_Parameters fmm_params;
1086
2040
  Node root;
1087
- shared_ptr<SingularMLMultiPole<elem_type>> singmp;
2041
+ shared_ptr<SingularMLExpansion<elem_type>> singmp;
1088
2042
 
1089
2043
  public:
1090
- RegularMLMultiPole (shared_ptr<SingularMLMultiPole<elem_type>> asingmp, Vec<3> center, double r, int order)
1091
- : root(center, r, 0, order, asingmp->Kappa()), singmp(asingmp)
1092
- {
2044
+ RegularMLExpansion (shared_ptr<SingularMLExpansion<elem_type>> asingmp, Vec<3> center, double r,
2045
+ const FMM_Parameters & _params)
2046
+ : fmm_params(_params), root(center, r, 0, asingmp->Kappa(), fmm_params), singmp(asingmp)
2047
+ {
1093
2048
  if (!singmp->havemp) throw Exception("first call Calc for singular MP");
1094
-
2049
+ root.Allocate();
2050
+
1095
2051
  nodes_on_level = 0;
1096
2052
  nodes_on_level[0] = 1;
1097
2053
  {
1098
- static Timer t("mptool compute regular MLMP"); RegionTimer rg(t);
1099
- root.AddSingularNode(singmp->root, true);
2054
+ static Timer t("mptool compute regular MLMP"); RegionTimer rg(t);
2055
+ root.AddSingularNode(singmp->root, true, nullptr);
1100
2056
  // cout << "norm after S->R conversion: " << root.Norm() << endl;
1101
2057
  }
1102
2058
 
@@ -1117,39 +2073,163 @@ namespace ngfem
1117
2073
  }
1118
2074
  }
1119
2075
 
1120
- RegularMLMultiPole (Vec<3> center, double r, int order, double kappa)
1121
- : root(center, r, 0, order, kappa)
1122
- {
1123
- nodes_on_level = 0;
1124
- nodes_on_level[0] = 1;
1125
- }
1126
-
2076
+ RegularMLExpansion (Vec<3> center, double r, double kappa, const FMM_Parameters & _params)
2077
+ : fmm_params(_params), root(center, r, 0, kappa, fmm_params)
2078
+ {
2079
+ nodes_on_level = 0;
2080
+ nodes_on_level[0] = 1;
2081
+ }
2082
+
1127
2083
  void AddTarget (Vec<3> t)
1128
2084
  {
1129
2085
  root.AddTarget (t);
1130
2086
  }
1131
2087
 
1132
- void CalcMP(shared_ptr<SingularMLMultiPole<elem_type>> asingmp)
2088
+ void AddVolumeTarget (Vec<3> t, double r)
2089
+ {
2090
+ root.AddVolumeTarget (t, r);
2091
+ }
2092
+
2093
+ void CalcMP(shared_ptr<SingularMLExpansion<elem_type>> asingmp, bool onlytargets = true)
1133
2094
  {
2095
+ static Timer t("mptool regular MLMP"); RegionTimer rg(t);
2096
+ static Timer tremove("removeempty");
2097
+ static Timer trec("mptool regular MLMP - recording");
2098
+ static Timer tsort("mptool regular MLMP - sort");
2099
+
1134
2100
  singmp = asingmp;
1135
2101
 
2102
+
1136
2103
  root.CalcTotalTargets();
1137
- root.RemoveEmptyTrees();
1138
-
1139
- root.AddSingularNode(singmp->root, false);
2104
+ // cout << "before remove empty trees:" << endl;
2105
+ // PrintStatistics(cout);
1140
2106
 
2107
+ /*
2108
+ tremove.Start();
2109
+ if (onlytargets)
2110
+ root.RemoveEmptyTrees();
2111
+ tremove.Stop();
2112
+ */
2113
+
2114
+ root.AllocateMemory();
2115
+
2116
+ // cout << "after allocating regular:" << endl;
2117
+ // PrintStatistics(cout);
2118
+
2119
+ // cout << "starting S-R converion" << endl;
2120
+ // PrintStatistics(cout);
2121
+
2122
+
2123
+ if constexpr (false)
2124
+ {
2125
+ root.AddSingularNode(singmp->root, !onlytargets, nullptr);
2126
+ }
2127
+ else
2128
+ { // use recording
2129
+ Array<RecordingRS> recording;
2130
+ {
2131
+ RegionTimer rrec(trec);
2132
+ root.AddSingularNode(singmp->root, !onlytargets, &recording);
2133
+ }
2134
+
2135
+ // cout << "recorded: " << recording.Size() << endl;
2136
+ {
2137
+ RegionTimer reg(tsort);
2138
+ QuickSort (recording, [] (auto & a, auto & b)
2139
+ {
2140
+ if (a.len < (1-1e-8) * b.len) return true;
2141
+ if (a.len > (1+1e-8) * b.len) return false;
2142
+ return a.theta < b.theta;
2143
+ });
2144
+ }
2145
+
2146
+ double current_len = -1e100;
2147
+ double current_theta = -1e100;
2148
+ Array<RecordingRS*> current_batch;
2149
+ Array<Array<RecordingRS*>> batch_group;
2150
+ Array<double> group_lengths;
2151
+ Array<double> group_thetas;
2152
+ for (auto & record : recording)
2153
+ {
2154
+ bool len_changed = fabs(record.len - current_len) > 1e-8;
2155
+ bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
2156
+ if ((len_changed || theta_changed) && current_batch.Size() > 0) {
2157
+ // ProcessBatch(current_batch, current_len, current_theta);
2158
+ batch_group.Append(current_batch);
2159
+ group_lengths.Append(current_len);
2160
+ group_thetas.Append(current_theta);
2161
+ current_batch.SetSize(0);
2162
+ }
2163
+
2164
+ current_len = record.len;
2165
+ current_theta = record.theta;
2166
+ current_batch.Append(&record);
2167
+ }
2168
+ if (current_batch.Size() > 0) {
2169
+ // ProcessBatch(current_batch, current_len, current_theta);
2170
+ batch_group.Append(current_batch);
2171
+ group_lengths.Append(current_len);
2172
+ group_thetas.Append(current_theta);
2173
+ }
2174
+
2175
+ ParallelFor(batch_group.Size(), [&](int i) {
2176
+ ProcessBatchRS(batch_group[i], group_lengths[i], group_thetas[i]);
2177
+ }, TasksPerThread(4));
2178
+ }
2179
+
2180
+
1141
2181
  /*
1142
2182
  int maxlevel = 0;
1143
- for (auto [i,num] : Enumerate(RegularMLMultiPole::nodes_on_level))
2183
+ for (auto [i,num] : Enumerate(RegularMLExpansion::nodes_on_level))
1144
2184
  if (num > 0) maxlevel = i;
1145
2185
 
1146
2186
  for (int i = 0; i <= maxlevel; i++)
1147
- cout << "reg " << i << ": " << RegularMLMultiPole::nodes_on_level[i] << endl;
2187
+ cout << "reg " << i << ": " << RegularMLExpansion::nodes_on_level[i] << endl;
1148
2188
  */
1149
2189
 
1150
- root.LocalizeExpansion(false);
2190
+ // cout << "starting R-R converion" << endl;
2191
+ // PrintStatistics(cout);
2192
+
2193
+ static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
2194
+ root.LocalizeExpansion(!onlytargets);
2195
+
2196
+
2197
+ // cout << "R-R conversion done" << endl;
2198
+ // PrintStatistics(cout);
1151
2199
  }
1152
2200
 
2201
+ void PrintStatistics (ostream & ost)
2202
+ {
2203
+ int levels = 0;
2204
+ int cnt = 0;
2205
+ root.TraverseTree( [&](Node & node) {
2206
+ levels = max(levels, node.level);
2207
+ cnt++;
2208
+ });
2209
+ ost << "levels: " << levels << endl;
2210
+ ost << "nodes: " << cnt << endl;
2211
+
2212
+ Array<int> num_on_level(levels+1);
2213
+ Array<int> order_on_level(levels+1);
2214
+ Array<size_t> coefs_on_level(levels+1);
2215
+ num_on_level = 0;
2216
+ order_on_level = 0;
2217
+ root.TraverseTree( [&](Node & node) {
2218
+ num_on_level[node.level]++;
2219
+ order_on_level[node.level] = max(order_on_level[node.level],node.mp.Order());
2220
+ coefs_on_level[node.level] += node.mp.SH().Coefs().Size();
2221
+ });
2222
+
2223
+ cout << "num on level" << endl;
2224
+ for (int i = 0; i < num_on_level.Size(); i++)
2225
+ cout << i << ": " << num_on_level[i] << ", order = " << order_on_level[i] << ", coefs " << coefs_on_level[i] << endl;
2226
+
2227
+ size_t totcoefs = 0;
2228
+ for (auto n : coefs_on_level)
2229
+ totcoefs += n;
2230
+ cout << "total mem in coefs: " << sizeof(elem_type)*totcoefs / sqr(1024) << " MB" << endl;
2231
+ }
2232
+
1153
2233
  void Print (ostream & ost) const
1154
2234
  {
1155
2235
  root.Print(ost);
@@ -1168,7 +2248,10 @@ namespace ngfem
1168
2248
  elem_type Evaluate (Vec<3> p) const
1169
2249
  {
1170
2250
  // static Timer t("mptool Eval MLMP regular"); RegionTimer r(t);
1171
- if (L2Norm(p-root.center) > root.r) return elem_type{0.0};
2251
+ // if (L2Norm(p-root.center) > root.r) return elem_type{0.0};
2252
+
2253
+ if (MaxNorm(p-root.center) > root.r)
2254
+ return singmp->Evaluate(p);
1172
2255
  return root.Evaluate(p);
1173
2256
  }
1174
2257
 
@@ -1180,11 +2263,12 @@ namespace ngfem
1180
2263
 
1181
2264
  };
1182
2265
 
2266
+
1183
2267
  template <typename elem_type>
1184
- inline ostream & operator<< (ostream & ost, const RegularMLMultiPole<elem_type> & mlmp)
2268
+ inline ostream & operator<< (ostream & ost, const RegularMLExpansion<elem_type> & mlmp)
1185
2269
  {
1186
2270
  mlmp.Print(ost);
1187
- // ost << "RegularMLMultiPole" << endl;
2271
+ // ost << "RegularMLExpansion" << endl;
1188
2272
  return ost;
1189
2273
  }
1190
2274
 
@@ -1193,126 +2277,5 @@ namespace ngfem
1193
2277
 
1194
2278
 
1195
2279
 
1196
- // ******************** Coefficient Functions *********************
1197
-
1198
-
1199
- class SphericalHarmonicsCF : public CoefficientFunction
1200
- {
1201
- SphericalHarmonics<Complex> sh;
1202
- public:
1203
- SphericalHarmonicsCF (int order)
1204
- : CoefficientFunction(1, true), sh(order) { }
1205
- Complex & Coef(int n, int m) { return sh.Coef(n,m); }
1206
-
1207
- virtual double Evaluate (const BaseMappedIntegrationPoint & ip) const override
1208
- { throw Exception("real eval not available"); }
1209
-
1210
- virtual void Evaluate (const BaseMappedIntegrationPoint & mip, FlatVector<Complex> values) const override
1211
- {
1212
- values(0) = sh.Eval(mip.GetPoint());
1213
- }
1214
-
1215
- virtual void Evaluate (const BaseMappedIntegrationRule & ir, BareSliceMatrix<Complex> values) const override
1216
- {
1217
- for (int i = 0; i < ir.Size(); i++)
1218
- {
1219
- auto & mip = ir[i];
1220
- values(i,0) = sh.Eval(mip.GetPoint());
1221
- }
1222
- }
1223
-
1224
- auto & SH() { return sh; }
1225
- };
1226
-
1227
-
1228
-
1229
- template <typename RADIAL, typename entry_type=Complex>
1230
- class MultiPoleCF : public CoefficientFunction
1231
- {
1232
- MultiPole<RADIAL, entry_type> mp;
1233
- Vec<3> center;
1234
- public:
1235
- MultiPoleCF (int order, double kappa, Vec<3> acenter, double scale = 1)
1236
- : CoefficientFunction(sizeof(entry_type)/sizeof(Complex), true), mp(order, kappa, scale), center(acenter) { }
1237
-
1238
- entry_type & Coef(int n, int m) { return mp.Coef(n,m); }
1239
- auto & SH() { return mp.SH(); }
1240
- auto & MP() { return mp; }
1241
- Vec<3> Center() const { return center; }
1242
-
1243
- virtual double Evaluate (const BaseMappedIntegrationPoint & ip) const override
1244
- { throw Exception("real eval not available"); }
1245
-
1246
- virtual void Evaluate (const BaseMappedIntegrationPoint & mip, FlatVector<Complex> values) const override
1247
- {
1248
- if constexpr (std::is_same<entry_type, Complex>())
1249
- values(0) = mp.Eval(mip.GetPoint()-center);
1250
- else
1251
- values = mp.Eval(mip.GetPoint()-center);
1252
- }
1253
-
1254
- template <typename TARGET>
1255
- void ShiftZ (double z, MultiPole<TARGET, entry_type> & target) { mp.ShiftZ(z, target); }
1256
-
1257
- using CoefficientFunction::Transform;
1258
- template <typename TARGET>
1259
- void Transform (MultiPoleCF<TARGET, entry_type> & target)
1260
- {
1261
- mp.Transform (target.MP(), target.Center()-center);
1262
- }
1263
- };
1264
-
1265
- template <typename entry_type>
1266
- class SingularMLMultiPoleCF : public CoefficientFunction
1267
- {
1268
- shared_ptr<SingularMLMultiPole<entry_type>> mlmp;
1269
- public:
1270
- SingularMLMultiPoleCF (Vec<3> center, double r, int order, double kappa)
1271
- : CoefficientFunction(sizeof(entry_type)/sizeof(Complex), true), mlmp{make_shared<SingularMLMultiPole<entry_type>>(center, r, order, kappa)} { }
1272
-
1273
- virtual double Evaluate (const BaseMappedIntegrationPoint & ip) const override
1274
- { throw Exception("real eval not available"); }
1275
-
1276
- virtual void Evaluate (const BaseMappedIntegrationPoint & mip, FlatVector<Complex> values) const override
1277
- {
1278
- // values(0) = mlmp->Evaluate(mip.GetPoint());
1279
-
1280
- if constexpr (std::is_same<entry_type, Complex>())
1281
- values(0) = mlmp->Evaluate(mip.GetPoint());
1282
- else
1283
- values = mlmp->Evaluate(mip.GetPoint());
1284
-
1285
-
1286
- }
1287
-
1288
- shared_ptr<SingularMLMultiPole<entry_type>> MLMP() { return mlmp; }
1289
- };
1290
-
1291
-
1292
- template <typename entry_type>
1293
- class RegularMLMultiPoleCF : public CoefficientFunction
1294
- {
1295
- shared_ptr<RegularMLMultiPole<entry_type>> mlmp;
1296
- public:
1297
- RegularMLMultiPoleCF (shared_ptr<SingularMLMultiPoleCF<entry_type>> asingmp, Vec<3> center, double r, int order)
1298
- : CoefficientFunction(sizeof(entry_type)/sizeof(Complex), true), mlmp{make_shared<RegularMLMultiPole<entry_type>>(asingmp->MLMP(), center, r, order)} { }
1299
-
1300
- virtual double Evaluate (const BaseMappedIntegrationPoint & ip) const override
1301
- { throw Exception("real eval not available"); }
1302
-
1303
- virtual void Evaluate (const BaseMappedIntegrationPoint & mip, FlatVector<Complex> values) const override
1304
- {
1305
- // values(0) = mlmp->Evaluate(mip.GetPoint());
1306
-
1307
- if constexpr (std::is_same<entry_type, Complex>())
1308
- values(0) = mlmp->Evaluate(mip.GetPoint());
1309
- else
1310
- values = mlmp->Evaluate(mip.GetPoint());
1311
- }
1312
-
1313
- shared_ptr<RegularMLMultiPole<entry_type>> MLMP() { return mlmp; }
1314
- };
1315
-
1316
-
1317
2280
  }
1318
2281
  #endif