ngsolve 6.2.2505__cp311-cp311-macosx_10_15_universal2.whl → 6.2.2505.post94.dev0__cp311-cp311-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ngsolve might be problematic. Click here for more details.

Files changed (60)
  1. netgen/include/bilinearform.hpp +1 -1
  2. netgen/include/diffop_impl.hpp +3 -1
  3. netgen/include/fespace.hpp +4 -2
  4. netgen/include/gridfunction.hpp +1 -1
  5. netgen/include/h1amg.hpp +24 -1
  6. netgen/include/hcurlcurlfe.hpp +20 -0
  7. netgen/include/hdivhofespace.hpp +2 -0
  8. netgen/include/mptools.hpp +832 -97
  9. netgen/include/ngblas.hpp +113 -4
  10. netgen/include/recursive_pol.hpp +63 -11
  11. netgen/include/simd_complex.hpp +20 -0
  12. netgen/include/sparsematrix_dyn.hpp +2 -2
  13. netgen/include/sparsematrix_impl.hpp +25 -0
  14. netgen/include/vector.hpp +15 -2
  15. netgen/libngbla.dylib +0 -0
  16. netgen/libngcomp.dylib +0 -0
  17. netgen/libngfem.dylib +0 -0
  18. netgen/libngla.dylib +0 -0
  19. netgen/libngsbem.dylib +0 -0
  20. netgen/libngstd.dylib +0 -0
  21. ngsolve/cmake/NGSolveConfig.cmake +1 -1
  22. ngsolve/config/config.py +5 -5
  23. {ngsolve-6.2.2505.dist-info → ngsolve-6.2.2505.post94.dev0.dist-info}/METADATA +2 -2
  24. {ngsolve-6.2.2505.dist-info → ngsolve-6.2.2505.post94.dev0.dist-info}/RECORD +60 -60
  25. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/Netgen.icns +0 -0
  26. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/bin/ngscxx +0 -0
  27. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/bin/ngsld +0 -0
  28. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/bin/ngsolve.tcl +0 -0
  29. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/bin/ngspy +0 -0
  30. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/beam.geo +0 -0
  31. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/beam.vol +0 -0
  32. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/chip.in2d +0 -0
  33. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/chip.vol +0 -0
  34. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/coil.geo +0 -0
  35. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/coil.vol +0 -0
  36. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/coilshield.geo +0 -0
  37. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/coilshield.vol +0 -0
  38. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/cube.geo +0 -0
  39. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/cube.vol +0 -0
  40. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/d10_DGdoubleglazing.pde +0 -0
  41. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/d11_chip_nitsche.pde +0 -0
  42. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/d1_square.pde +0 -0
  43. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/d2_chip.pde +0 -0
  44. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/d3_helmholtz.pde +0 -0
  45. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/d4_cube.pde +0 -0
  46. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/d5_beam.pde +0 -0
  47. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/d6_shaft.pde +0 -0
  48. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/d7_coil.pde +0 -0
  49. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/d8_coilshield.pde +0 -0
  50. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/d9_hybridDG.pde +0 -0
  51. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/doubleglazing.in2d +0 -0
  52. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/doubleglazing.vol +0 -0
  53. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/piezo2d40round4.vol.gz +0 -0
  54. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/shaft.geo +0 -0
  55. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/shaft.vol +0 -0
  56. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/square.in2d +0 -0
  57. {ngsolve-6.2.2505.data → ngsolve-6.2.2505.post94.dev0.data}/data/share/ngsolve/square.vol +0 -0
  58. {ngsolve-6.2.2505.dist-info → ngsolve-6.2.2505.post94.dev0.dist-info}/LICENSE +0 -0
  59. {ngsolve-6.2.2505.dist-info → ngsolve-6.2.2505.post94.dev0.dist-info}/WHEEL +0 -0
  60. {ngsolve-6.2.2505.dist-info → ngsolve-6.2.2505.post94.dev0.dist-info}/top_level.txt +0 -0
@@ -20,6 +20,157 @@ namespace ngsbem
20
20
  {
21
21
  using namespace ngfem;
22
22
 
23
+ template<typename T>
24
+ constexpr int VecLength = 1; // Default: Complex has length 1
25
+
26
+ template<int N>
27
+ constexpr int VecLength<Vec<N, Complex>> = N; // Specialization: Vec<N,Complex> has length N
28
+
29
+
30
+
31
+ constexpr int FMM_SW = 4;
32
+
33
+
34
+ // ************************ SIMD - creation (should end up in simd.hpp) *************
35
+
36
+
37
+ template <int S, typename T, int SW>
38
+ Vec<S,T> HSum (Vec<S,SIMD<T,SW>> v)
39
+ {
40
+ Vec<S,T> res;
41
+ for (int i = 0; i < S; i++)
42
+ res(i) = HSum(v(i));
43
+ // Iterate<S> ([&](auto i) {
44
+ // res.HTData().template Elem<i.value>() = HSum(v.HTData().template Elem<i.value>());
45
+ // });
46
+ return res;
47
+ }
48
+
49
+
50
+ template <typename T, size_t S> class MakeSimdCl;
51
+
52
+ template <typename T, size_t S>
53
+ auto MakeSimd (array<T,S> aa) { return MakeSimdCl(aa).Get(); }
54
+
55
+
56
+ template <typename T, size_t S>
57
+ class MakeSimdCl
58
+ {
59
+ array<T,S> a;
60
+ public:
61
+ MakeSimdCl (array<T,S> aa) : a(aa) { ; }
62
+ auto Get() const
63
+ {
64
+ SIMD<T,S> sa( [this] (auto i) { return (this->a)[i]; });
65
+ return sa;
66
+ }
67
+ };
68
+
69
+
70
+ template <typename T, size_t S, int VS>
71
+ class MakeSimdCl<Vec<VS,T>,S>
72
+ {
73
+ array<Vec<VS,T>,S> a;
74
+ public:
75
+ MakeSimdCl (array<Vec<VS,T>,S> aa) : a(aa) { ; }
76
+
77
+ auto Get() const
78
+ {
79
+ array<T,S> ai;
80
+ Vec<VS, decltype(MakeSimd(ai))> res;
81
+ for (int i = 0; i < VS; i++)
82
+ {
83
+ for (int j = 0; j < S; j++)
84
+ ai[j] = a[j](i);
85
+ res(i) = MakeSimd(ai);
86
+ }
87
+ return res;
88
+ }
89
+ };
90
+
91
+
92
+
93
+ template <size_t S>
94
+ class MakeSimdCl<Complex,S>
95
+ {
96
+ array<Complex,S> a;
97
+ public:
98
+ MakeSimdCl (array<Complex,S> aa) : a(aa) { ; }
99
+ auto Get() const
100
+ {
101
+ array<double,S> ar, ai;
102
+ for (int j = 0; j < S; j++)
103
+ {
104
+ ar[j] = Real(a[j]);
105
+ ai[j] = Imag(a[j]);
106
+ }
107
+
108
+ return SIMD<Complex,S> (MakeSimd(ar), MakeSimd(ai));
109
+ }
110
+ };
111
+
112
+
113
+
114
+
115
+
116
+
117
+ template <typename Tfirst, size_t S, typename ...Trest>
118
+ class MakeSimdCl<std::tuple<Tfirst,Trest...>,S>
119
+ {
120
+ array<std::tuple<Tfirst,Trest...>,S> a;
121
+ public:
122
+ MakeSimdCl (array<std::tuple<Tfirst,Trest...>,S> aa) : a(aa) { ; }
123
+ auto Get() const
124
+ {
125
+ array<Tfirst,S> a0;
126
+ for (int i = 0; i < S; i++)
127
+ a0[i] = std::get<0> (a[i]);
128
+
129
+ if constexpr (std::tuple_size<tuple<Tfirst,Trest...>>::value == 1)
130
+ {
131
+ return tuple(MakeSimd(a0));
132
+ }
133
+ else
134
+ {
135
+ array<tuple<Trest...>,S> arest;
136
+ for (int i = 0; i < S; i++)
137
+ arest[i] = skip_first(a[i]);
138
+
139
+ return tuple_cat ( tuple (MakeSimd(a0)), MakeSimd(arest) );
140
+ }
141
+ }
142
+
143
+ template <typename... Ts>
144
+ static auto skip_first(const std::tuple<Ts...>& t) {
145
+ return std::apply([](auto first, auto... rest) {
146
+ return std::make_tuple(rest...);
147
+ }, t);
148
+ }
149
+ };
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+ inline std::tuple<double, double, double> SphericalCoordinates(Vec<3> dist){
161
+ double len, theta, phi;
162
+ len = L2Norm(dist);
163
+ if (len < 1e-30)
164
+ theta = 0;
165
+ else
166
+ theta = acos (dist(2) / len);
167
+ if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
168
+ phi = 0;
169
+ else
170
+ phi = atan2(dist(1), dist(0));
171
+ return {len, theta, phi};
172
+ }
173
+
23
174
 
24
175
  template <typename entry_type = Complex>
25
176
  class NGS_DLL_HEADER SphericalHarmonics
@@ -84,9 +235,69 @@ namespace ngsbem
84
235
 
85
236
  void Calc (Vec<3> x, FlatVector<Complex> shapes);
86
237
 
87
-
238
+
239
+ void FlipZ ();
88
240
  void RotateZ (double alpha);
89
- void RotateY (double alpha);
241
+
242
+ template <typename FUNC>
243
+ void RotateZ (double alpha, FUNC func) const
244
+ {
245
+ if (order < 0) return;
246
+
247
+ Vector<Complex> exp_imalpha(order+1);
248
+ Complex exp_ialpha(cos(alpha), sin(alpha));
249
+ Complex prod = 1.0;
250
+ for (int i = 0; i <= order; i++)
251
+ {
252
+ exp_imalpha(i) = prod;
253
+ prod *= exp_ialpha;
254
+ }
255
+
256
+ int ii = 0;
257
+ for (int n = 0; n <= order; n++)
258
+ {
259
+ for (int m = -n; m < 0; m++, ii++)
260
+ func(ii, conj(exp_imalpha(-m)));
261
+ for (int m = 0; m <= n; m++, ii++)
262
+ func(ii, exp_imalpha(m));
263
+ };
264
+ };
265
+
266
+ template <typename FUNC>
267
+ void RotateZFlip (double alpha, bool flip, FUNC func) const
268
+ {
269
+ if (order < 0) return;
270
+
271
+ Vector<Complex> exp_imalpha(order+1);
272
+ Complex exp_ialpha(cos(alpha), sin(alpha));
273
+ Complex prod = 1.0;
274
+ for (int i = 0; i <= order; i++)
275
+ {
276
+ exp_imalpha(i) = prod;
277
+ prod *= exp_ialpha;
278
+ }
279
+
280
+ int ii = 0;
281
+
282
+ auto FlipFactor = [] (int n, int m, bool flip)->double
283
+ {
284
+ if (flip)
285
+ return ((n-m)%2) == 1 ? -1 : 1;
286
+ return 1.0;
287
+ };
288
+
289
+ for (int n = 0; n <= order; n++)
290
+ {
291
+ for (int m = -n; m < 0; m++, ii++)
292
+ func(ii, FlipFactor(n,m,flip)*conj(exp_imalpha(-m)));
293
+ for (int m = 0; m <= n; m++, ii++)
294
+ func(ii, FlipFactor(n,m,flip)*exp_imalpha(m));
295
+ };
296
+ };
297
+
298
+
299
+
300
+ void RotateY (double alpha, bool parallel = false);
90
301
 
91
302
 
92
303
  static double CalcAmn (int m, int n)
@@ -119,11 +330,11 @@ namespace ngsbem
119
330
  // https://fortran-lang.discourse.group/t/looking-for-spherical-bessel-and-hankel-functions-of-first-and-second-kind-and-arbitrary-order/2308/2
120
331
  NGS_DLL_HEADER
121
332
  void besseljs3d (int nterms, double z, double scale,
122
- FlatVector<double> fjs, FlatVector<double> fjder);
333
+ SliceVector<double> fjs, SliceVector<double> fjder = FlatVector<double>(0, nullptr));
123
334
 
124
335
  NGS_DLL_HEADER
125
336
  void besseljs3d (int nterms, Complex z, double scale,
126
- FlatVector<Complex> fjs, FlatVector<Complex> fjder);
337
+ SliceVector<Complex> fjs, SliceVector<Complex> fjder = FlatVector<Complex>(0, nullptr));
127
338
 
128
339
 
129
340
  /*
@@ -142,14 +353,17 @@ namespace ngsbem
142
353
  FlatVector<double> jp,
143
354
  FlatVector<double> yp);
144
355
 
145
-
356
+
146
357
 
147
358
  template <typename T>
148
359
  void SphericalBessel (int n, double rho, double scale, T && values)
149
360
  {
361
+ besseljs3d (n, rho, scale, values);
362
+ /*
150
363
  Vector<double> j(n+1), jp(n+1);
151
364
  besseljs3d (n, rho, scale, j, jp);
152
365
  values = j;
366
+ */
153
367
  }
154
368
 
155
369
 
@@ -173,21 +387,6 @@ namespace ngsbem
173
387
  return;
174
388
  }
175
389
  Vector j(n+1), y(n+1), jp(n+1), yp(n+1);
176
- // SBESJY (rho, n, j, y, jp, yp);
177
-
178
- /*
179
- values = j + Complex(0,1) * y;
180
- if (scale != 1.0)
181
- {
182
- double prod = 1.0;
183
- for (int i = 0; i <= n; i++)
184
- {
185
- values(i) *= prod;
186
- prod *= scale;
187
- }
188
- }
189
- */
190
-
191
390
 
192
391
  // the bessel-evaluation with scale
193
392
  besseljs3d (n, rho, 1/scale, j, jp);
@@ -358,18 +557,7 @@ namespace ngsbem
358
557
  // static Timer t("mptool Transform "+ToString(typeid(RADIAL).name())+ToString(typeid(TARGET).name()));
359
558
  // RegionTimer reg(t);
360
559
 
361
- double len = L2Norm(dist);
362
- double theta, phi;
363
-
364
- if (len < 1e-30)
365
- theta = 0;
366
- else
367
- theta = acos (dist(2) / len);
368
-
369
- if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
370
- phi = 0;
371
- else
372
- phi = atan2(dist(1), dist(0));
560
+ auto [len, theta, phi] = SphericalCoordinates(dist);
373
561
 
374
562
 
375
563
  // MultiPole<RADIAL,entry_type> tmp{*this};
@@ -386,14 +574,18 @@ namespace ngsbem
386
574
  }
387
575
 
388
576
  template <typename TARGET>
389
- void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist) const
577
+ void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist, bool atomic = false) const
390
578
  {
391
579
  if (SH().Order() < 0) return;
392
580
  if (target.SH().Order() < 0) return;
393
581
 
394
582
  MultiPole<TARGET,entry_type> tmp{target};
395
583
  Transform(tmp, dist);
396
- target.SH().Coefs() += tmp.SH().Coefs();
584
+ if (!atomic)
585
+ target.SH().Coefs() += tmp.SH().Coefs();
586
+ else
587
+ for (int j = 0; j < target.SH().Coefs().Size(); j++)
588
+ AtomicAdd(target.SH().Coefs()[j], tmp.SH().Coefs()[j]);
397
589
  }
398
590
 
399
591
  template <typename TARGET>
@@ -412,11 +604,124 @@ namespace ngsbem
412
604
  static constexpr int maxdirect = 100;
413
605
 
414
606
 
607
+ template <typename SCAL, auto S>
608
+ inline auto VecVector2Matrix (FlatVector<Vec<S,SCAL>> vec)
609
+ {
610
+ return FlatMatrixFixWidth<S,SCAL> (vec.Size(), vec.Data()->Data());
611
+ }
612
+
613
+ inline auto VecVector2Matrix (FlatVector<Complex> vec)
614
+ {
615
+ return FlatMatrixFixWidth<1,Complex> (vec.Size(), vec.Data());
616
+ }
617
+
618
+
415
619
  template <typename entry_type=Complex>
416
620
  class SingularMLMultiPole
417
621
  {
622
+ using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
418
623
  static Array<size_t> nodes_on_level;
419
624
 
625
+ struct RecordingSS
626
+ {
627
+ const MultiPole<MPSingular,entry_type> * mp_source;
628
+ MultiPole<MPSingular,entry_type> * mp_target;
629
+ Vec<3> dist;
630
+ double len, theta, phi;
631
+ bool flipz;
632
+ public:
633
+ RecordingSS() = default;
634
+ RecordingSS (const MultiPole<MPSingular,entry_type> * amp_source,
635
+ MultiPole<MPSingular,entry_type> * amp_target,
636
+ Vec<3> adist)
637
+ : mp_source(amp_source), mp_target(amp_target), dist(adist)
638
+ {
639
+ std::tie(len, theta, phi) = SphericalCoordinates(adist);
640
+ // flipz = false;
641
+ flipz = theta > M_PI/2;
642
+ if (flipz) theta = M_PI-theta;
643
+ }
644
+ };
645
+
646
+
647
+ static void ProcessBatch(FlatArray<RecordingSS*> batch, double len, double theta) {
648
+ constexpr int vec_length = VecLength<entry_type>;
649
+ int batch_size = batch.Size();
650
+ int N = batch_size * vec_length;
651
+ // *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(entry_type).name() << ", len = " << len << ", theta = " << theta << endl;
652
+
653
+ if (N <= 1 || batch_size <= 1) {
654
+ for (auto* rec : batch) {
655
+ rec->mp_source->TransformAdd(*rec->mp_target, rec->dist, true);
656
+ }
657
+ }
658
+ else if (N <= 3) {
659
+ ProcessVectorizedBatch<3, vec_length>(batch, len, theta);
660
+ }
661
+ else if (N <= 4) {
662
+ ProcessVectorizedBatch<4, vec_length>(batch, len, theta);
663
+ }
664
+ else if (N <= 6) {
665
+ ProcessVectorizedBatch<6, vec_length>(batch, len, theta);
666
+ }
667
+ else if (N <= 12) {
668
+ ProcessVectorizedBatch<12, vec_length>(batch, len, theta);
669
+ }
670
+ else if (N <= 24) {
671
+ ProcessVectorizedBatch<24, vec_length>(batch, len, theta);
672
+ }
673
+ else if (N <= 48) {
674
+ ProcessVectorizedBatch<48, vec_length>(batch, len, theta);
675
+ }
676
+ else if (N <= 96) {
677
+ ProcessVectorizedBatch<96, vec_length>(batch, len, theta);
678
+ }
679
+ else if (N <= 192) {
680
+ ProcessVectorizedBatch<192, vec_length>(batch, len, theta);
681
+ }
682
+ else {
683
+ // Split large batches
684
+ ProcessBatch(batch.Range(0, 192 / vec_length), len, theta);
685
+ ProcessBatch(batch.Range(192 / vec_length, batch_size), len, theta);
686
+ }
687
+ }
688
+
689
+ template<int N, int vec_length>
690
+ static void ProcessVectorizedBatch(FlatArray<RecordingSS*> batch, double len, double theta) {
691
+
692
+ // *testout << "Processing vectorized S->S batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
693
+ MultiPole<MPSingular, Vec<N,Complex>> vec_source(batch[0]->mp_source->Order(), batch[0]->mp_source->Kappa(), batch[0]->mp_source->RTyp());
694
+ MultiPole<MPSingular, Vec<N,Complex>> vec_target(batch[0]->mp_target->Order(), batch[0]->mp_target->Kappa(), batch[0]->mp_target->RTyp());
695
+
696
+ // Copy multipoles into vectorized multipole
697
+ for (int i = 0; i < batch.Size(); i++)
698
+ {
699
+ auto source_i = VecVector2Matrix (batch[i]->mp_source->SH().Coefs());
700
+ auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
701
+ batch[i]->mp_source->SH().RotateZFlip(batch[i]->phi, batch[i]->flipz,
702
+ [source_i, source_mati] (size_t ii, Complex factor)
703
+ {
704
+ source_mati.Row(ii) = factor * source_i.Row(ii);
705
+ });
706
+ }
707
+
708
+ vec_source.SH().RotateY(theta, vec_source.SH().Order() >= 100);
709
+ vec_source.ShiftZ(-len, vec_target);
710
+ vec_target.SH().RotateY(-theta, vec_target.SH().Order() >= 100);
711
+
712
+ // Copy vectorized multipole into individual multipoles
713
+ for (int i = 0; i < batch.Size(); i++)
714
+ {
715
+ auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
716
+ auto target_mati = VecVector2Matrix (batch[i]->mp_target->SH().Coefs());
717
+ batch[i]->mp_target->SH().RotateZFlip(-batch[i]->phi, batch[i]->flipz,
718
+ [source_mati, target_mati] (size_t ii, Complex factor)
719
+ {
720
+ AtomicAdd (target_mati.Row(ii), factor * source_mati.Row(ii));
721
+ });
722
+ }
723
+ }
724
+
420
725
  struct Node
421
726
  {
422
727
  Vec<3> center;
@@ -428,7 +733,14 @@ namespace ngsbem
428
733
  Array<tuple<Vec<3>, entry_type>> charges;
429
734
  Array<tuple<Vec<3>, Vec<3>, entry_type>> dipoles;
430
735
  Array<tuple<Vec<3>, Vec<3>, Complex,int>> currents;
736
+
737
+ using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
738
+ Array<tuple<Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_charges;
739
+ Array<tuple<Vec<3,SIMD<double,FMM_SW>>, Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_dipoles;
740
+
431
741
  int total_sources;
742
+ std::mutex node_mutex;
743
+ atomic<bool> have_childs{false};
432
744
 
433
745
  Node (Vec<3> acenter, double ar, int alevel, double akappa)
434
746
  : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*akappa), akappa, ar) // min(1.0, ar*akappa))
@@ -449,12 +761,13 @@ namespace ngsbem
449
761
  cc(2) += (i&4) ? r/2 : -r/2;
450
762
  childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
451
763
  }
764
+ have_childs = true;
452
765
  }
453
766
 
454
767
 
455
768
  void AddCharge (Vec<3> x, entry_type c)
456
769
  {
457
- if (childs[0])
770
+ if (have_childs) // quick check without locking
458
771
  {
459
772
  // directly send to childs:
460
773
  int childnum = 0;
@@ -465,6 +778,21 @@ namespace ngsbem
465
778
  return;
466
779
  }
467
780
 
781
+ lock_guard<mutex> guard(node_mutex);
782
+
783
+ if (have_childs) // test again after locking
784
+ {
785
+ // directly send to childs:
786
+ int childnum = 0;
787
+ if (x(0) > center(0)) childnum += 1;
788
+ if (x(1) > center(1)) childnum += 2;
789
+ if (x(2) > center(2)) childnum += 4;
790
+ childs[childnum] -> AddCharge(x, c);
791
+ return;
792
+ }
793
+
794
+
795
+
468
796
  charges.Append( tuple{x,c} );
469
797
 
470
798
  // if (r*mp.Kappa() < 1e-8) return;
@@ -489,7 +817,7 @@ namespace ngsbem
489
817
 
490
818
  void AddDipole (Vec<3> x, Vec<3> d, entry_type c)
491
819
  {
492
- if (childs[0])
820
+ if (have_childs)
493
821
  {
494
822
  // directly send to childs:
495
823
 
@@ -501,6 +829,23 @@ namespace ngsbem
501
829
  return;
502
830
  }
503
831
 
832
+ lock_guard<mutex> guard(node_mutex);
833
+
834
+ if (have_childs)
835
+ {
836
+ // directly send to childs:
837
+
838
+ int childnum = 0;
839
+ if (x(0) > center(0)) childnum += 1;
840
+ if (x(1) > center(1)) childnum += 2;
841
+ if (x(2) > center(2)) childnum += 4;
842
+ childs[childnum] -> AddDipole(x, d, c);
843
+ return;
844
+ }
845
+
846
+
847
+
848
+
504
849
  dipoles.Append (tuple{x,d,c});
505
850
 
506
851
  if (dipoles.Size() < maxdirect || r < 1e-8)
@@ -520,6 +865,7 @@ namespace ngsbem
520
865
  currents.SetSize0();
521
866
  }
522
867
 
868
+ // not parallel yet
523
869
  void AddCurrent (Vec<3> sp, Vec<3> ep, Complex j, int num)
524
870
  {
525
871
  if (childs[0])
@@ -549,7 +895,7 @@ namespace ngsbem
549
895
  }
550
896
  return;
551
897
  }
552
-
898
+
553
899
  currents.Append (tuple{sp,ep,j,num});
554
900
 
555
901
  // if (currents.Size() < maxdirect || r < 1e-8)
@@ -583,26 +929,74 @@ namespace ngsbem
583
929
  return sum;
584
930
  }
585
931
 
586
- // static Timer t("fmm direct eval"); RegionTimer reg(t);
587
- if (mp.Kappa() < 1e-8)
932
+ {
933
+ // static Timer t("fmm direct eval"); RegionTimer reg(t);
934
+ // t.AddFlops (charges.Size());
935
+ if (simd_charges.Size())
936
+ {
937
+ simd_entry_type vsum{0.0};
938
+ if (mp.Kappa() < 1e-8)
939
+ for (auto [x,c] : simd_charges)
940
+ {
941
+ auto rho = L2Norm(p-x);
942
+ auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW> (1,rho*mp.Kappa()) / rho;
943
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
944
+ vsum += kernel * c;
945
+ }
946
+ else
947
+ for (auto [x,c] : simd_charges)
948
+ {
949
+ auto rho = L2Norm(p-x);
950
+ auto [si,co] = sincos(rho*mp.Kappa());
951
+ auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) / rho;
952
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
953
+ vsum += kernel * c;
954
+ }
955
+
956
+ sum += HSum(vsum);
957
+ }
958
+ else
959
+ {
960
+ if (mp.Kappa() < 1e-8)
961
+ {
962
+ for (auto [x,c] : charges)
963
+ if (double rho = L2Norm(p-x); rho > 0)
964
+ sum += (1/(4*M_PI))*Complex(1,rho*mp.Kappa()) / rho * c;
965
+ }
966
+ else
967
+ for (auto [x,c] : charges)
968
+ if (double rho = L2Norm(p-x); rho > 0)
969
+ sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
970
+ }
971
+ }
972
+
973
+ if (simd_dipoles.Size())
974
+ {
975
+ simd_entry_type vsum{0.0};
976
+ for (auto [x,d,c] : simd_dipoles)
588
977
  {
589
- for (auto [x,c] : charges)
590
- if (double rho = L2Norm(p-x); rho > 0)
591
- sum += (1/(4*M_PI))*Complex(1,rho*mp.Kappa()) / rho * c;
978
+ auto rho = L2Norm(p-x);
979
+ auto drhodp = (1.0/rho) * (p-x);
980
+ auto [si,co] = sincos(rho*mp.Kappa());
981
+ auto dGdrho = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) *
982
+ (-1.0/(rho*rho) + SIMD<Complex,FMM_SW>(0, mp.Kappa())/rho);
983
+ auto kernel = dGdrho * InnerProduct(drhodp, d);
984
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
985
+ vsum += kernel * c;
592
986
  }
987
+ sum += HSum(vsum);
988
+ }
593
989
  else
594
- for (auto [x,c] : charges)
595
- if (double rho = L2Norm(p-x); rho > 0)
596
- sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
597
-
598
- for (auto [x,d,c] : dipoles)
990
+ {
991
+ for (auto [x,d,c] : dipoles)
599
992
  if (double rho = L2Norm(p-x); rho > 0)
600
- {
993
+ {
601
994
  Vec<3> drhodp = 1.0/rho * (p-x);
602
995
  Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
603
- (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
996
+ (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
604
997
  sum += dGdrho * InnerProduct(drhodp, d) * c;
605
- }
998
+ }
999
+ }
606
1000
 
607
1001
  for (auto [sp,ep,j,num] : currents)
608
1002
  {
@@ -664,23 +1058,27 @@ namespace ngsbem
664
1058
  }
665
1059
  }
666
1060
 
667
- void CalcMP()
1061
+ void CalcMP(Array<RecordingSS> * recording, Array<Node*> * nodes_to_process)
668
1062
  {
669
- mp.SH().Coefs() = 0.0;
1063
+ // mp.SH().Coefs() = 0.0;
670
1064
  if (childs[0])
671
1065
  {
672
- if (total_sources < 1000)
1066
+ if (total_sources < 1000 || recording)
673
1067
  for (auto & child : childs)
674
- child->CalcMP();
1068
+ child->CalcMP(recording, nodes_to_process);
675
1069
  else
676
1070
  ParallelFor (8, [&] (int nr)
677
1071
  {
678
- childs[nr] -> CalcMP();
1072
+ childs[nr] -> CalcMP(recording, nodes_to_process);
679
1073
  });
680
1074
 
681
1075
 
682
- for (auto & child : childs)
683
- child->mp.TransformAdd(mp, center-child->center);
1076
+ for (auto & child : childs){
1077
+ if (recording && child->mp.SH().Coefs().Size() > 0)
1078
+ *recording += RecordingSS(&child->mp, &mp, center-child->center);
1079
+ else
1080
+ child->mp.TransformAdd(mp, center-child->center);
1081
+ }
684
1082
  }
685
1083
  else
686
1084
  {
@@ -690,14 +1088,54 @@ namespace ngsbem
690
1088
  return;
691
1089
  }
692
1090
 
693
- for (auto [x,c] : charges)
694
- mp.AddCharge (x-center,c);
1091
+ // make simd charges, comment this block for testing ...
1092
+ simd_charges.SetSize( (charges.Size()+FMM_SW-1)/FMM_SW);
1093
+ size_t i = 0, ii = 0;
1094
+ for ( ; i+FMM_SW <= charges.Size(); i+=FMM_SW, ii++)
1095
+ {
1096
+ std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
1097
+ for (int j = 0; j < FMM_SW; j++) ca[j] = charges[i+j];
1098
+ simd_charges[ii] = MakeSimd(ca);
1099
+ }
1100
+ if (i < charges.Size())
1101
+ {
1102
+ std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
1103
+ int j = 0;
1104
+ for ( ; i+j < charges.Size(); j++) ca[j] = charges[i+j];
1105
+ for ( ; j < FMM_SW; j++) ca[j] = tuple( get<0>(ca[0]), entry_type{0.0} );
1106
+ simd_charges[ii] = MakeSimd(ca);
1107
+ }
1108
+
1109
+ simd_dipoles.SetSize( (dipoles.Size()+FMM_SW-1)/FMM_SW);
1110
+ i = 0, ii = 0;
1111
+ for ( ; i+FMM_SW <= dipoles.Size(); i+=FMM_SW, ii++)
1112
+ {
1113
+ std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
1114
+ for (int j = 0; j < FMM_SW; j++) di[j] = dipoles[i+j];
1115
+ simd_dipoles[ii] = MakeSimd(di);
1116
+ }
1117
+ if (i < dipoles.Size())
1118
+ {
1119
+ std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
1120
+ int j = 0;
1121
+ for ( ; i+j < dipoles.Size(); j++) di[j] = dipoles[i+j];
1122
+ for ( ; j < FMM_SW; j++) di[j] = tuple( get<0>(di[0]), get<1>(di[0]), entry_type{0.0} );
1123
+ simd_dipoles[ii] = MakeSimd(di);
1124
+ }
1125
+
695
1126
 
696
- for (auto [x,d,c] : dipoles)
697
- mp.AddDipole (x-center, d, c);
1127
+ if (nodes_to_process)
1128
+ *nodes_to_process += this;
1129
+ else {
1130
+ for (auto [x,c] : charges)
1131
+ mp.AddCharge (x-center,c);
1132
+
1133
+ for (auto [x,d,c] : dipoles)
1134
+ mp.AddDipole (x-center, d, c);
698
1135
 
699
- for (auto [sp,ep,j,num] : currents)
700
- mp.AddCurrent (sp-center, ep-center, j, num);
1136
+ for (auto [sp,ep,j,num] : currents)
1137
+ mp.AddCurrent (sp-center, ep-center, j, num);
1138
+ }
701
1139
  }
702
1140
  }
703
1141
 
@@ -836,6 +1274,10 @@ namespace ngsbem
836
1274
  void CalcMP()
837
1275
  {
838
1276
  static Timer t("mptool compute singular MLMP"); RegionTimer rg(t);
1277
+ static Timer ts2mp("mptool compute singular MLMP - source2mp");
1278
+ static Timer tS2S("mptool compute singular MLMP - S->S");
1279
+ static Timer trec("mptool comput singular recording");
1280
+ static Timer tsort("mptool comput singular sort");
839
1281
 
840
1282
  /*
841
1283
  int maxlevel = 0;
@@ -847,7 +1289,87 @@ namespace ngsbem
847
1289
  */
848
1290
 
849
1291
  root.CalcTotalSources();
850
- root.CalcMP();
1292
+
1293
+ if (false)
1294
+ // direct evaluation of S->S
1295
+ root.CalcMP(nullptr, nullptr);
1296
+ else
1297
+ {
1298
+
1299
+ Array<RecordingSS> recording;
1300
+ Array<Node*> nodes_to_process;
1301
+
1302
+ {
1303
+ RegionTimer reg(trec);
1304
+ root.CalcMP(&recording, &nodes_to_process);
1305
+ }
1306
+
1307
+ {
1308
+ RegionTimer rs2mp(ts2mp);
1309
+ ParallelFor(nodes_to_process.Size(), [&](int i){
1310
+ auto node = nodes_to_process[i];
1311
+ for (auto [x,c]: node->charges)
1312
+ node->mp.AddCharge(x-node->center, c);
1313
+ for (auto [x,d,c]: node->dipoles)
1314
+ node->mp.AddDipole(x-node->center, d, c);
1315
+ for (auto [sp,ep,j,num]: node->currents)
1316
+ node->mp.AddCurrent(sp-node->center, ep-node->center, j, num);
1317
+ }, TasksPerThread(4));
1318
+ }
1319
+
1320
+ {
1321
+ RegionTimer reg(tsort);
1322
+ QuickSort (recording, [] (auto & a, auto & b)
1323
+ {
1324
+ if (a.len < (1-1e-8) * b.len) return true;
1325
+ if (a.len > (1+1e-8) * b.len) return false;
1326
+ return a.theta < b.theta;
1327
+ });
1328
+ }
1329
+
1330
+ double current_len = -1e100;
1331
+ double current_theta = -1e100;
1332
+ Array<RecordingSS*> current_batch;
1333
+ Array<Array<RecordingSS*>> batch_group;
1334
+ Array<double> group_lengths;
1335
+ Array<double> group_thetas;
1336
+ for (auto & record : recording)
1337
+ {
1338
+ bool len_changed = fabs(record.len - current_len) > 1e-8;
1339
+ bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
1340
+ if ((len_changed || theta_changed) && current_batch.Size() > 0) {
1341
+ batch_group.Append(current_batch);
1342
+ group_lengths.Append(current_len);
1343
+ group_thetas.Append(current_theta);
1344
+ current_batch.SetSize(0);
1345
+ }
1346
+
1347
+ current_len = record.len;
1348
+ current_theta = record.theta;
1349
+ current_batch.Append(&record);
1350
+ }
1351
+ if (current_batch.Size() > 0) {
1352
+ batch_group.Append(current_batch);
1353
+ group_lengths.Append(current_len);
1354
+ group_thetas.Append(current_theta);
1355
+ }
1356
+
1357
+ {
1358
+ RegionTimer rS2S(tS2S);
1359
+ // ParallelFor(batch_group.Size(), [&](int i) {
1360
+ for (int i = 0; i < batch_group.Size(); i++){
1361
+ // *testout << "Processing batch " << i << " of size " << batch_group[i].Size() << ", with len = " << group_lengths[i] << ", theta = " << group_thetas[i] << endl;
1362
+ int chunk_size = 24;
1363
+ if (batch_group[i].Size() < chunk_size)
1364
+ ProcessBatch(batch_group[i], group_lengths[i], group_thetas[i]);
1365
+ else
1366
+ ParallelForRange(IntRange(batch_group[i].Size()), [&](IntRange range) {
1367
+ auto sub_batch = batch_group[i].Range(range.First(), range.Next());
1368
+ ProcessBatch(sub_batch, group_lengths[i], group_thetas[i]);
1369
+ }, TasksPerThread(4));
1370
+ }
1371
+ }
1372
+ }
851
1373
 
852
1374
  havemp = true;
853
1375
  }
@@ -877,6 +1399,143 @@ namespace ngsbem
877
1399
  class NGS_DLL_HEADER RegularMLMultiPole
878
1400
  {
879
1401
  static Array<size_t> nodes_on_level;
1402
+
1403
+
1404
+ struct RecordingRS
1405
+ {
1406
+ const MultiPole<MPSingular,elem_type> * mpS;
1407
+ MultiPole<MPRegular,elem_type> * mpR;
1408
+ Vec<3> dist;
1409
+ double len, theta, phi;
1410
+ public:
1411
+ RecordingRS() = default;
1412
+ RecordingRS (const MultiPole<MPSingular,elem_type> * ampS,
1413
+ MultiPole<MPRegular,elem_type> * ampR,
1414
+ Vec<3> adist)
1415
+ : mpS(ampS), mpR(ampR), dist(adist)
1416
+ {
1417
+ std::tie(len, theta, phi) = SphericalCoordinates(dist);
1418
+ }
1419
+ };
1420
+
1421
+ static void ProcessBatchRS(FlatArray<RecordingRS*> batch, double len, double theta) {
1422
+ // static Timer t("ProcessBatchRS"); RegionTimer reg(t, batch.Size());
1423
+ constexpr int vec_length = VecLength<elem_type>;
1424
+ int batch_size = batch.Size();
1425
+ int N = batch_size * vec_length;
1426
+ // *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(elem_type).name() << ", len = " << len << ", theta = " << theta << endl;
1427
+
1428
+ if (N <= 1 || batch_size <= 1) {
1429
+ for (auto* rec : batch) {
1430
+ rec->mpS->TransformAdd(*rec->mpR, rec->dist);
1431
+ }
1432
+ }
1433
+ else if (N <= 3) {
1434
+ ProcessVectorizedBatch<3, vec_length>(batch, len, theta);
1435
+ }
1436
+ else if (N <= 4) {
1437
+ ProcessVectorizedBatch<4, vec_length>(batch, len, theta);
1438
+ }
1439
+ else if (N <= 6) {
1440
+ ProcessVectorizedBatch<6, vec_length>(batch, len, theta);
1441
+ }
1442
+ else if (N <= 12) {
1443
+ ProcessVectorizedBatch<12, vec_length>(batch, len, theta);
1444
+ }
1445
+ else if (N <= 24) {
1446
+ ProcessVectorizedBatch<24, vec_length>(batch, len, theta);
1447
+ }
1448
+ else if (N <= 48) {
1449
+ ProcessVectorizedBatch<48, vec_length>(batch, len, theta);
1450
+ }
1451
+ else if (N <= 96) {
1452
+ ProcessVectorizedBatch<96, vec_length>(batch, len, theta);
1453
+ }
1454
+ else if (N <= 192) {
1455
+ ProcessVectorizedBatch<192, vec_length>(batch, len, theta);
1456
+ }
1457
+ else {
1458
+ // Split large batches
1459
+ /*
1460
+ ProcessBatch(batch.Range(0, 192 / vec_length), len, theta);
1461
+ ProcessBatch(batch.Range(192 / vec_length, batch_size), len, theta);
1462
+ */
1463
+
1464
+ /*
1465
+ ParallelFor (2, [&] (int i)
1466
+ {
1467
+ if (i == 0)
1468
+ ProcessBatchRS(batch.Range(0, 192 / vec_length), len, theta);
1469
+ else
1470
+ ProcessBatchRS(batch.Range(192 / vec_length, batch_size), len, theta);
1471
+ }, 2);
1472
+ */
1473
+
1474
+
1475
+ size_t chunksize = 192/vec_length;
1476
+ size_t num = (batch.Size()+chunksize-1) / chunksize;
1477
+ ParallelFor (num, [&](int i)
1478
+ {
1479
+ ProcessBatchRS(batch.Range(i*chunksize, min((i+1)*chunksize, batch.Size())), len, theta);
1480
+ }, num);
1481
+
1482
+ }
1483
+ }
1484
+
1485
+
1486
+ template<int N, int vec_length>
1487
+ static void ProcessVectorizedBatch(FlatArray<RecordingRS*> batch, double len, double theta) {
1488
+
1489
+ // static Timer t("ProcessVectorizedBatch, N = "+ToString(N) + ", vec_len = " + ToString(vec_length));
1490
+ // RegionTimer reg(t, batch[0]->mpS->SH().Order());
1491
+ // static Timer ttobatch("mptools - copy to batch 2");
1492
+ // static Timer tfrombatch("mptools - copy from batch 2");
1493
+
1494
+ // *testout << "Processing vectorized batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
1495
+ MultiPole<MPSingular, Vec<N,Complex>> vec_source(batch[0]->mpS->Order(), batch[0]->mpS->Kappa(), batch[0]->mpS->RTyp());
1496
+ // MultiPole<MPSingular, elem_type> tmp_source{*batch[0]->mpS};
1497
+ MultiPole<MPRegular, elem_type> tmp_target{*batch[0]->mpR};
1498
+ MultiPole<MPRegular, Vec<N,Complex>> vec_target(batch[0]->mpR->Order(), batch[0]->mpR->Kappa(), batch[0]->mpR->RTyp());
1499
+
1500
+ // Copy multipoles into vectorized multipole
1501
+ // ttobatch.Start();
1502
+ for (int i = 0; i < batch.Size(); i++)
1503
+ {
1504
+ auto source_i = VecVector2Matrix (batch[i]->mpS->SH().Coefs());
1505
+ auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
1506
+ batch[i]->mpS->SH().RotateZ(batch[i]->phi,
1507
+ [source_i, source_mati] (size_t ii, Complex factor)
1508
+ {
1509
+ source_mati.Row(ii) = factor * source_i.Row(ii);
1510
+ });
1511
+ }
1512
+
1513
+ // ttobatch.Stop();
1514
+
1515
+ vec_source.SH().RotateY(theta);
1516
+ vec_source.ShiftZ(-len, vec_target);
1517
+ vec_target.SH().RotateY(-theta);
1518
+
1519
+ // Copy vectorized multipole into individual multipoles
1520
+ // tfrombatch.Start();
1521
+ for (int i = 0; i < batch.Size(); i++) {
1522
+ // auto source_i = VecVector2Matrix (tmp_target.SH().Coefs());
1523
+ auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
1524
+ auto targeti = VecVector2Matrix(batch[i]->mpR->SH().Coefs());
1525
+
1526
+ tmp_target.SH().RotateZ(-batch[i]->phi,
1527
+ [source_mati, targeti] (size_t ii, Complex factor)
1528
+ {
1529
+ // source_i.Row(ii) = factor * source_mati.Row(ii);
1530
+ AtomicAdd (VectorView(targeti.Row(ii)), factor * source_mati.Row(ii));
1531
+ });
1532
+ // for (int j = 0; j < tmp_target.SH().Coefs().Size(); j++)
1533
+ // AtomicAdd(batch[i]->mpR->SH().Coefs()[j], tmp_target.SH().Coefs()[j]);
1534
+ }
1535
+ // tfrombatch.Stop();
1536
+
1537
+ }
1538
+
880
1539
 
881
1540
  struct Node
882
1541
  {
@@ -887,6 +1546,8 @@ namespace ngsbem
887
1546
  MultiPole<MPRegular,elem_type> mp;
888
1547
  Array<Vec<3>> targets;
889
1548
  int total_targets;
1549
+ std::mutex node_mutex;
1550
+ atomic<bool> have_childs{false};
890
1551
 
891
1552
  Array<const typename SingularMLMultiPole<elem_type>::Node*> singnodes;
892
1553
 
@@ -911,13 +1572,15 @@ namespace ngsbem
911
1572
  cc(2) += (i&4) ? r/2 : -r/2;
912
1573
  childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
913
1574
  }
1575
+ have_childs = true;
914
1576
  }
915
-
916
- void AddSingularNode (const typename SingularMLMultiPole<elem_type>::Node & singnode, bool allow_refine)
1577
+
1578
+ void AddSingularNode (const typename SingularMLMultiPole<elem_type>::Node & singnode, bool allow_refine,
1579
+ Array<RecordingRS> * recording)
917
1580
  {
918
1581
  if (mp.SH().Order() < 0) return;
919
1582
  if (singnode.mp.SH().Order() < 0) return;
920
- if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
1583
+ // if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
921
1584
  if (level > 20)
922
1585
  {
923
1586
  singnodes.Append(&singnode);
@@ -936,12 +1599,15 @@ namespace ngsbem
936
1599
  singnode.childs[0]->mp.Order() < singnode.mp.Order())
937
1600
  {
938
1601
  for (auto & child : singnode.childs)
939
- AddSingularNode (*child, allow_refine);
1602
+ AddSingularNode (*child, allow_refine, recording);
940
1603
  return;
941
1604
  }
942
1605
 
943
1606
  // static Timer t("mptool transform Helmholtz-criterion"); RegionTimer r(t);
944
- singnode.mp.TransformAdd(mp, dist);
1607
+ if (recording)
1608
+ *recording += RecordingRS(&singnode.mp, &mp, dist);
1609
+ else
1610
+ singnode.mp.TransformAdd(mp, dist);
945
1611
  return;
946
1612
  }
947
1613
 
@@ -960,21 +1626,21 @@ namespace ngsbem
960
1626
  CreateChilds();
961
1627
 
962
1628
  for (auto & ch : childs)
963
- ch -> AddSingularNode (singnode, allow_refine);
1629
+ ch -> AddSingularNode (singnode, allow_refine, recording);
964
1630
  }
965
1631
  else
966
1632
  {
967
- if (total_targets < 1000)
1633
+ if (total_targets < 1000 || recording)
968
1634
  {
969
1635
  for (auto & ch : childs)
970
1636
  if (ch)
971
- ch -> AddSingularNode (singnode, allow_refine);
1637
+ ch -> AddSingularNode (singnode, allow_refine, recording);
972
1638
  }
973
1639
  else
974
1640
  ParallelFor (8, [&] (int nr)
975
1641
  {
976
1642
  if (childs[nr])
977
- childs[nr] -> AddSingularNode (singnode, allow_refine);
1643
+ childs[nr] -> AddSingularNode (singnode, allow_refine, recording);
978
1644
  });
979
1645
 
980
1646
  if (targets.Size())
@@ -984,7 +1650,7 @@ namespace ngsbem
984
1650
  else
985
1651
  {
986
1652
  for (auto & childsing : singnode.childs)
987
- AddSingularNode (*childsing, allow_refine);
1653
+ AddSingularNode (*childsing, allow_refine, recording);
988
1654
  }
989
1655
  }
990
1656
 
@@ -996,12 +1662,22 @@ namespace ngsbem
996
1662
 
997
1663
  if (childs[0])
998
1664
  {
999
- for (auto & ch : childs)
1665
+ if (total_targets < 1000)
1000
1666
  {
1001
- if (L2Norm(mp.SH().Coefs()) > 0)
1002
- mp.TransformAdd (ch->mp, ch->center-center);
1003
- ch->LocalizeExpansion(allow_refine);
1667
+ for (int nr = 0; nr < 8; nr++)
1668
+ {
1669
+ if (L2Norm(mp.SH().Coefs()) > 0)
1670
+ mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
1671
+ childs[nr]->LocalizeExpansion(allow_refine);
1672
+ }
1004
1673
  }
1674
+ else
1675
+ ParallelFor(8, [&] (int nr)
1676
+ {
1677
+ if (L2Norm(mp.SH().Coefs()) > 0)
1678
+ mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
1679
+ childs[nr]->LocalizeExpansion(allow_refine);
1680
+ });
1005
1681
  mp = MultiPole<MPRegular,elem_type>(-1, mp.Kappa(), 1.);
1006
1682
  //mp.SH().Coefs()=0.0;
1007
1683
  }
@@ -1009,18 +1685,8 @@ namespace ngsbem
1009
1685
 
1010
1686
  elem_type Evaluate (Vec<3> p) const
1011
1687
  {
1012
- // *testout << "eval p = " << p << ", level = " << level << ", center = " << center << ", r = " << r << endl;
1013
1688
  elem_type sum{0.0};
1014
- /*
1015
- if (childs[0])
1016
- {
1017
- int childnum = 0;
1018
- if (p(0) > center(0)) childnum += 1;
1019
- if (p(1) > center(1)) childnum += 2;
1020
- if (p(2) > center(2)) childnum += 4;
1021
- sum = childs[childnum]->Evaluate(p);
1022
- }
1023
- */
1689
+
1024
1690
  int childnum = 0;
1025
1691
  if (p(0) > center(0)) childnum += 1;
1026
1692
  if (p(1) > center(1)) childnum += 2;
@@ -1030,8 +1696,6 @@ namespace ngsbem
1030
1696
  else
1031
1697
  sum = mp.Eval(p-center);
1032
1698
 
1033
-
1034
- // static Timer t("mptool direct evaluate"); RegionTimer r(t);
1035
1699
  for (auto sn : singnodes)
1036
1700
  sum += sn->EvaluateMP(p);
1037
1701
 
@@ -1080,7 +1744,8 @@ namespace ngsbem
1080
1744
 
1081
1745
  void AddTarget (Vec<3> x)
1082
1746
  {
1083
- if (childs[0])
1747
+ // if (childs[0])
1748
+ if (have_childs) // quick check without locking
1084
1749
  {
1085
1750
  // directly send to childs:
1086
1751
  int childnum = 0;
@@ -1091,6 +1756,20 @@ namespace ngsbem
1091
1756
  return;
1092
1757
  }
1093
1758
 
1759
+ lock_guard<mutex> guard(node_mutex);
1760
+
1761
+ if (have_childs) // test again after locking
1762
+ {
1763
+ // directly send to childs:
1764
+ int childnum = 0;
1765
+ if (x(0) > center(0)) childnum += 1;
1766
+ if (x(1) > center(1)) childnum += 2;
1767
+ if (x(2) > center(2)) childnum += 4;
1768
+ childs[childnum] -> AddTarget(x);
1769
+ return;
1770
+ }
1771
+
1772
+
1094
1773
  targets.Append( x );
1095
1774
 
1096
1775
  // if (r*mp.Kappa() < 1e-8) return;
@@ -1158,8 +1837,8 @@ namespace ngsbem
1158
1837
  nodes_on_level = 0;
1159
1838
  nodes_on_level[0] = 1;
1160
1839
  {
1161
- static Timer t("mptool compute regular MLMP"); RegionTimer rg(t);
1162
- root.AddSingularNode(singmp->root, true);
1840
+ static Timer t("mptool compute regular MLMP"); RegionTimer rg(t);
1841
+ root.AddSingularNode(singmp->root, true, nullptr);
1163
1842
  // cout << "norm after S->R conversion: " << root.Norm() << endl;
1164
1843
  }
1165
1844
 
@@ -1195,14 +1874,69 @@ namespace ngsbem
1195
1874
  void CalcMP(shared_ptr<SingularMLMultiPole<elem_type>> asingmp)
1196
1875
  {
1197
1876
  static Timer t("mptool regular MLMP"); RegionTimer rg(t);
1877
+ static Timer trec("mptool regular MLMP - recording");
1878
+ static Timer tsort("mptool regular MLMP - sort");
1198
1879
 
1199
1880
  singmp = asingmp;
1200
1881
 
1201
1882
  root.CalcTotalTargets();
1202
1883
  root.RemoveEmptyTrees();
1203
-
1204
- root.AddSingularNode(singmp->root, false);
1205
1884
 
1885
+
1886
+ // root.AddSingularNode(singmp->root, false, nullptr);
1887
+ // /*
1888
+ Array<RecordingRS> recording;
1889
+ {
1890
+ RegionTimer rrec(trec);
1891
+ root.AddSingularNode(singmp->root, false, &recording);
1892
+ }
1893
+
1894
+ // cout << "recorded: " << recording.Size() << endl;
1895
+ {
1896
+ RegionTimer reg(tsort);
1897
+ QuickSort (recording, [] (auto & a, auto & b)
1898
+ {
1899
+ if (a.len < (1-1e-8) * b.len) return true;
1900
+ if (a.len > (1+1e-8) * b.len) return false;
1901
+ return a.theta < b.theta;
1902
+ });
1903
+ }
1904
+
1905
+ double current_len = -1e100;
1906
+ double current_theta = -1e100;
1907
+ Array<RecordingRS*> current_batch;
1908
+ Array<Array<RecordingRS*>> batch_group;
1909
+ Array<double> group_lengths;
1910
+ Array<double> group_thetas;
1911
+ for (auto & record : recording)
1912
+ {
1913
+ bool len_changed = fabs(record.len - current_len) > 1e-8;
1914
+ bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
1915
+ if ((len_changed || theta_changed) && current_batch.Size() > 0) {
1916
+ // ProcessBatch(current_batch, current_len, current_theta);
1917
+ batch_group.Append(current_batch);
1918
+ group_lengths.Append(current_len);
1919
+ group_thetas.Append(current_theta);
1920
+ current_batch.SetSize(0);
1921
+ }
1922
+
1923
+ current_len = record.len;
1924
+ current_theta = record.theta;
1925
+ current_batch.Append(&record);
1926
+ }
1927
+ if (current_batch.Size() > 0) {
1928
+ // ProcessBatch(current_batch, current_len, current_theta);
1929
+ batch_group.Append(current_batch);
1930
+ group_lengths.Append(current_len);
1931
+ group_thetas.Append(current_theta);
1932
+ }
1933
+
1934
+ ParallelFor(batch_group.Size(), [&](int i) {
1935
+ ProcessBatchRS(batch_group[i], group_lengths[i], group_thetas[i]);
1936
+ }, TasksPerThread(4));
1937
+ // */
1938
+
1939
+
1206
1940
  /*
1207
1941
  int maxlevel = 0;
1208
1942
  for (auto [i,num] : Enumerate(RegularMLMultiPole::nodes_on_level))
@@ -1212,7 +1946,7 @@ namespace ngsbem
1212
1946
  cout << "reg " << i << ": " << RegularMLMultiPole::nodes_on_level[i] << endl;
1213
1947
  */
1214
1948
 
1215
- static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
1949
+ static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
1216
1950
  root.LocalizeExpansion(false);
1217
1951
  }
1218
1952
 
@@ -1246,6 +1980,7 @@ namespace ngsbem
1246
1980
 
1247
1981
  };
1248
1982
 
1983
+
1249
1984
  template <typename elem_type>
1250
1985
  inline ostream & operator<< (ostream & ost, const RegularMLMultiPole<elem_type> & mlmp)
1251
1986
  {