ngsolve 6.2.2505.post17.dev0__cp39-cp39-macosx_10_15_universal2.whl → 6.2.2505.post95.dev0__cp39-cp39-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ngsolve might be problematic. Click here for more details.

Files changed (56)
  1. netgen/include/bilinearform.hpp +1 -1
  2. netgen/include/diffop_impl.hpp +3 -1
  3. netgen/include/gridfunction.hpp +1 -1
  4. netgen/include/hcurlcurlfe.hpp +20 -0
  5. netgen/include/mptools.hpp +786 -101
  6. netgen/include/ngblas.hpp +11 -0
  7. netgen/include/recursive_pol.hpp +63 -11
  8. netgen/include/simd_complex.hpp +20 -0
  9. netgen/include/sparsematrix_impl.hpp +25 -0
  10. netgen/include/vector.hpp +15 -2
  11. netgen/libngbla.dylib +0 -0
  12. netgen/libngcomp.dylib +0 -0
  13. netgen/libngfem.dylib +0 -0
  14. netgen/libngla.dylib +0 -0
  15. netgen/libngsbem.dylib +0 -0
  16. netgen/libngstd.dylib +0 -0
  17. ngsolve/cmake/NGSolveConfig.cmake +1 -1
  18. ngsolve/config/config.py +5 -5
  19. {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post95.dev0.dist-info}/METADATA +2 -2
  20. {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post95.dev0.dist-info}/RECORD +56 -56
  21. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/Netgen.icns +0 -0
  22. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/bin/ngscxx +0 -0
  23. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/bin/ngsld +0 -0
  24. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/bin/ngsolve.tcl +0 -0
  25. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/bin/ngspy +0 -0
  26. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/beam.geo +0 -0
  27. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/beam.vol +0 -0
  28. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/chip.in2d +0 -0
  29. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/chip.vol +0 -0
  30. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/coil.geo +0 -0
  31. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/coil.vol +0 -0
  32. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/coilshield.geo +0 -0
  33. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/coilshield.vol +0 -0
  34. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/cube.geo +0 -0
  35. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/cube.vol +0 -0
  36. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d10_DGdoubleglazing.pde +0 -0
  37. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d11_chip_nitsche.pde +0 -0
  38. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d1_square.pde +0 -0
  39. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d2_chip.pde +0 -0
  40. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d3_helmholtz.pde +0 -0
  41. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d4_cube.pde +0 -0
  42. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d5_beam.pde +0 -0
  43. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d6_shaft.pde +0 -0
  44. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d7_coil.pde +0 -0
  45. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d8_coilshield.pde +0 -0
  46. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d9_hybridDG.pde +0 -0
  47. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/doubleglazing.in2d +0 -0
  48. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/doubleglazing.vol +0 -0
  49. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/piezo2d40round4.vol.gz +0 -0
  50. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/shaft.geo +0 -0
  51. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/shaft.vol +0 -0
  52. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/square.in2d +0 -0
  53. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/square.vol +0 -0
  54. {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post95.dev0.dist-info}/LICENSE +0 -0
  55. {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post95.dev0.dist-info}/WHEEL +0 -0
  56. {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post95.dev0.dist-info}/top_level.txt +0 -0
@@ -20,6 +20,157 @@ namespace ngsbem
20
20
  {
21
21
  using namespace ngfem;
22
22
 
23
+ template<typename T> // VecLength<T>: how many Complex components one entry of type T carries
24
+ constexpr int VecLength = 1; // Default: Complex has length 1
25
+
26
+ template<int N>
27
+ constexpr int VecLength<Vec<N, Complex>> = N; // Specialization: Vec<N,Complex> has length N
28
+
29
+
30
+
31
+ constexpr int FMM_SW = 4; // SIMD width used when packing charges/dipoles (see the MakeSimd calls on std::array<..., FMM_SW>)
32
+
33
+
34
+ // ************************ SIMD - creation (should end up in simd.hpp) *************
35
+
36
+ // Horizontal sum for a Vec of SIMD values: applies HSum to each of the S components and returns the results as a Vec<S,T>.
37
+ template <int S, typename T, int SW>
38
+ Vec<S,T> HSum (Vec<S,SIMD<T,SW>> v)
39
+ {
40
+ Vec<S,T> res;
41
+ for (int i = 0; i < S; i++)
42
+ res(i) = HSum(v(i)); // calls the HSum overload for a single SIMD<T,SW> (defined outside this view)
43
+ // Iterate<S> ([&](auto i) {
44
+ // res.HTData().template Elem<i.value>() = HSum(v.HTData().template Elem<i.value>());
45
+ // });
46
+ return res;
47
+ }
48
+
49
+ // Forward declaration of the helper class; MakeSimd below needs the name before the definitions appear.
50
+ template <typename T, size_t S> class MakeSimdCl;
51
+
52
+ template <typename T, size_t S>
53
+ auto MakeSimd (array<T,S> aa) { return MakeSimdCl(aa).Get(); } // entry point: packs an array of S values; the work is delegated to MakeSimdCl<T,S>::Get, chosen by the element type T
54
+
55
+ // Primary template: turns an array of S values of type T into one SIMD<T,S>.
56
+ template <typename T, size_t S>
57
+ class MakeSimdCl
58
+ {
59
+ array<T,S> a;
60
+ public:
61
+ MakeSimdCl (array<T,S> aa) : a(aa) { ; }
62
+ auto Get() const
63
+ {
64
+ SIMD<T,S> sa( [this] (auto i) { return (this->a)[i]; }); // SIMD constructor is handed a callable mapping index i to a[i] (presumably invoked once per lane - confirm in simd.hpp)
65
+ return sa;
66
+ }
67
+ };
68
+
69
+ // Specialization for arrays of Vec<VS,T>: the result is a Vec<VS,...> whose i-th component is MakeSimd of the i-th components of all S inputs.
70
+ template <typename T, size_t S, int VS>
71
+ class MakeSimdCl<Vec<VS,T>,S>
72
+ {
73
+ array<Vec<VS,T>,S> a;
74
+ public:
75
+ MakeSimdCl (array<Vec<VS,T>,S> aa) : a(aa) { ; }
76
+
77
+ auto Get() const
78
+ {
79
+ array<T,S> ai; // scratch array: holds component i of every input Vec
80
+ Vec<VS, decltype(MakeSimd(ai))> res; // component type is whatever MakeSimd yields for array<T,S>
81
+ for (int i = 0; i < VS; i++)
82
+ {
83
+ for (int j = 0; j < S; j++)
84
+ ai[j] = a[j](i); // gather component i from input j
85
+ res(i) = MakeSimd(ai);
86
+ }
87
+ return res;
88
+ }
89
+ };
90
+
91
+
92
+ // Specialization for Complex: real and imaginary parts are packed separately and combined into one SIMD<Complex,S>.
93
+ template <size_t S>
94
+ class MakeSimdCl<Complex,S>
95
+ {
96
+ array<Complex,S> a;
97
+ public:
98
+ MakeSimdCl (array<Complex,S> aa) : a(aa) { ; }
99
+ auto Get() const
100
+ {
101
+ array<double,S> ar, ai; // ar: real parts, ai: imaginary parts
102
+ for (int j = 0; j < S; j++)
103
+ {
104
+ ar[j] = Real(a[j]);
105
+ ai[j] = Imag(a[j]);
106
+ }
107
+
108
+ return SIMD<Complex,S> (MakeSimd(ar), MakeSimd(ai)); // built from the packed real part and the packed imaginary part
109
+ }
110
+ };
111
+
112
+
113
+
114
+
115
+
116
+ // Specialization for tuples: packs the inputs component by component and returns a tuple of the packed components.
117
+ template <typename Tfirst, size_t S, typename ...Trest>
118
+ class MakeSimdCl<std::tuple<Tfirst,Trest...>,S>
119
+ {
120
+ array<std::tuple<Tfirst,Trest...>,S> a;
121
+ public:
122
+ MakeSimdCl (array<std::tuple<Tfirst,Trest...>,S> aa) : a(aa) { ; }
123
+ auto Get() const
124
+ {
125
+ array<Tfirst,S> a0; // first tuple component of every input
126
+ for (int i = 0; i < S; i++)
127
+ a0[i] = std::get<0> (a[i]);
128
+
129
+ if constexpr (std::tuple_size<tuple<Tfirst,Trest...>>::value == 1)
130
+ {
131
+ return tuple(MakeSimd(a0)); // single-component tuple: end of the recursion
132
+ }
133
+ else
134
+ {
135
+ array<tuple<Trest...>,S> arest; // remaining components of every input
136
+ for (int i = 0; i < S; i++)
137
+ arest[i] = skip_first(a[i]);
138
+
139
+ return tuple_cat ( tuple (MakeSimd(a0)), MakeSimd(arest) ); // MakeSimd(arest) recurses into this specialization with one component fewer
140
+ }
141
+ }
142
+
143
+ template <typename... Ts>
144
+ static auto skip_first(const std::tuple<Ts...>& t) { // returns a tuple holding every element of t except the first
145
+ return std::apply([](auto first, auto... rest) {
146
+ return std::make_tuple(rest...);
147
+ }, t);
148
+ }
149
+ };
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+ // Returns (len, theta, phi) for the vector dist: len = L2Norm(dist), theta = acos(dist(2)/len), phi = atan2(dist(1), dist(0)).
160
+ inline std::tuple<double, double, double> SphericalCoordinates(Vec<3> dist){
161
+ double len, theta, phi;
162
+ len = L2Norm(dist);
163
+ if (len < 1e-30) // (near-)zero vector: avoid dividing by len, use theta = 0
164
+ theta = 0;
165
+ else
166
+ theta = acos (dist(2) / len);
167
+ if (sqr(dist(0))+sqr(dist(1)) < 1e-30) // vector (almost) on the z-axis: phi is set to 0
168
+ phi = 0;
169
+ else
170
+ phi = atan2(dist(1), dist(0));
171
+ return {len, theta, phi};
172
+ }
173
+
23
174
 
24
175
  template <typename entry_type = Complex>
25
176
  class NGS_DLL_HEADER SphericalHarmonics
@@ -84,9 +235,69 @@ namespace ngsbem
84
235
 
85
236
  void Calc (Vec<3> x, FlatVector<Complex> shapes);
86
237
 
87
-
238
+
239
+ void FlipZ ();
88
240
  void RotateZ (double alpha);
89
- void RotateY (double alpha);
241
+ // Callback overload: does not touch the coefficients itself; calls func(ii, factor) for every index ii (n = 0..order, m = -n..n) with factor = exp(i*m*alpha).
242
+ template <typename FUNC>
243
+ void RotateZ (double alpha, FUNC func) const
244
+ {
245
+ if (order < 0) return;
246
+
247
+ Vector<Complex> exp_imalpha(order+1); // exp_imalpha(k) = exp(i*k*alpha) for k = 0..order
248
+ Complex exp_ialpha(cos(alpha), sin(alpha));
249
+ Complex prod = 1.0;
250
+ for (int i = 0; i <= order; i++)
251
+ {
252
+ exp_imalpha(i) = prod;
253
+ prod *= exp_ialpha; // powers built by repeated multiplication
254
+ }
255
+
256
+ int ii = 0; // running linear index over all (n,m) pairs
257
+ for (int n = 0; n <= order; n++)
258
+ {
259
+ for (int m = -n; m < 0; m++, ii++)
260
+ func(ii, conj(exp_imalpha(-m))); // negative m: conjugate of the |m|-th power
261
+ for (int m = 0; m <= n; m++, ii++)
262
+ func(ii, exp_imalpha(m));
263
+ };
264
+ };
265
+ // Calls func(ii, factor) for every index ii (n = 0..order, m = -n..n) with factor = exp(i*m*alpha); if flip is true the factor is also negated whenever (n-m) is odd.
266
+ template <typename FUNC>
267
+ void RotateZFlip (double alpha, bool flip, FUNC func) const
268
+ {
269
+ if (order < 0) return;
270
+
271
+ Vector<Complex> exp_imalpha(order+1); // exp_imalpha(k) = exp(i*k*alpha) for k = 0..order
272
+ Complex exp_ialpha(cos(alpha), sin(alpha));
273
+ Complex prod = 1.0;
274
+ for (int i = 0; i <= order; i++)
275
+ {
276
+ exp_imalpha(i) = prod;
277
+ prod *= exp_ialpha;
278
+ }
279
+
280
+ int ii = 0; // running linear index over all (n,m) pairs
281
+
282
+ auto FlipFactor = [] (int n, int m, bool flip)->double // sign applied on top of the rotation factor
283
+ {
284
+ if (flip)
285
+ return ((n-m)%2) == 1 ? -1 : 1; // n-m >= 0 in the loops below, so the remainder is 0 or 1
286
+ return 1.0;
287
+ };
288
+
289
+ for (int n = 0; n <= order; n++)
290
+ {
291
+ for (int m = -n; m < 0; m++, ii++)
292
+ func(ii, FlipFactor(n,m,flip)*conj(exp_imalpha(-m)));
293
+ for (int m = 0; m <= n; m++, ii++)
294
+ func(ii, FlipFactor(n,m,flip)*exp_imalpha(m));
295
+ };
296
+ };
297
+
298
+
299
+
300
+ void RotateY (double alpha, bool parallel = false);
90
301
 
91
302
 
92
303
  static double CalcAmn (int m, int n)
@@ -119,11 +330,11 @@ namespace ngsbem
119
330
  // https://fortran-lang.discourse.group/t/looking-for-spherical-bessel-and-hankel-functions-of-first-and-second-kind-and-arbitrary-order/2308/2
120
331
  NGS_DLL_HEADER
121
332
  void besseljs3d (int nterms, double z, double scale,
122
- FlatVector<double> fjs, FlatVector<double> fjder);
333
+ SliceVector<double> fjs, SliceVector<double> fjder = FlatVector<double>(0, nullptr));
123
334
 
124
335
  NGS_DLL_HEADER
125
336
  void besseljs3d (int nterms, Complex z, double scale,
126
- FlatVector<Complex> fjs, FlatVector<Complex> fjder);
337
+ SliceVector<Complex> fjs, SliceVector<Complex> fjder = FlatVector<Complex>(0, nullptr));
127
338
 
128
339
 
129
340
  /*
@@ -142,14 +353,17 @@ namespace ngsbem
142
353
  FlatVector<double> jp,
143
354
  FlatVector<double> yp);
144
355
 
145
-
356
+
146
357
 
147
358
  template <typename T>
148
359
  void SphericalBessel (int n, double rho, double scale, T && values)
149
360
  {
361
+ besseljs3d (n, rho, scale, values); // writes straight into values; the derivative argument is left at its default
362
+ /*
150
363
  Vector<double> j(n+1), jp(n+1);
151
364
  besseljs3d (n, rho, scale, j, jp);
152
365
  values = j;
366
+ */
153
367
  }
154
368
 
155
369
 
@@ -173,21 +387,6 @@ namespace ngsbem
173
387
  return;
174
388
  }
175
389
  Vector j(n+1), y(n+1), jp(n+1), yp(n+1);
176
- // SBESJY (rho, n, j, y, jp, yp);
177
-
178
- /*
179
- values = j + Complex(0,1) * y;
180
- if (scale != 1.0)
181
- {
182
- double prod = 1.0;
183
- for (int i = 0; i <= n; i++)
184
- {
185
- values(i) *= prod;
186
- prod *= scale;
187
- }
188
- }
189
- */
190
-
191
390
 
192
391
  // the bessel-evaluation with scale
193
392
  besseljs3d (n, rho, 1/scale, j, jp);
@@ -358,18 +557,7 @@ namespace ngsbem
358
557
  // static Timer t("mptool Transform "+ToString(typeid(RADIAL).name())+ToString(typeid(TARGET).name()));
359
558
  // RegionTimer reg(t);
360
559
 
361
- double len = L2Norm(dist);
362
- double theta, phi;
363
-
364
- if (len < 1e-30)
365
- theta = 0;
366
- else
367
- theta = acos (dist(2) / len);
368
-
369
- if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
370
- phi = 0;
371
- else
372
- phi = atan2(dist(1), dist(0));
560
+ auto [len, theta, phi] = SphericalCoordinates(dist);
373
561
 
374
562
 
375
563
  // MultiPole<RADIAL,entry_type> tmp{*this};
@@ -386,14 +574,18 @@ namespace ngsbem
386
574
  }
387
575
 
388
576
  template <typename TARGET>
389
- void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist) const
577
+ void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist, bool atomic = false) const
390
578
  {
391
579
  if (SH().Order() < 0) return;
392
580
  if (target.SH().Order() < 0) return;
393
581
 
394
582
  MultiPole<TARGET,entry_type> tmp{target};
395
583
  Transform(tmp, dist);
396
- target.SH().Coefs() += tmp.SH().Coefs();
584
+ if (!atomic)
585
+ target.SH().Coefs() += tmp.SH().Coefs();
586
+ else
587
+ for (int j = 0; j < target.SH().Coefs().Size(); j++)
588
+ AtomicAdd(target.SH().Coefs()[j], tmp.SH().Coefs()[j]);
397
589
  }
398
590
 
399
591
  template <typename TARGET>
@@ -412,11 +604,124 @@ namespace ngsbem
412
604
  static constexpr int maxdirect = 100;
413
605
 
414
606
 
607
+ template <typename SCAL, auto S> // wraps a vector of Vec<S,SCAL> as a matrix of fixed width S
608
+ inline auto VecVector2Matrix (FlatVector<Vec<S,SCAL>> vec)
609
+ {
610
+ return FlatMatrixFixWidth<S,SCAL> (vec.Size(), vec.Data()->Data()); // height = number of entries; built from the data pointer of the first Vec
611
+ }
612
+ // Overload for plain Complex entries: same construction with width 1.
613
+ inline auto VecVector2Matrix (FlatVector<Complex> vec)
614
+ {
615
+ return FlatMatrixFixWidth<1,Complex> (vec.Size(), vec.Data());
616
+ }
617
+
618
+
415
619
  template <typename entry_type=Complex>
416
620
  class SingularMLMultiPole
417
621
  {
622
+ using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
418
623
  static Array<size_t> nodes_on_level;
419
624
 
625
+ struct RecordingSS // one recorded source->target transform between two MPSingular multipoles, executed later in batches
626
+ {
627
+ const MultiPole<MPSingular,entry_type> * mp_source;
628
+ MultiPole<MPSingular,entry_type> * mp_target;
629
+ Vec<3> dist; // shift vector passed to the transform
630
+ double len, theta, phi; // spherical coordinates of dist; theta may be mirrored, see flipz
631
+ bool flipz; // true when the original theta was > pi/2
632
+ public:
633
+ RecordingSS() = default; // note: leaves all members uninitialized
634
+ RecordingSS (const MultiPole<MPSingular,entry_type> * amp_source,
635
+ MultiPole<MPSingular,entry_type> * amp_target,
636
+ Vec<3> adist)
637
+ : mp_source(amp_source), mp_target(amp_target), dist(adist)
638
+ {
639
+ std::tie(len, theta, phi) = SphericalCoordinates(adist);
640
+ // flipz = false;
641
+ flipz = theta > M_PI/2;
642
+ if (flipz) theta = M_PI-theta; // store the mirrored angle, so theta ends up in [0, pi/2]
643
+ }
644
+ };
645
+
646
+ // Runs all recorded transforms of one batch (called with a single len and theta for the whole batch), picking a vectorized kernel by N = batch_size * vec_length.
647
+ static void ProcessBatch(FlatArray<RecordingSS*> batch, double len, double theta) {
648
+ constexpr int vec_length = VecLength<entry_type>;
649
+ int batch_size = batch.Size();
650
+ int N = batch_size * vec_length; // total number of Complex columns needed for this batch
651
+ // *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(entry_type).name() << ", len = " << len << ", theta = " << theta << endl;
652
+
653
+ if (N <= 1 || batch_size <= 1) { // nothing to vectorize: do each transform on its own, with atomic accumulation
654
+ for (auto* rec : batch) {
655
+ rec->mp_source->TransformAdd(*rec->mp_target, rec->dist, true);
656
+ }
657
+ }
658
+ else if (N <= 3) { // otherwise use the smallest instantiated width >= N (3, 4, 6, 12, 24, 48, 96, 192)
659
+ ProcessVectorizedBatch<3, vec_length>(batch, len, theta);
660
+ }
661
+ else if (N <= 4) {
662
+ ProcessVectorizedBatch<4, vec_length>(batch, len, theta);
663
+ }
664
+ else if (N <= 6) {
665
+ ProcessVectorizedBatch<6, vec_length>(batch, len, theta);
666
+ }
667
+ else if (N <= 12) {
668
+ ProcessVectorizedBatch<12, vec_length>(batch, len, theta);
669
+ }
670
+ else if (N <= 24) {
671
+ ProcessVectorizedBatch<24, vec_length>(batch, len, theta);
672
+ }
673
+ else if (N <= 48) {
674
+ ProcessVectorizedBatch<48, vec_length>(batch, len, theta);
675
+ }
676
+ else if (N <= 96) {
677
+ ProcessVectorizedBatch<96, vec_length>(batch, len, theta);
678
+ }
679
+ else if (N <= 192) {
680
+ ProcessVectorizedBatch<192, vec_length>(batch, len, theta);
681
+ }
682
+ else {
683
+ // Split large batches
684
+ ProcessBatch(batch.Range(0, 192 / vec_length), len, theta); // first 192/vec_length records fill the widest kernel
685
+ ProcessBatch(batch.Range(192 / vec_length, batch_size), len, theta); // remainder handled recursively
686
+ }
687
+ }
688
+ // Vectorized kernel: gathers the batch's sources into one Vec<N,Complex>-valued multipole, applies RotateY / ShiftZ / RotateY once, then scatters into the targets.
689
+ template<int N, int vec_length>
690
+ static void ProcessVectorizedBatch(FlatArray<RecordingSS*> batch, double len, double theta) {
691
+
692
+ // *testout << "Processing vectorized S->S batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
693
+ MultiPole<MPSingular, Vec<N,Complex>> vec_source(batch[0]->mp_source->Order(), batch[0]->mp_source->Kappa(), batch[0]->mp_source->RTyp()); // order/kappa/rtyp taken from the first record - assumes all records in the batch agree (TODO confirm)
694
+ MultiPole<MPSingular, Vec<N,Complex>> vec_target(batch[0]->mp_target->Order(), batch[0]->mp_target->Kappa(), batch[0]->mp_target->RTyp());
695
+
696
+ // Copy multipoles into vectorized multipole
697
+ for (int i = 0; i < batch.Size(); i++)
698
+ {
699
+ auto source_i = VecVector2Matrix (batch[i]->mp_source->SH().Coefs());
700
+ auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length); // record i owns columns [i*vec_length, (i+1)*vec_length)
701
+ batch[i]->mp_source->SH().RotateZFlip(batch[i]->phi, batch[i]->flipz,
702
+ [source_i, source_mati] (size_t ii, Complex factor)
703
+ {
704
+ source_mati.Row(ii) = factor * source_i.Row(ii); // per-record z-rotation (and flip sign) applied while copying
705
+ });
706
+ }
707
+
708
+ vec_source.SH().RotateY(theta, vec_source.SH().Order() >= 100); // second argument is RotateY's 'parallel' flag
709
+ vec_source.ShiftZ(-len, vec_target);
710
+ vec_target.SH().RotateY(-theta, vec_target.SH().Order() >= 100);
711
+
712
+ // Copy vectorized multipole into individual multipoles
713
+ for (int i = 0; i < batch.Size(); i++)
714
+ {
715
+ auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
716
+ auto target_mati = VecVector2Matrix (batch[i]->mp_target->SH().Coefs());
717
+ batch[i]->mp_target->SH().RotateZFlip(-batch[i]->phi, batch[i]->flipz,
718
+ [source_mati, target_mati] (size_t ii, Complex factor)
719
+ {
720
+ AtomicAdd (target_mati.Row(ii), factor * source_mati.Row(ii)); // accumulate via AtomicAdd rather than a plain +=
721
+ });
722
+ }
723
+ }
724
+
420
725
  struct Node
421
726
  {
422
727
  Vec<3> center;
@@ -428,7 +733,14 @@ namespace ngsbem
428
733
  Array<tuple<Vec<3>, entry_type>> charges;
429
734
  Array<tuple<Vec<3>, Vec<3>, entry_type>> dipoles;
430
735
  Array<tuple<Vec<3>, Vec<3>, Complex,int>> currents;
736
+
737
+ using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
738
+ Array<tuple<Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_charges;
739
+ Array<tuple<Vec<3,SIMD<double,FMM_SW>>, Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_dipoles;
740
+
431
741
  int total_sources;
742
+ std::mutex node_mutex;
743
+ atomic<bool> have_childs{false};
432
744
 
433
745
  Node (Vec<3> acenter, double ar, int alevel, double akappa)
434
746
  : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*akappa), akappa, ar) // min(1.0, ar*akappa))
@@ -449,12 +761,26 @@ namespace ngsbem
449
761
  cc(2) += (i&4) ? r/2 : -r/2;
450
762
  childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
451
763
  }
764
+ have_childs = true;
452
765
  }
453
766
 
454
767
 
455
768
  void AddCharge (Vec<3> x, entry_type c)
456
769
  {
457
- if (childs[0])
770
+ if (have_childs) // quick check without locking
771
+ {
772
+ // directly send to childs:
773
+ int childnum = 0;
774
+ if (x(0) > center(0)) childnum += 1;
775
+ if (x(1) > center(1)) childnum += 2;
776
+ if (x(2) > center(2)) childnum += 4;
777
+ childs[childnum] -> AddCharge(x, c);
778
+ return;
779
+ }
780
+
781
+ lock_guard<mutex> guard(node_mutex);
782
+
783
+ if (have_childs) // test again after locking
458
784
  {
459
785
  // directly send to childs:
460
786
  int childnum = 0;
@@ -465,6 +791,8 @@ namespace ngsbem
465
791
  return;
466
792
  }
467
793
 
794
+
795
+
468
796
  charges.Append( tuple{x,c} );
469
797
 
470
798
  // if (r*mp.Kappa() < 1e-8) return;
@@ -489,7 +817,7 @@ namespace ngsbem
489
817
 
490
818
  void AddDipole (Vec<3> x, Vec<3> d, entry_type c)
491
819
  {
492
- if (childs[0])
820
+ if (have_childs)
493
821
  {
494
822
  // directly send to childs:
495
823
 
@@ -501,6 +829,23 @@ namespace ngsbem
501
829
  return;
502
830
  }
503
831
 
832
+ lock_guard<mutex> guard(node_mutex);
833
+
834
+ if (have_childs)
835
+ {
836
+ // directly send to childs:
837
+
838
+ int childnum = 0;
839
+ if (x(0) > center(0)) childnum += 1;
840
+ if (x(1) > center(1)) childnum += 2;
841
+ if (x(2) > center(2)) childnum += 4;
842
+ childs[childnum] -> AddDipole(x, d, c);
843
+ return;
844
+ }
845
+
846
+
847
+
848
+
504
849
  dipoles.Append (tuple{x,d,c});
505
850
 
506
851
  if (dipoles.Size() < maxdirect || r < 1e-8)
@@ -520,6 +865,7 @@ namespace ngsbem
520
865
  currents.SetSize0();
521
866
  }
522
867
 
868
+ // not parallel yet
523
869
  void AddCurrent (Vec<3> sp, Vec<3> ep, Complex j, int num)
524
870
  {
525
871
  if (childs[0])
@@ -549,7 +895,7 @@ namespace ngsbem
549
895
  }
550
896
  return;
551
897
  }
552
-
898
+
553
899
  currents.Append (tuple{sp,ep,j,num});
554
900
 
555
901
  // if (currents.Size() < maxdirect || r < 1e-8)
@@ -583,26 +929,74 @@ namespace ngsbem
583
929
  return sum;
584
930
  }
585
931
 
586
- // static Timer t("fmm direct eval"); RegionTimer reg(t);
587
- if (mp.Kappa() < 1e-8)
932
+ {
933
+ // static Timer t("fmm direct eval"); RegionTimer reg(t);
934
+ // t.AddFlops (charges.Size());
935
+ if (simd_charges.Size())
588
936
  {
589
- for (auto [x,c] : charges)
590
- if (double rho = L2Norm(p-x); rho > 0)
591
- sum += (1/(4*M_PI))*Complex(1,rho*mp.Kappa()) / rho * c;
937
+ simd_entry_type vsum{0.0};
938
+ if (mp.Kappa() < 1e-8)
939
+ for (auto [x,c] : simd_charges)
940
+ {
941
+ auto rho = L2Norm(p-x);
942
+ auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW> (1,rho*mp.Kappa()) / rho;
943
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
944
+ vsum += kernel * c;
945
+ }
946
+ else
947
+ for (auto [x,c] : simd_charges)
948
+ {
949
+ auto rho = L2Norm(p-x);
950
+ auto [si,co] = sincos(rho*mp.Kappa());
951
+ auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) / rho;
952
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
953
+ vsum += kernel * c;
954
+ }
955
+
956
+ sum += HSum(vsum);
592
957
  }
593
958
  else
594
- for (auto [x,c] : charges)
595
- if (double rho = L2Norm(p-x); rho > 0)
596
- sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
597
-
598
- for (auto [x,d,c] : dipoles)
959
+ {
960
+ if (mp.Kappa() < 1e-8)
961
+ {
962
+ for (auto [x,c] : charges)
963
+ if (double rho = L2Norm(p-x); rho > 0)
964
+ sum += (1/(4*M_PI))*Complex(1,rho*mp.Kappa()) / rho * c;
965
+ }
966
+ else
967
+ for (auto [x,c] : charges)
968
+ if (double rho = L2Norm(p-x); rho > 0)
969
+ sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
970
+ }
971
+ }
972
+
973
+ if (simd_dipoles.Size())
974
+ {
975
+ simd_entry_type vsum{0.0};
976
+ for (auto [x,d,c] : simd_dipoles)
977
+ {
978
+ auto rho = L2Norm(p-x);
979
+ auto drhodp = (1.0/rho) * (p-x);
980
+ auto [si,co] = sincos(rho*mp.Kappa());
981
+ auto dGdrho = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) *
982
+ (-1.0/(rho*rho) + SIMD<Complex,FMM_SW>(0, mp.Kappa())/rho);
983
+ auto kernel = dGdrho * InnerProduct(drhodp, d);
984
+ kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
985
+ vsum += kernel * c;
986
+ }
987
+ sum += HSum(vsum);
988
+ }
989
+ else
990
+ {
991
+ for (auto [x,d,c] : dipoles)
599
992
  if (double rho = L2Norm(p-x); rho > 0)
600
- {
993
+ {
601
994
  Vec<3> drhodp = 1.0/rho * (p-x);
602
995
  Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
603
- (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
996
+ (Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
604
997
  sum += dGdrho * InnerProduct(drhodp, d) * c;
605
- }
998
+ }
999
+ }
606
1000
 
607
1001
  for (auto [sp,ep,j,num] : currents)
608
1002
  {
@@ -664,23 +1058,27 @@ namespace ngsbem
664
1058
  }
665
1059
  }
666
1060
 
667
- void CalcMP()
1061
+ void CalcMP(Array<RecordingSS> * recording, Array<Node*> * nodes_to_process)
668
1062
  {
669
- mp.SH().Coefs() = 0.0;
1063
+ // mp.SH().Coefs() = 0.0;
670
1064
  if (childs[0])
671
1065
  {
672
- if (total_sources < 1000)
1066
+ if (total_sources < 1000 || recording)
673
1067
  for (auto & child : childs)
674
- child->CalcMP();
1068
+ child->CalcMP(recording, nodes_to_process);
675
1069
  else
676
1070
  ParallelFor (8, [&] (int nr)
677
1071
  {
678
- childs[nr] -> CalcMP();
1072
+ childs[nr] -> CalcMP(recording, nodes_to_process);
679
1073
  });
680
1074
 
681
1075
 
682
- for (auto & child : childs)
683
- child->mp.TransformAdd(mp, center-child->center);
1076
+ for (auto & child : childs){
1077
+ if (recording && child->mp.SH().Coefs().Size() > 0)
1078
+ *recording += RecordingSS(&child->mp, &mp, center-child->center);
1079
+ else
1080
+ child->mp.TransformAdd(mp, center-child->center);
1081
+ }
684
1082
  }
685
1083
  else
686
1084
  {
@@ -690,14 +1088,54 @@ namespace ngsbem
690
1088
  return;
691
1089
  }
692
1090
 
693
- for (auto [x,c] : charges)
694
- mp.AddCharge (x-center,c);
1091
+ // make simd charges, comment this block for testing ...
1092
+ simd_charges.SetSize( (charges.Size()+FMM_SW-1)/FMM_SW);
1093
+ size_t i = 0, ii = 0;
1094
+ for ( ; i+FMM_SW <= charges.Size(); i+=FMM_SW, ii++)
1095
+ {
1096
+ std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
1097
+ for (int j = 0; j < FMM_SW; j++) ca[j] = charges[i+j];
1098
+ simd_charges[ii] = MakeSimd(ca);
1099
+ }
1100
+ if (i < charges.Size())
1101
+ {
1102
+ std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
1103
+ int j = 0;
1104
+ for ( ; i+j < charges.Size(); j++) ca[j] = charges[i+j];
1105
+ for ( ; j < FMM_SW; j++) ca[j] = tuple( get<0>(ca[0]), entry_type{0.0} );
1106
+ simd_charges[ii] = MakeSimd(ca);
1107
+ }
1108
+
1109
+ simd_dipoles.SetSize( (dipoles.Size()+FMM_SW-1)/FMM_SW);
1110
+ i = 0, ii = 0;
1111
+ for ( ; i+FMM_SW <= dipoles.Size(); i+=FMM_SW, ii++)
1112
+ {
1113
+ std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
1114
+ for (int j = 0; j < FMM_SW; j++) di[j] = dipoles[i+j];
1115
+ simd_dipoles[ii] = MakeSimd(di);
1116
+ }
1117
+ if (i < dipoles.Size())
1118
+ {
1119
+ std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
1120
+ int j = 0;
1121
+ for ( ; i+j < dipoles.Size(); j++) di[j] = dipoles[i+j];
1122
+ for ( ; j < FMM_SW; j++) di[j] = tuple( get<0>(di[0]), get<1>(di[0]), entry_type{0.0} );
1123
+ simd_dipoles[ii] = MakeSimd(di);
1124
+ }
1125
+
695
1126
 
696
- for (auto [x,d,c] : dipoles)
697
- mp.AddDipole (x-center, d, c);
1127
+ if (nodes_to_process)
1128
+ *nodes_to_process += this;
1129
+ else {
1130
+ for (auto [x,c] : charges)
1131
+ mp.AddCharge (x-center,c);
1132
+
1133
+ for (auto [x,d,c] : dipoles)
1134
+ mp.AddDipole (x-center, d, c);
698
1135
 
699
- for (auto [sp,ep,j,num] : currents)
700
- mp.AddCurrent (sp-center, ep-center, j, num);
1136
+ for (auto [sp,ep,j,num] : currents)
1137
+ mp.AddCurrent (sp-center, ep-center, j, num);
1138
+ }
701
1139
  }
702
1140
  }
703
1141
 
@@ -836,6 +1274,10 @@ namespace ngsbem
836
1274
  void CalcMP()
837
1275
  {
838
1276
  static Timer t("mptool compute singular MLMP"); RegionTimer rg(t);
1277
+ static Timer ts2mp("mptool compute singular MLMP - source2mp");
1278
+ static Timer tS2S("mptool compute singular MLMP - S->S");
1279
+ static Timer trec("mptool comput singular recording");
1280
+ static Timer tsort("mptool comput singular sort");
839
1281
 
840
1282
  /*
841
1283
  int maxlevel = 0;
@@ -847,7 +1289,87 @@ namespace ngsbem
847
1289
  */
848
1290
 
849
1291
  root.CalcTotalSources();
850
- root.CalcMP();
1292
+
1293
+ if (false)
1294
+ // direct evaluation of S->S
1295
+ root.CalcMP(nullptr, nullptr);
1296
+ else
1297
+ {
1298
+
1299
+ Array<RecordingSS> recording;
1300
+ Array<Node*> nodes_to_process;
1301
+
1302
+ {
1303
+ RegionTimer reg(trec);
1304
+ root.CalcMP(&recording, &nodes_to_process);
1305
+ }
1306
+
1307
+ {
1308
+ RegionTimer rs2mp(ts2mp);
1309
+ ParallelFor(nodes_to_process.Size(), [&](int i){
1310
+ auto node = nodes_to_process[i];
1311
+ for (auto [x,c]: node->charges)
1312
+ node->mp.AddCharge(x-node->center, c);
1313
+ for (auto [x,d,c]: node->dipoles)
1314
+ node->mp.AddDipole(x-node->center, d, c);
1315
+ for (auto [sp,ep,j,num]: node->currents)
1316
+ node->mp.AddCurrent(sp-node->center, ep-node->center, j, num);
1317
+ }, TasksPerThread(4));
1318
+ }
1319
+
1320
+ {
1321
+ RegionTimer reg(tsort);
1322
+ QuickSort (recording, [] (auto & a, auto & b)
1323
+ {
1324
+ if (a.len < (1-1e-8) * b.len) return true;
1325
+ if (a.len > (1+1e-8) * b.len) return false;
1326
+ return a.theta < b.theta;
1327
+ });
1328
+ }
1329
+
1330
+ double current_len = -1e100;
1331
+ double current_theta = -1e100;
1332
+ Array<RecordingSS*> current_batch;
1333
+ Array<Array<RecordingSS*>> batch_group;
1334
+ Array<double> group_lengths;
1335
+ Array<double> group_thetas;
1336
+ for (auto & record : recording)
1337
+ {
1338
+ bool len_changed = fabs(record.len - current_len) > 1e-8;
1339
+ bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
1340
+ if ((len_changed || theta_changed) && current_batch.Size() > 0) {
1341
+ batch_group.Append(current_batch);
1342
+ group_lengths.Append(current_len);
1343
+ group_thetas.Append(current_theta);
1344
+ current_batch.SetSize(0);
1345
+ }
1346
+
1347
+ current_len = record.len;
1348
+ current_theta = record.theta;
1349
+ current_batch.Append(&record);
1350
+ }
1351
+ if (current_batch.Size() > 0) {
1352
+ batch_group.Append(current_batch);
1353
+ group_lengths.Append(current_len);
1354
+ group_thetas.Append(current_theta);
1355
+ }
1356
+
1357
+ {
1358
+ RegionTimer rS2S(tS2S);
1359
+ // ParallelFor(batch_group.Size(), [&](int i) {
1360
+ for (int i = 0; i < batch_group.Size(); i++){
1361
+ // *testout << "Processing batch " << i << " of size " << batch_group[i].Size() << ", with len = " << group_lengths[i] << ", theta = " << group_thetas[i] << endl;
1362
+ int chunk_size = 24;
1363
+ if (batch_group[i].Size() < chunk_size)
1364
+ ProcessBatch(batch_group[i], group_lengths[i], group_thetas[i]);
1365
+ else
1366
+ ParallelForRange(IntRange(batch_group[i].Size()), [&](IntRange range) {
1367
+ auto sub_batch = batch_group[i].Range(range.First(), range.Next());
1368
+ ProcessBatch(sub_batch, group_lengths[i], group_thetas[i]);
1369
+ }, TasksPerThread(4));
1370
+ }
1371
+ }
1372
+ }
851
1373
 
852
1374
  havemp = true;
853
1375
  }
@@ -892,19 +1414,128 @@ namespace ngsbem
892
1414
  Vec<3> adist)
893
1415
  : mpS(ampS), mpR(ampR), dist(adist)
894
1416
  {
895
- len = L2Norm(dist);
896
- if (len < 1e-30)
897
- theta = 0;
898
- else
899
- theta = acos (dist(2) / len);
900
-
901
- if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
902
- phi = 0;
903
- else
904
- phi = atan2(dist(1), dist(0));
1417
+ std::tie(len, theta, phi) = SphericalCoordinates(dist);
905
1418
  }
906
1419
  };
907
1420
 
1421
// ProcessBatchRS: applies the singular-to-regular transfer for every record in `batch`.
//   batch      - pointers to recordings; each one provides mpS (source), mpR (target) and dist.
//   len, theta - forwarded unchanged to ProcessVectorizedBatch; the visible caller (CalcMP)
//                builds each batch from records whose len and theta agree within 1e-8.
// With N = batch.Size() * VecLength<elem_type>:
//   * N <= 1 or a single record: each record calls mpS->TransformAdd(*mpR, dist) directly.
//   * N <= 192: dispatch to ProcessVectorizedBatch<W, vec_length> with the smallest
//     W in {3, 4, 6, 12, 24, 48, 96, 192} that is >= N.
//   * otherwise: split into chunks of 192 / vec_length records and recurse on each chunk
//     through ParallelFor.
// NOTE(review): chunksize would be 0 if vec_length ever exceeded 192 - confirm elem_type never gets that wide.
+ static void ProcessBatchRS(FlatArray<RecordingRS*> batch, double len, double theta) {
1422
+ // static Timer t("ProcessBatchRS"); RegionTimer reg(t, batch.Size());
1423
+ constexpr int vec_length = VecLength<elem_type>;
1424
+ int batch_size = batch.Size();
1425
+ int N = batch_size * vec_length;
1426
+ // *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(elem_type).name() << ", len = " << len << ", theta = " << theta << endl;
1427
+
1428
+ if (N <= 1 || batch_size <= 1) {
1429
+ for (auto* rec : batch) {
1430
+ rec->mpS->TransformAdd(*rec->mpR, rec->dist);
1431
+ }
1432
+ }
1433
+ else if (N <= 3) {
1434
+ ProcessVectorizedBatch<3, vec_length>(batch, len, theta);
1435
+ }
1436
+ else if (N <= 4) {
1437
+ ProcessVectorizedBatch<4, vec_length>(batch, len, theta);
1438
+ }
1439
+ else if (N <= 6) {
1440
+ ProcessVectorizedBatch<6, vec_length>(batch, len, theta);
1441
+ }
1442
+ else if (N <= 12) {
1443
+ ProcessVectorizedBatch<12, vec_length>(batch, len, theta);
1444
+ }
1445
+ else if (N <= 24) {
1446
+ ProcessVectorizedBatch<24, vec_length>(batch, len, theta);
1447
+ }
1448
+ else if (N <= 48) {
1449
+ ProcessVectorizedBatch<48, vec_length>(batch, len, theta);
1450
+ }
1451
+ else if (N <= 96) {
1452
+ ProcessVectorizedBatch<96, vec_length>(batch, len, theta);
1453
+ }
1454
+ else if (N <= 192) {
1455
+ ProcessVectorizedBatch<192, vec_length>(batch, len, theta);
1456
+ }
1457
+ else {
1458
+ // Split large batches
1459
+ /*
1460
+ ProcessBatch(batch.Range(0, 192 / vec_length), len, theta);
1461
+ ProcessBatch(batch.Range(192 / vec_length, batch_size), len, theta);
1462
+ */
1463
+
1464
+ /*
1465
+ ParallelFor (2, [&] (int i)
1466
+ {
1467
+ if (i == 0)
1468
+ ProcessBatchRS(batch.Range(0, 192 / vec_length), len, theta);
1469
+ else
1470
+ ProcessBatchRS(batch.Range(192 / vec_length, batch_size), len, theta);
1471
+ }, 2);
1472
+ */
1473
+
1474
+
1475
// num is batch.Size() / chunksize rounded up; the last chunk may be shorter, hence the min()
// on the upper bound. Each chunk holds at most 192 / vec_length records, so the recursive
// call has N <= 192 and lands in one of the branches above.
+ size_t chunksize = 192/vec_length;
1476
+ size_t num = (batch.Size()+chunksize-1) / chunksize;
1477
+ ParallelFor (num, [&](int i)
1478
+ {
1479
+ ProcessBatchRS(batch.Range(i*chunksize, min((i+1)*chunksize, batch.Size())), len, theta);
1480
+ }, num);
1481
+
1482
+ }
1483
+ }
1484
+
1485
+
1486
// ProcessVectorizedBatch: transfers all records of `batch` in one pass by packing their
// coefficients side by side into multipoles whose element type is Vec<N,Complex>.
//   N          - number of columns in the packed multipoles; record i uses columns
//                [i*vec_length, (i+1)*vec_length), so N must be >= batch.Size()*vec_length
//                (the visible caller ProcessBatchRS selects N that way).
//   vec_length - number of columns occupied by one record.
//   len, theta - arguments of the shared ShiftZ(-len, ...) and RotateY(theta) / RotateY(-theta) steps.
// Requires a non-empty batch: order, kappa and rtyp of the packed multipoles are read from batch[0].
// NOTE(review): presumably every record in the batch has the same order/kappa/rtyp as batch[0] - confirm against the caller.
+ template<int N, int vec_length>
1487
+ static void ProcessVectorizedBatch(FlatArray<RecordingRS*> batch, double len, double theta) {
1488
+
1489
+ // static Timer t("ProcessVectorizedBatch, N = "+ToString(N) + ", vec_len = " + ToString(vec_length));
1490
+ // RegionTimer reg(t, batch[0]->mpS->SH().Order());
1491
+ // static Timer ttobatch("mptools - copy to batch 2");
1492
+ // static Timer tfrombatch("mptools - copy from batch 2");
1493
+
1494
+ // *testout << "Processing vectorized batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
1495
+ MultiPole<MPSingular, Vec<N,Complex>> vec_source(batch[0]->mpS->Order(), batch[0]->mpS->Kappa(), batch[0]->mpS->RTyp());
1496
+ // MultiPole<MPSingular, elem_type> tmp_source{*batch[0]->mpS};
1497
+ MultiPole<MPRegular, elem_type> tmp_target{*batch[0]->mpR};
1498
+ MultiPole<MPRegular, Vec<N,Complex>> vec_target(batch[0]->mpR->Order(), batch[0]->mpR->Kappa(), batch[0]->mpR->RTyp());
1499
+
1500
+ // Copy multipoles into vectorized multipole
1501
+ // ttobatch.Start();
1502
// Each record is rotated by its own phi while being copied: RotateZ passes (row index, factor)
// pairs to the callback, which writes factor * source row into this record's column slice.
+ for (int i = 0; i < batch.Size(); i++)
1503
+ {
1504
+ auto source_i = VecVector2Matrix (batch[i]->mpS->SH().Coefs());
1505
+ auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
1506
+ batch[i]->mpS->SH().RotateZ(batch[i]->phi,
1507
+ [source_i, source_mati] (size_t ii, Complex factor)
1508
+ {
1509
+ source_mati.Row(ii) = factor * source_i.Row(ii);
1510
+ });
1511
+ }
1512
+
1513
+ // ttobatch.Stop();
1514
+
1515
// A single rotate / shift / rotate-back sequence runs on the packed multipoles,
// i.e. on the columns of all records at once.
+ vec_source.SH().RotateY(theta);
1516
+ vec_source.ShiftZ(-len, vec_target);
1517
+ vec_target.SH().RotateY(-theta);
1518
+
1519
+ // Copy vectorized multipole into individual multipoles
1520
+ // tfrombatch.Start();
1521
+ for (int i = 0; i < batch.Size(); i++) {
1522
+ // auto source_i = VecVector2Matrix (tmp_target.SH().Coefs());
1523
+ auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
1524
+ auto targeti = VecVector2Matrix(batch[i]->mpR->SH().Coefs());
1525
+
1526
// The factors for the rotation by -phi are requested on tmp_target (a copy of batch[0]->mpR);
// the callback adds factor * packed row to the record's own target rows through AtomicAdd.
// NOTE(review): tmp_target appears to serve only as the object RotateZ is invoked on - confirm.
+ tmp_target.SH().RotateZ(-batch[i]->phi,
1527
+ [source_mati, targeti] (size_t ii, Complex factor)
1528
+ {
1529
+ // source_i.Row(ii) = factor * source_mati.Row(ii);
1530
+ AtomicAdd (VectorView(targeti.Row(ii)), factor * source_mati.Row(ii));
1531
+ });
1532
+ // for (int j = 0; j < tmp_target.SH().Coefs().Size(); j++)
1533
+ // AtomicAdd(batch[i]->mpR->SH().Coefs()[j], tmp_target.SH().Coefs()[j]);
1534
+ }
1535
+ // tfrombatch.Stop();
1536
+
1537
+ }
1538
+
908
1539
 
909
1540
  struct Node
910
1541
  {
@@ -915,6 +1546,8 @@ namespace ngsbem
915
1546
  MultiPole<MPRegular,elem_type> mp;
916
1547
  Array<Vec<3>> targets;
917
1548
  int total_targets;
1549
+ std::mutex node_mutex;
1550
+ atomic<bool> have_childs{false};
918
1551
 
919
1552
  Array<const typename SingularMLMultiPole<elem_type>::Node*> singnodes;
920
1553
 
@@ -939,6 +1572,7 @@ namespace ngsbem
939
1572
  cc(2) += (i&4) ? r/2 : -r/2;
940
1573
  childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
941
1574
  }
1575
+ have_childs = true;
942
1576
  }
943
1577
 
944
1578
  void AddSingularNode (const typename SingularMLMultiPole<elem_type>::Node & singnode, bool allow_refine,
@@ -946,7 +1580,7 @@ namespace ngsbem
946
1580
  {
947
1581
  if (mp.SH().Order() < 0) return;
948
1582
  if (singnode.mp.SH().Order() < 0) return;
949
- if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
1583
+ // if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
950
1584
  if (level > 20)
951
1585
  {
952
1586
  singnodes.Append(&singnode);
@@ -1028,12 +1662,22 @@ namespace ngsbem
1028
1662
 
1029
1663
  if (childs[0])
1030
1664
  {
1031
- for (auto & ch : childs)
1665
+ if (total_targets < 1000)
1032
1666
  {
1033
- if (L2Norm(mp.SH().Coefs()) > 0)
1034
- mp.TransformAdd (ch->mp, ch->center-center);
1035
- ch->LocalizeExpansion(allow_refine);
1667
+ for (int nr = 0; nr < 8; nr++)
1668
+ {
1669
+ if (L2Norm(mp.SH().Coefs()) > 0)
1670
+ mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
1671
+ childs[nr]->LocalizeExpansion(allow_refine);
1672
+ }
1036
1673
  }
1674
+ else
1675
+ ParallelFor(8, [&] (int nr)
1676
+ {
1677
+ if (L2Norm(mp.SH().Coefs()) > 0)
1678
+ mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
1679
+ childs[nr]->LocalizeExpansion(allow_refine);
1680
+ });
1037
1681
  mp = MultiPole<MPRegular,elem_type>(-1, mp.Kappa(), 1.);
1038
1682
  //mp.SH().Coefs()=0.0;
1039
1683
  }
@@ -1041,18 +1685,8 @@ namespace ngsbem
1041
1685
 
1042
1686
  elem_type Evaluate (Vec<3> p) const
1043
1687
  {
1044
- // *testout << "eval p = " << p << ", level = " << level << ", center = " << center << ", r = " << r << endl;
1045
1688
  elem_type sum{0.0};
1046
- /*
1047
- if (childs[0])
1048
- {
1049
- int childnum = 0;
1050
- if (p(0) > center(0)) childnum += 1;
1051
- if (p(1) > center(1)) childnum += 2;
1052
- if (p(2) > center(2)) childnum += 4;
1053
- sum = childs[childnum]->Evaluate(p);
1054
- }
1055
- */
1689
+
1056
1690
  int childnum = 0;
1057
1691
  if (p(0) > center(0)) childnum += 1;
1058
1692
  if (p(1) > center(1)) childnum += 2;
@@ -1062,8 +1696,6 @@ namespace ngsbem
1062
1696
  else
1063
1697
  sum = mp.Eval(p-center);
1064
1698
 
1065
-
1066
- // static Timer t("mptool direct evaluate"); RegionTimer r(t);
1067
1699
  for (auto sn : singnodes)
1068
1700
  sum += sn->EvaluateMP(p);
1069
1701
 
@@ -1112,7 +1744,8 @@ namespace ngsbem
1112
1744
 
1113
1745
  void AddTarget (Vec<3> x)
1114
1746
  {
1115
- if (childs[0])
1747
+ // if (childs[0])
1748
+ if (have_childs) // quick check without locking
1116
1749
  {
1117
1750
  // directly send to childs:
1118
1751
  int childnum = 0;
@@ -1123,6 +1756,20 @@ namespace ngsbem
1123
1756
  return;
1124
1757
  }
1125
1758
 
1759
+ lock_guard<mutex> guard(node_mutex);
1760
+
1761
+ if (have_childs) // test again after locking
1762
+ {
1763
+ // directly send to childs:
1764
+ int childnum = 0;
1765
+ if (x(0) > center(0)) childnum += 1;
1766
+ if (x(1) > center(1)) childnum += 2;
1767
+ if (x(2) > center(2)) childnum += 4;
1768
+ childs[childnum] -> AddTarget(x);
1769
+ return;
1770
+ }
1771
+
1772
+
1126
1773
  targets.Append( x );
1127
1774
 
1128
1775
  // if (r*mp.Kappa() < 1e-8) return;
@@ -1227,6 +1874,8 @@ namespace ngsbem
1227
1874
  void CalcMP(shared_ptr<SingularMLMultiPole<elem_type>> asingmp)
1228
1875
  {
1229
1876
  static Timer t("mptool regular MLMP"); RegionTimer rg(t);
1877
+ static Timer trec("mptool regular MLMP - recording");
1878
+ static Timer tsort("mptool regular MLMP - sort");
1230
1879
 
1231
1880
  singmp = asingmp;
1232
1881
 
@@ -1234,23 +1883,58 @@ namespace ngsbem
1234
1883
  root.RemoveEmptyTrees();
1235
1884
 
1236
1885
 
1237
- root.AddSingularNode(singmp->root, false, nullptr);
1238
- /*
1239
- Array<RecordingRS> recording;
1240
- root.AddSingularNode(singmp->root, false, &recording);
1886
+ // root.AddSingularNode(singmp->root, false, nullptr);
1887
+ // /*
1888
+ Array<RecordingRS> recording;
1889
+ {
1890
+ RegionTimer rrec(trec);
1891
+ root.AddSingularNode(singmp->root, false, &recording);
1892
+ }
1893
+
1241
1894
  // cout << "recorded: " << recording.Size() << endl;
1895
+ {
1896
+ RegionTimer reg(tsort);
1242
1897
  QuickSort (recording, [] (auto & a, auto & b)
1243
1898
  {
1244
1899
  if (a.len < (1-1e-8) * b.len) return true;
1245
1900
  if (a.len > (1+1e-8) * b.len) return false;
1246
1901
  return a.theta < b.theta;
1247
1902
  });
1903
+ }
1904
+
1905
+ double current_len = -1e100;
1906
+ double current_theta = -1e100;
1907
+ Array<RecordingRS*> current_batch;
1908
+ Array<Array<RecordingRS*>> batch_group;
1909
+ Array<double> group_lengths;
1910
+ Array<double> group_thetas;
1248
1911
  for (auto & record : recording)
1249
1912
  {
1250
- record.mpS->TransformAdd(*record.mpR, record.dist);
1251
- // *testout << record.len << ", " << record.theta << ", " << record.phi << endl;
1913
+ bool len_changed = fabs(record.len - current_len) > 1e-8;
1914
+ bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
1915
+ if ((len_changed || theta_changed) && current_batch.Size() > 0) {
1916
+ // ProcessBatch(current_batch, current_len, current_theta);
1917
+ batch_group.Append(current_batch);
1918
+ group_lengths.Append(current_len);
1919
+ group_thetas.Append(current_theta);
1920
+ current_batch.SetSize(0);
1921
+ }
1922
+
1923
+ current_len = record.len;
1924
+ current_theta = record.theta;
1925
+ current_batch.Append(&record);
1252
1926
  }
1253
- */
1927
+ if (current_batch.Size() > 0) {
1928
+ // ProcessBatch(current_batch, current_len, current_theta);
1929
+ batch_group.Append(current_batch);
1930
+ group_lengths.Append(current_len);
1931
+ group_thetas.Append(current_theta);
1932
+ }
1933
+
1934
+ ParallelFor(batch_group.Size(), [&](int i) {
1935
+ ProcessBatchRS(batch_group[i], group_lengths[i], group_thetas[i]);
1936
+ }, TasksPerThread(4));
1937
+ // */
1254
1938
 
1255
1939
 
1256
1940
  /*
@@ -1262,7 +1946,7 @@ namespace ngsbem
1262
1946
  cout << "reg " << i << ": " << RegularMLMultiPole::nodes_on_level[i] << endl;
1263
1947
  */
1264
1948
 
1265
- static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
1949
+ static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
1266
1950
  root.LocalizeExpansion(false);
1267
1951
  }
1268
1952
 
@@ -1296,6 +1980,7 @@ namespace ngsbem
1296
1980
 
1297
1981
  };
1298
1982
 
1983
+
1299
1984
  template <typename elem_type>
1300
1985
  inline ostream & operator<< (ostream & ost, const RegularMLMultiPole<elem_type> & mlmp)
1301
1986
  {