ngsolve 6.2.2505.post17.dev0__cp313-cp313-win_amd64.whl → 6.2.2505.post70.dev0__cp313-cp313-win_amd64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ngsolve might be problematic. Click here for more details.

Files changed (47)
  1. netgen/include/bilinearform.hpp +1 -1
  2. netgen/include/diffop_impl.hpp +3 -1
  3. netgen/include/gridfunction.hpp +1 -1
  4. netgen/include/mptools.hpp +553 -89
  5. netgen/include/ngblas.hpp +11 -0
  6. netgen/include/recursive_pol.hpp +63 -11
  7. netgen/include/sparsematrix_impl.hpp +25 -0
  8. netgen/include/vector.hpp +13 -1
  9. netgen/lib/libngsolve.lib +0 -0
  10. netgen/libngsolve.dll +0 -0
  11. ngsolve/cmake/NGSolveConfig.cmake +5 -5
  12. ngsolve/config/config.py +5 -5
  13. ngsolve/ngslib.pyd +0 -0
  14. {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post70.dev0.dist-info}/METADATA +2 -2
  15. {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post70.dev0.dist-info}/RECORD +47 -47
  16. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/Scripts/ngsolve.tcl +0 -0
  17. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/beam.geo +0 -0
  18. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/beam.vol +0 -0
  19. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/chip.in2d +0 -0
  20. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/chip.vol +0 -0
  21. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/coil.geo +0 -0
  22. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/coil.vol +0 -0
  23. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/coilshield.geo +0 -0
  24. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/coilshield.vol +0 -0
  25. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/cube.geo +0 -0
  26. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/cube.vol +0 -0
  27. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/d10_DGdoubleglazing.pde +0 -0
  28. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/d11_chip_nitsche.pde +0 -0
  29. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/d1_square.pde +0 -0
  30. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/d2_chip.pde +0 -0
  31. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/d3_helmholtz.pde +0 -0
  32. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/d4_cube.pde +0 -0
  33. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/d5_beam.pde +0 -0
  34. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/d6_shaft.pde +0 -0
  35. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/d7_coil.pde +0 -0
  36. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/d8_coilshield.pde +0 -0
  37. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/d9_hybridDG.pde +0 -0
  38. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/doubleglazing.in2d +0 -0
  39. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/doubleglazing.vol +0 -0
  40. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/piezo2d40round4.vol.gz +0 -0
  41. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/shaft.geo +0 -0
  42. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/shaft.vol +0 -0
  43. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/square.in2d +0 -0
  44. {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post70.dev0.data}/data/share/ngsolve/square.vol +0 -0
  45. {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post70.dev0.dist-info}/WHEEL +0 -0
  46. {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post70.dev0.dist-info}/licenses/LICENSE +0 -0
  47. {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post70.dev0.dist-info}/top_level.txt +0 -0
@@ -20,6 +20,26 @@ namespace ngsbem
20
20
  {
21
21
  using namespace ngfem;
22
22
 
23
+ template<typename T>
24
+ constexpr int VecLength = 1; // Default: Complex has length 1
25
+
26
+ template<int N>
27
+ constexpr int VecLength<Vec<N, Complex>> = N; // Specialization: Vec<N,Complex> has length N
28
+
29
+ inline std::tuple<double, double, double> SphericalCoordinates(Vec<3> dist){
30
+ double len, theta, phi;
31
+ len = L2Norm(dist);
32
+ if (len < 1e-30)
33
+ theta = 0;
34
+ else
35
+ theta = acos (dist(2) / len);
36
+ if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
37
+ phi = 0;
38
+ else
39
+ phi = atan2(dist(1), dist(0));
40
+ return {len, theta, phi};
41
+ }
42
+
23
43
 
24
44
  template <typename entry_type = Complex>
25
45
  class NGS_DLL_HEADER SphericalHarmonics
@@ -84,9 +104,69 @@ namespace ngsbem
84
104
 
85
105
  void Calc (Vec<3> x, FlatVector<Complex> shapes);
86
106
 
87
-
107
+
108
+ void FlipZ ();
88
109
  void RotateZ (double alpha);
89
- void RotateY (double alpha);
110
+
111
+ template <typename FUNC>
112
+ void RotateZ (double alpha, FUNC func) const
113
+ {
114
+ if (order < 0) return;
115
+
116
+ Vector<Complex> exp_imalpha(order+1);
117
+ Complex exp_ialpha(cos(alpha), sin(alpha));
118
+ Complex prod = 1.0;
119
+ for (int i = 0; i <= order; i++)
120
+ {
121
+ exp_imalpha(i) = prod;
122
+ prod *= exp_ialpha;
123
+ }
124
+
125
+ int ii = 0;
126
+ for (int n = 0; n <= order; n++)
127
+ {
128
+ for (int m = -n; m < 0; m++, ii++)
129
+ func(ii, conj(exp_imalpha(-m)));
130
+ for (int m = 0; m <= n; m++, ii++)
131
+ func(ii, exp_imalpha(m));
132
+ };
133
+ };
134
+
135
+ template <typename FUNC>
136
+ void RotateZFlip (double alpha, bool flip, FUNC func) const
137
+ {
138
+ if (order < 0) return;
139
+
140
+ Vector<Complex> exp_imalpha(order+1);
141
+ Complex exp_ialpha(cos(alpha), sin(alpha));
142
+ Complex prod = 1.0;
143
+ for (int i = 0; i <= order; i++)
144
+ {
145
+ exp_imalpha(i) = prod;
146
+ prod *= exp_ialpha;
147
+ }
148
+
149
+ int ii = 0;
150
+
151
+ auto FlipFactor = [] (int n, int m, bool flip)->double
152
+ {
153
+ if (flip)
154
+ return ((n-m)%2) == 1 ? -1 : 1;
155
+ return 1.0;
156
+ };
157
+
158
+ for (int n = 0; n <= order; n++)
159
+ {
160
+ for (int m = -n; m < 0; m++, ii++)
161
+ func(ii, FlipFactor(n,m,flip)*conj(exp_imalpha(-m)));
162
+ for (int m = 0; m <= n; m++, ii++)
163
+ func(ii, FlipFactor(n,m,flip)*exp_imalpha(m));
164
+ };
165
+ };
166
+
167
+
168
+
169
+ void RotateY (double alpha, bool parallel = false);
90
170
 
91
171
 
92
172
  static double CalcAmn (int m, int n)
@@ -119,11 +199,11 @@ namespace ngsbem
119
199
  // https://fortran-lang.discourse.group/t/looking-for-spherical-bessel-and-hankel-functions-of-first-and-second-kind-and-arbitrary-order/2308/2
120
200
  NGS_DLL_HEADER
121
201
  void besseljs3d (int nterms, double z, double scale,
122
- FlatVector<double> fjs, FlatVector<double> fjder);
202
+ SliceVector<double> fjs, SliceVector<double> fjder = FlatVector<double>(0, nullptr));
123
203
 
124
204
  NGS_DLL_HEADER
125
205
  void besseljs3d (int nterms, Complex z, double scale,
126
- FlatVector<Complex> fjs, FlatVector<Complex> fjder);
206
+ SliceVector<Complex> fjs, SliceVector<Complex> fjder = FlatVector<Complex>(0, nullptr));
127
207
 
128
208
 
129
209
  /*
@@ -142,14 +222,17 @@ namespace ngsbem
142
222
  FlatVector<double> jp,
143
223
  FlatVector<double> yp);
144
224
 
145
-
225
+
146
226
 
147
227
  template <typename T>
148
228
  void SphericalBessel (int n, double rho, double scale, T && values)
149
229
  {
230
+ besseljs3d (n, rho, scale, values);
231
+ /*
150
232
  Vector<double> j(n+1), jp(n+1);
151
233
  besseljs3d (n, rho, scale, j, jp);
152
234
  values = j;
235
+ */
153
236
  }
154
237
 
155
238
 
@@ -173,21 +256,6 @@ namespace ngsbem
173
256
  return;
174
257
  }
175
258
  Vector j(n+1), y(n+1), jp(n+1), yp(n+1);
176
- // SBESJY (rho, n, j, y, jp, yp);
177
-
178
- /*
179
- values = j + Complex(0,1) * y;
180
- if (scale != 1.0)
181
- {
182
- double prod = 1.0;
183
- for (int i = 0; i <= n; i++)
184
- {
185
- values(i) *= prod;
186
- prod *= scale;
187
- }
188
- }
189
- */
190
-
191
259
 
192
260
  // the bessel-evaluation with scale
193
261
  besseljs3d (n, rho, 1/scale, j, jp);
@@ -358,18 +426,7 @@ namespace ngsbem
358
426
  // static Timer t("mptool Transform "+ToString(typeid(RADIAL).name())+ToString(typeid(TARGET).name()));
359
427
  // RegionTimer reg(t);
360
428
 
361
- double len = L2Norm(dist);
362
- double theta, phi;
363
-
364
- if (len < 1e-30)
365
- theta = 0;
366
- else
367
- theta = acos (dist(2) / len);
368
-
369
- if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
370
- phi = 0;
371
- else
372
- phi = atan2(dist(1), dist(0));
429
+ auto [len, theta, phi] = SphericalCoordinates(dist);
373
430
 
374
431
 
375
432
  // MultiPole<RADIAL,entry_type> tmp{*this};
@@ -386,14 +443,18 @@ namespace ngsbem
386
443
  }
387
444
 
388
445
  template <typename TARGET>
389
- void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist) const
446
+ void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist, bool atomic = false) const
390
447
  {
391
448
  if (SH().Order() < 0) return;
392
449
  if (target.SH().Order() < 0) return;
393
450
 
394
451
  MultiPole<TARGET,entry_type> tmp{target};
395
452
  Transform(tmp, dist);
396
- target.SH().Coefs() += tmp.SH().Coefs();
453
+ if (!atomic)
454
+ target.SH().Coefs() += tmp.SH().Coefs();
455
+ else
456
+ for (int j = 0; j < target.SH().Coefs().Size(); j++)
457
+ AtomicAdd(target.SH().Coefs()[j], tmp.SH().Coefs()[j]);
397
458
  }
398
459
 
399
460
  template <typename TARGET>
@@ -412,11 +473,123 @@ namespace ngsbem
412
473
  static constexpr int maxdirect = 100;
413
474
 
414
475
 
476
+ template <typename SCAL, auto S>
477
+ inline auto VecVector2Matrix (FlatVector<Vec<S,SCAL>> vec)
478
+ {
479
+ return FlatMatrixFixWidth<S,SCAL> (vec.Size(), vec.Data()->Data());
480
+ }
481
+
482
+ inline auto VecVector2Matrix (FlatVector<Complex> vec)
483
+ {
484
+ return FlatMatrixFixWidth<1,Complex> (vec.Size(), vec.Data());
485
+ }
486
+
487
+
415
488
  template <typename entry_type=Complex>
416
489
  class SingularMLMultiPole
417
490
  {
418
491
  static Array<size_t> nodes_on_level;
419
492
 
493
+ struct RecordingSS
494
+ {
495
+ const MultiPole<MPSingular,entry_type> * mp_source;
496
+ MultiPole<MPSingular,entry_type> * mp_target;
497
+ Vec<3> dist;
498
+ double len, theta, phi;
499
+ bool flipz;
500
+ public:
501
+ RecordingSS() = default;
502
+ RecordingSS (const MultiPole<MPSingular,entry_type> * amp_source,
503
+ MultiPole<MPSingular,entry_type> * amp_target,
504
+ Vec<3> adist)
505
+ : mp_source(amp_source), mp_target(amp_target), dist(adist)
506
+ {
507
+ std::tie(len, theta, phi) = SphericalCoordinates(adist);
508
+ // flipz = false;
509
+ flipz = theta > M_PI/2;
510
+ if (flipz) theta = M_PI-theta;
511
+ }
512
+ };
513
+
514
+
515
+ static void ProcessBatch(FlatArray<RecordingSS*> batch, double len, double theta) {
516
+ constexpr int vec_length = VecLength<entry_type>;
517
+ int batch_size = batch.Size();
518
+ int N = batch_size * vec_length;
519
+ // *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(entry_type).name() << ", len = " << len << ", theta = " << theta << endl;
520
+
521
+ if (N <= 1 || batch_size <= 1) {
522
+ for (auto* rec : batch) {
523
+ rec->mp_source->TransformAdd(*rec->mp_target, rec->dist, true);
524
+ }
525
+ }
526
+ else if (N <= 3) {
527
+ ProcessVectorizedBatch<3, vec_length>(batch, len, theta);
528
+ }
529
+ else if (N <= 4) {
530
+ ProcessVectorizedBatch<4, vec_length>(batch, len, theta);
531
+ }
532
+ else if (N <= 6) {
533
+ ProcessVectorizedBatch<6, vec_length>(batch, len, theta);
534
+ }
535
+ else if (N <= 12) {
536
+ ProcessVectorizedBatch<12, vec_length>(batch, len, theta);
537
+ }
538
+ else if (N <= 24) {
539
+ ProcessVectorizedBatch<24, vec_length>(batch, len, theta);
540
+ }
541
+ else if (N <= 48) {
542
+ ProcessVectorizedBatch<48, vec_length>(batch, len, theta);
543
+ }
544
+ else if (N <= 96) {
545
+ ProcessVectorizedBatch<96, vec_length>(batch, len, theta);
546
+ }
547
+ else if (N <= 192) {
548
+ ProcessVectorizedBatch<192, vec_length>(batch, len, theta);
549
+ }
550
+ else {
551
+ // Split large batches
552
+ ProcessBatch(batch.Range(0, 192 / vec_length), len, theta);
553
+ ProcessBatch(batch.Range(192 / vec_length, batch_size), len, theta);
554
+ }
555
+ }
556
+
557
+ template<int N, int vec_length>
558
+ static void ProcessVectorizedBatch(FlatArray<RecordingSS*> batch, double len, double theta) {
559
+
560
+ // *testout << "Processing vectorized S->S batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
561
+ MultiPole<MPSingular, Vec<N,Complex>> vec_source(batch[0]->mp_source->Order(), batch[0]->mp_source->Kappa(), batch[0]->mp_source->RTyp());
562
+ MultiPole<MPSingular, Vec<N,Complex>> vec_target(batch[0]->mp_target->Order(), batch[0]->mp_target->Kappa(), batch[0]->mp_target->RTyp());
563
+
564
+ // Copy multipoles into vectorized multipole
565
+ for (int i = 0; i < batch.Size(); i++)
566
+ {
567
+ auto source_i = VecVector2Matrix (batch[i]->mp_source->SH().Coefs());
568
+ auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
569
+ batch[i]->mp_source->SH().RotateZFlip(batch[i]->phi, batch[i]->flipz,
570
+ [source_i, source_mati] (size_t ii, Complex factor)
571
+ {
572
+ source_mati.Row(ii) = factor * source_i.Row(ii);
573
+ });
574
+ }
575
+
576
+ vec_source.SH().RotateY(theta, vec_source.SH().Order() >= 100);
577
+ vec_source.ShiftZ(-len, vec_target);
578
+ vec_target.SH().RotateY(-theta, vec_target.SH().Order() >= 100);
579
+
580
+ // Copy vectorized multipole into individual multipoles
581
+ for (int i = 0; i < batch.Size(); i++)
582
+ {
583
+ auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
584
+ auto target_mati = VecVector2Matrix (batch[i]->mp_target->SH().Coefs());
585
+ batch[i]->mp_target->SH().RotateZFlip(-batch[i]->phi, batch[i]->flipz,
586
+ [source_mati, target_mati] (size_t ii, Complex factor)
587
+ {
588
+ AtomicAdd (target_mati.Row(ii), factor * source_mati.Row(ii));
589
+ });
590
+ }
591
+ }
592
+
420
593
  struct Node
421
594
  {
422
595
  Vec<3> center;
@@ -429,6 +602,8 @@ namespace ngsbem
429
602
  Array<tuple<Vec<3>, Vec<3>, entry_type>> dipoles;
430
603
  Array<tuple<Vec<3>, Vec<3>, Complex,int>> currents;
431
604
  int total_sources;
605
+ std::mutex node_mutex;
606
+ atomic<bool> have_childs{false};
432
607
 
433
608
  Node (Vec<3> acenter, double ar, int alevel, double akappa)
434
609
  : center(acenter), r(ar), level(alevel), mp(MPOrder(ar*akappa), akappa, ar) // min(1.0, ar*akappa))
@@ -449,12 +624,13 @@ namespace ngsbem
449
624
  cc(2) += (i&4) ? r/2 : -r/2;
450
625
  childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
451
626
  }
627
+ have_childs = true;
452
628
  }
453
629
 
454
630
 
455
631
  void AddCharge (Vec<3> x, entry_type c)
456
632
  {
457
- if (childs[0])
633
+ if (have_childs) // quick check without locking
458
634
  {
459
635
  // directly send to childs:
460
636
  int childnum = 0;
@@ -465,6 +641,21 @@ namespace ngsbem
465
641
  return;
466
642
  }
467
643
 
644
+ lock_guard<mutex> guard(node_mutex);
645
+
646
+ if (have_childs) // test again after locking
647
+ {
648
+ // directly send to childs:
649
+ int childnum = 0;
650
+ if (x(0) > center(0)) childnum += 1;
651
+ if (x(1) > center(1)) childnum += 2;
652
+ if (x(2) > center(2)) childnum += 4;
653
+ childs[childnum] -> AddCharge(x, c);
654
+ return;
655
+ }
656
+
657
+
658
+
468
659
  charges.Append( tuple{x,c} );
469
660
 
470
661
  // if (r*mp.Kappa() < 1e-8) return;
@@ -489,7 +680,21 @@ namespace ngsbem
489
680
 
490
681
  void AddDipole (Vec<3> x, Vec<3> d, entry_type c)
491
682
  {
492
- if (childs[0])
683
+ if (have_childs)
684
+ {
685
+ // directly send to childs:
686
+
687
+ int childnum = 0;
688
+ if (x(0) > center(0)) childnum += 1;
689
+ if (x(1) > center(1)) childnum += 2;
690
+ if (x(2) > center(2)) childnum += 4;
691
+ childs[childnum] -> AddDipole(x, d, c);
692
+ return;
693
+ }
694
+
695
+ lock_guard<mutex> guard(node_mutex);
696
+
697
+ if (have_childs)
493
698
  {
494
699
  // directly send to childs:
495
700
 
@@ -501,6 +706,9 @@ namespace ngsbem
501
706
  return;
502
707
  }
503
708
 
709
+
710
+
711
+
504
712
  dipoles.Append (tuple{x,d,c});
505
713
 
506
714
  if (dipoles.Size() < maxdirect || r < 1e-8)
@@ -520,6 +728,7 @@ namespace ngsbem
520
728
  currents.SetSize0();
521
729
  }
522
730
 
731
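+ // not parallel yet: unlike AddCharge/AddDipole, this still tests childs[0] (not have_childs) and appends to currents without locking node_mutex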
523
732
  void AddCurrent (Vec<3> sp, Vec<3> ep, Complex j, int num)
524
733
  {
525
734
  if (childs[0])
@@ -549,7 +758,7 @@ namespace ngsbem
549
758
  }
550
759
  return;
551
760
  }
552
-
761
+
553
762
  currents.Append (tuple{sp,ep,j,num});
554
763
 
555
764
  // if (currents.Size() < maxdirect || r < 1e-8)
@@ -664,23 +873,27 @@ namespace ngsbem
664
873
  }
665
874
  }
666
875
 
667
- void CalcMP()
876
+ void CalcMP(Array<RecordingSS> * recording, Array<Node*> * nodes_to_process)
668
877
  {
669
- mp.SH().Coefs() = 0.0;
878
+ // mp.SH().Coefs() = 0.0;
670
879
  if (childs[0])
671
880
  {
672
- if (total_sources < 1000)
881
+ if (total_sources < 1000 || recording)
673
882
  for (auto & child : childs)
674
- child->CalcMP();
883
+ child->CalcMP(recording, nodes_to_process);
675
884
  else
676
885
  ParallelFor (8, [&] (int nr)
677
886
  {
678
- childs[nr] -> CalcMP();
887
+ childs[nr] -> CalcMP(recording, nodes_to_process);
679
888
  });
680
889
 
681
890
 
682
- for (auto & child : childs)
683
- child->mp.TransformAdd(mp, center-child->center);
891
+ for (auto & child : childs){
892
+ if (recording && child->mp.SH().Coefs().Size() > 0)
893
+ *recording += RecordingSS(&child->mp, &mp, center-child->center);
894
+ else
895
+ child->mp.TransformAdd(mp, center-child->center);
896
+ }
684
897
  }
685
898
  else
686
899
  {
@@ -690,14 +903,18 @@ namespace ngsbem
690
903
  return;
691
904
  }
692
905
 
693
- for (auto [x,c] : charges)
694
- mp.AddCharge (x-center,c);
695
-
696
- for (auto [x,d,c] : dipoles)
697
- mp.AddDipole (x-center, d, c);
906
+ if (nodes_to_process)
907
+ *nodes_to_process += this;
908
+ else {
909
+ for (auto [x,c] : charges)
910
+ mp.AddCharge (x-center,c);
911
+
912
+ for (auto [x,d,c] : dipoles)
913
+ mp.AddDipole (x-center, d, c);
698
914
 
699
- for (auto [sp,ep,j,num] : currents)
700
- mp.AddCurrent (sp-center, ep-center, j, num);
915
+ for (auto [sp,ep,j,num] : currents)
916
+ mp.AddCurrent (sp-center, ep-center, j, num);
917
+ }
701
918
  }
702
919
  }
703
920
 
@@ -836,6 +1053,10 @@ namespace ngsbem
836
1053
  void CalcMP()
837
1054
  {
838
1055
  static Timer t("mptool compute singular MLMP"); RegionTimer rg(t);
1056
+ static Timer ts2mp("mptool compute singular MLMP - source2mp");
1057
+ static Timer tS2S("mptool compute singular MLMP - S->S");
1058
+ static Timer trec("mptool comput singular recording");
1059
+ static Timer tsort("mptool comput singular sort");
839
1060
 
840
1061
  /*
841
1062
  int maxlevel = 0;
@@ -847,7 +1068,87 @@ namespace ngsbem
847
1068
  */
848
1069
 
849
1070
  root.CalcTotalSources();
850
- root.CalcMP();
1071
+
1072
+ if (false)
1073
+ // direct evaluation of S->S
1074
+ root.CalcMP(nullptr, nullptr);
1075
+ else
1076
+ {
1077
+
1078
+ Array<RecordingSS> recording;
1079
+ Array<Node*> nodes_to_process;
1080
+
1081
+ {
1082
+ RegionTimer reg(trec);
1083
+ root.CalcMP(&recording, &nodes_to_process);
1084
+ }
1085
+
1086
+ {
1087
+ RegionTimer rs2mp(ts2mp);
1088
+ ParallelFor(nodes_to_process.Size(), [&](int i){
1089
+ auto node = nodes_to_process[i];
1090
+ for (auto [x,c]: node->charges)
1091
+ node->mp.AddCharge(x-node->center, c);
1092
+ for (auto [x,d,c]: node->dipoles)
1093
+ node->mp.AddDipole(x-node->center, d, c);
1094
+ for (auto [sp,ep,j,num]: node->currents)
1095
+ node->mp.AddCurrent(sp-node->center, ep-node->center, j, num);
1096
+ }, TasksPerThread(4));
1097
+ }
1098
+
1099
+ {
1100
+ RegionTimer reg(tsort);
1101
+ QuickSort (recording, [] (auto & a, auto & b)
1102
+ {
1103
+ if (a.len < (1-1e-8) * b.len) return true;
1104
+ if (a.len > (1+1e-8) * b.len) return false;
1105
+ return a.theta < b.theta;
1106
+ });
1107
+ }
1108
+
1109
+ double current_len = -1e100;
1110
+ double current_theta = -1e100;
1111
+ Array<RecordingSS*> current_batch;
1112
+ Array<Array<RecordingSS*>> batch_group;
1113
+ Array<double> group_lengths;
1114
+ Array<double> group_thetas;
1115
+ for (auto & record : recording)
1116
+ {
1117
+ bool len_changed = fabs(record.len - current_len) > 1e-8;
1118
+ bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
1119
+ if ((len_changed || theta_changed) && current_batch.Size() > 0) {
1120
+ batch_group.Append(current_batch);
1121
+ group_lengths.Append(current_len);
1122
+ group_thetas.Append(current_theta);
1123
+ current_batch.SetSize(0);
1124
+ }
1125
+
1126
+ current_len = record.len;
1127
+ current_theta = record.theta;
1128
+ current_batch.Append(&record);
1129
+ }
1130
+ if (current_batch.Size() > 0) {
1131
+ batch_group.Append(current_batch);
1132
+ group_lengths.Append(current_len);
1133
+ group_thetas.Append(current_theta);
1134
+ }
1135
+
1136
+ {
1137
+ RegionTimer rS2S(tS2S);
1138
+ // ParallelFor(batch_group.Size(), [&](int i) {
1139
+ for (int i = 0; i < batch_group.Size(); i++){
1140
+ // *testout << "Processing batch " << i << " of size " << batch_group[i].Size() << ", with len = " << group_lengths[i] << ", theta = " << group_thetas[i] << endl;
1141
+ int chunk_size = 24;
1142
+ if (batch_group[i].Size() < chunk_size)
1143
+ ProcessBatch(batch_group[i], group_lengths[i], group_thetas[i]);
1144
+ else
1145
+ ParallelForRange(IntRange(batch_group[i].Size()), [&](IntRange range) {
1146
+ auto sub_batch = batch_group[i].Range(range.First(), range.Next());
1147
+ ProcessBatch(sub_batch, group_lengths[i], group_thetas[i]);
1148
+ }, TasksPerThread(4));
1149
+ }
1150
+ }
1151
+ }
851
1152
 
852
1153
  havemp = true;
853
1154
  }
@@ -892,19 +1193,128 @@ namespace ngsbem
892
1193
  Vec<3> adist)
893
1194
  : mpS(ampS), mpR(ampR), dist(adist)
894
1195
  {
895
- len = L2Norm(dist);
896
- if (len < 1e-30)
897
- theta = 0;
898
- else
899
- theta = acos (dist(2) / len);
900
-
901
- if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
902
- phi = 0;
903
- else
904
- phi = atan2(dist(1), dist(0));
1196
+ std::tie(len, theta, phi) = SphericalCoordinates(dist);
905
1197
  }
906
1198
  };
907
1199
 
1200
+ static void ProcessBatchRS(FlatArray<RecordingRS*> batch, double len, double theta) {
1201
+ // static Timer t("ProcessBatchRS"); RegionTimer reg(t, batch.Size());
1202
+ constexpr int vec_length = VecLength<elem_type>;
1203
+ int batch_size = batch.Size();
1204
+ int N = batch_size * vec_length;
1205
+ // *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(elem_type).name() << ", len = " << len << ", theta = " << theta << endl;
1206
+
1207
+ if (N <= 1 || batch_size <= 1) {
1208
+ for (auto* rec : batch) {
1209
+ rec->mpS->TransformAdd(*rec->mpR, rec->dist);
1210
+ }
1211
+ }
1212
+ else if (N <= 3) {
1213
+ ProcessVectorizedBatch<3, vec_length>(batch, len, theta);
1214
+ }
1215
+ else if (N <= 4) {
1216
+ ProcessVectorizedBatch<4, vec_length>(batch, len, theta);
1217
+ }
1218
+ else if (N <= 6) {
1219
+ ProcessVectorizedBatch<6, vec_length>(batch, len, theta);
1220
+ }
1221
+ else if (N <= 12) {
1222
+ ProcessVectorizedBatch<12, vec_length>(batch, len, theta);
1223
+ }
1224
+ else if (N <= 24) {
1225
+ ProcessVectorizedBatch<24, vec_length>(batch, len, theta);
1226
+ }
1227
+ else if (N <= 48) {
1228
+ ProcessVectorizedBatch<48, vec_length>(batch, len, theta);
1229
+ }
1230
+ else if (N <= 96) {
1231
+ ProcessVectorizedBatch<96, vec_length>(batch, len, theta);
1232
+ }
1233
+ else if (N <= 192) {
1234
+ ProcessVectorizedBatch<192, vec_length>(batch, len, theta);
1235
+ }
1236
+ else {
1237
+ // Split large batches
1238
+ /*
1239
+ ProcessBatch(batch.Range(0, 192 / vec_length), len, theta);
1240
+ ProcessBatch(batch.Range(192 / vec_length, batch_size), len, theta);
1241
+ */
1242
+
1243
+ /*
1244
+ ParallelFor (2, [&] (int i)
1245
+ {
1246
+ if (i == 0)
1247
+ ProcessBatchRS(batch.Range(0, 192 / vec_length), len, theta);
1248
+ else
1249
+ ProcessBatchRS(batch.Range(192 / vec_length, batch_size), len, theta);
1250
+ }, 2);
1251
+ */
1252
+
1253
+
1254
+ size_t chunksize = 192/vec_length;
1255
+ size_t num = (batch.Size()+chunksize-1) / chunksize;
1256
+ ParallelFor (num, [&](int i)
1257
+ {
1258
+ ProcessBatchRS(batch.Range(i*chunksize, min((i+1)*chunksize, batch.Size())), len, theta);
1259
+ }, num);
1260
+
1261
+ }
1262
+ }
1263
+
1264
+
1265
+ template<int N, int vec_length>
1266
+ static void ProcessVectorizedBatch(FlatArray<RecordingRS*> batch, double len, double theta) {
1267
+
1268
+ // static Timer t("ProcessVectorizedBatch, N = "+ToString(N) + ", vec_len = " + ToString(vec_length));
1269
+ // RegionTimer reg(t, batch[0]->mpS->SH().Order());
1270
+ // static Timer ttobatch("mptools - copy to batch 2");
1271
+ // static Timer tfrombatch("mptools - copy from batch 2");
1272
+
1273
+ // *testout << "Processing vectorized batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
1274
+ MultiPole<MPSingular, Vec<N,Complex>> vec_source(batch[0]->mpS->Order(), batch[0]->mpS->Kappa(), batch[0]->mpS->RTyp());
1275
+ // MultiPole<MPSingular, elem_type> tmp_source{*batch[0]->mpS};
1276
+ MultiPole<MPRegular, elem_type> tmp_target{*batch[0]->mpR};
1277
+ MultiPole<MPRegular, Vec<N,Complex>> vec_target(batch[0]->mpR->Order(), batch[0]->mpR->Kappa(), batch[0]->mpR->RTyp());
1278
+
1279
+ // Copy multipoles into vectorized multipole
1280
+ // ttobatch.Start();
1281
+ for (int i = 0; i < batch.Size(); i++)
1282
+ {
1283
+ auto source_i = VecVector2Matrix (batch[i]->mpS->SH().Coefs());
1284
+ auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
1285
+ batch[i]->mpS->SH().RotateZ(batch[i]->phi,
1286
+ [source_i, source_mati] (size_t ii, Complex factor)
1287
+ {
1288
+ source_mati.Row(ii) = factor * source_i.Row(ii);
1289
+ });
1290
+ }
1291
+
1292
+ // ttobatch.Stop();
1293
+
1294
+ vec_source.SH().RotateY(theta);
1295
+ vec_source.ShiftZ(-len, vec_target);
1296
+ vec_target.SH().RotateY(-theta);
1297
+
1298
+ // Copy vectorized multipole into individual multipoles
1299
+ // tfrombatch.Start();
1300
+ for (int i = 0; i < batch.Size(); i++) {
1301
+ // auto source_i = VecVector2Matrix (tmp_target.SH().Coefs());
1302
+ auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
1303
+ auto targeti = VecVector2Matrix(batch[i]->mpR->SH().Coefs());
1304
+
1305
+ tmp_target.SH().RotateZ(-batch[i]->phi,
1306
+ [source_mati, targeti] (size_t ii, Complex factor)
1307
+ {
1308
+ // source_i.Row(ii) = factor * source_mati.Row(ii);
1309
+ AtomicAdd (VectorView(targeti.Row(ii)), factor * source_mati.Row(ii));
1310
+ });
1311
+ // for (int j = 0; j < tmp_target.SH().Coefs().Size(); j++)
1312
+ // AtomicAdd(batch[i]->mpR->SH().Coefs()[j], tmp_target.SH().Coefs()[j]);
1313
+ }
1314
+ // tfrombatch.Stop();
1315
+
1316
+ }
1317
+
908
1318
 
909
1319
  struct Node
910
1320
  {
@@ -915,6 +1325,8 @@ namespace ngsbem
915
1325
  MultiPole<MPRegular,elem_type> mp;
916
1326
  Array<Vec<3>> targets;
917
1327
  int total_targets;
1328
+ std::mutex node_mutex;
1329
+ atomic<bool> have_childs{false};
918
1330
 
919
1331
  Array<const typename SingularMLMultiPole<elem_type>::Node*> singnodes;
920
1332
 
@@ -939,6 +1351,7 @@ namespace ngsbem
939
1351
  cc(2) += (i&4) ? r/2 : -r/2;
940
1352
  childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
941
1353
  }
1354
+ have_childs = true;
942
1355
  }
943
1356
 
944
1357
  void AddSingularNode (const typename SingularMLMultiPole<elem_type>::Node & singnode, bool allow_refine,
@@ -946,7 +1359,7 @@ namespace ngsbem
946
1359
  {
947
1360
  if (mp.SH().Order() < 0) return;
948
1361
  if (singnode.mp.SH().Order() < 0) return;
949
- if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
1362
+ // if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
950
1363
  if (level > 20)
951
1364
  {
952
1365
  singnodes.Append(&singnode);
@@ -1028,12 +1441,22 @@ namespace ngsbem
1028
1441
 
1029
1442
  if (childs[0])
1030
1443
  {
1031
- for (auto & ch : childs)
1444
+ if (total_targets < 1000)
1032
1445
  {
1033
- if (L2Norm(mp.SH().Coefs()) > 0)
1034
- mp.TransformAdd (ch->mp, ch->center-center);
1035
- ch->LocalizeExpansion(allow_refine);
1446
+ for (int nr = 0; nr < 8; nr++)
1447
+ {
1448
+ if (L2Norm(mp.SH().Coefs()) > 0)
1449
+ mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
1450
+ childs[nr]->LocalizeExpansion(allow_refine);
1451
+ }
1036
1452
  }
1453
+ else
1454
+ ParallelFor(8, [&] (int nr)
1455
+ {
1456
+ if (L2Norm(mp.SH().Coefs()) > 0)
1457
+ mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
1458
+ childs[nr]->LocalizeExpansion(allow_refine);
1459
+ });
1037
1460
  mp = MultiPole<MPRegular,elem_type>(-1, mp.Kappa(), 1.);
1038
1461
  //mp.SH().Coefs()=0.0;
1039
1462
  }
@@ -1041,18 +1464,8 @@ namespace ngsbem
1041
1464
 
1042
1465
  elem_type Evaluate (Vec<3> p) const
1043
1466
  {
1044
- // *testout << "eval p = " << p << ", level = " << level << ", center = " << center << ", r = " << r << endl;
1045
1467
  elem_type sum{0.0};
1046
- /*
1047
- if (childs[0])
1048
- {
1049
- int childnum = 0;
1050
- if (p(0) > center(0)) childnum += 1;
1051
- if (p(1) > center(1)) childnum += 2;
1052
- if (p(2) > center(2)) childnum += 4;
1053
- sum = childs[childnum]->Evaluate(p);
1054
- }
1055
- */
1468
+
1056
1469
  int childnum = 0;
1057
1470
  if (p(0) > center(0)) childnum += 1;
1058
1471
  if (p(1) > center(1)) childnum += 2;
@@ -1062,8 +1475,6 @@ namespace ngsbem
1062
1475
  else
1063
1476
  sum = mp.Eval(p-center);
1064
1477
 
1065
-
1066
- // static Timer t("mptool direct evaluate"); RegionTimer r(t);
1067
1478
  for (auto sn : singnodes)
1068
1479
  sum += sn->EvaluateMP(p);
1069
1480
 
@@ -1112,7 +1523,8 @@ namespace ngsbem
1112
1523
 
1113
1524
  void AddTarget (Vec<3> x)
1114
1525
  {
1115
- if (childs[0])
1526
+ // if (childs[0])
1527
+ if (have_childs) // quick check without locking
1116
1528
  {
1117
1529
  // directly send to childs:
1118
1530
  int childnum = 0;
@@ -1123,6 +1535,20 @@ namespace ngsbem
1123
1535
  return;
1124
1536
  }
1125
1537
 
1538
+ lock_guard<mutex> guard(node_mutex);
1539
+
1540
+ if (have_childs) // test again after locking
1541
+ {
1542
+ // directly send to childs:
1543
+ int childnum = 0;
1544
+ if (x(0) > center(0)) childnum += 1;
1545
+ if (x(1) > center(1)) childnum += 2;
1546
+ if (x(2) > center(2)) childnum += 4;
1547
+ childs[childnum] -> AddTarget(x);
1548
+ return;
1549
+ }
1550
+
1551
+
1126
1552
  targets.Append( x );
1127
1553
 
1128
1554
  // if (r*mp.Kappa() < 1e-8) return;
@@ -1227,6 +1653,8 @@ namespace ngsbem
1227
1653
  void CalcMP(shared_ptr<SingularMLMultiPole<elem_type>> asingmp)
1228
1654
  {
1229
1655
  static Timer t("mptool regular MLMP"); RegionTimer rg(t);
1656
+ static Timer trec("mptool regular MLMP - recording");
1657
+ static Timer tsort("mptool regular MLMP - sort");
1230
1658
 
1231
1659
  singmp = asingmp;
1232
1660
 
@@ -1234,23 +1662,58 @@ namespace ngsbem
1234
1662
  root.RemoveEmptyTrees();
1235
1663
 
1236
1664
 
1237
- root.AddSingularNode(singmp->root, false, nullptr);
1238
- /*
1239
- Array<RecordingRS> recording;
1240
- root.AddSingularNode(singmp->root, false, &recording);
1665
+ // root.AddSingularNode(singmp->root, false, nullptr);
1666
+ // /*
1667
+ Array<RecordingRS> recording;
1668
+ {
1669
+ RegionTimer rrec(trec);
1670
+ root.AddSingularNode(singmp->root, false, &recording);
1671
+ }
1672
+
1241
1673
  // cout << "recorded: " << recording.Size() << endl;
1674
+ {
1675
+ RegionTimer reg(tsort);
1242
1676
  QuickSort (recording, [] (auto & a, auto & b)
1243
1677
  {
1244
1678
  if (a.len < (1-1e-8) * b.len) return true;
1245
1679
  if (a.len > (1+1e-8) * b.len) return false;
1246
1680
  return a.theta < b.theta;
1247
1681
  });
1682
+ }
1683
+
1684
+ double current_len = -1e100;
1685
+ double current_theta = -1e100;
1686
+ Array<RecordingRS*> current_batch;
1687
+ Array<Array<RecordingRS*>> batch_group;
1688
+ Array<double> group_lengths;
1689
+ Array<double> group_thetas;
1248
1690
  for (auto & record : recording)
1249
1691
  {
1250
- record.mpS->TransformAdd(*record.mpR, record.dist);
1251
- // *testout << record.len << ", " << record.theta << ", " << record.phi << endl;
1692
+ bool len_changed = fabs(record.len - current_len) > 1e-8;
1693
+ bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
1694
+ if ((len_changed || theta_changed) && current_batch.Size() > 0) {
1695
+ // ProcessBatch(current_batch, current_len, current_theta);
1696
+ batch_group.Append(current_batch);
1697
+ group_lengths.Append(current_len);
1698
+ group_thetas.Append(current_theta);
1699
+ current_batch.SetSize(0);
1700
+ }
1701
+
1702
+ current_len = record.len;
1703
+ current_theta = record.theta;
1704
+ current_batch.Append(&record);
1252
1705
  }
1253
- */
1706
+ if (current_batch.Size() > 0) {
1707
+ // ProcessBatch(current_batch, current_len, current_theta);
1708
+ batch_group.Append(current_batch);
1709
+ group_lengths.Append(current_len);
1710
+ group_thetas.Append(current_theta);
1711
+ }
1712
+
1713
+ ParallelFor(batch_group.Size(), [&](int i) {
1714
+ ProcessBatchRS(batch_group[i], group_lengths[i], group_thetas[i]);
1715
+ }, TasksPerThread(4));
1716
+ // */
1254
1717
 
1255
1718
 
1256
1719
  /*
@@ -1262,7 +1725,7 @@ namespace ngsbem
1262
1725
  cout << "reg " << i << ": " << RegularMLMultiPole::nodes_on_level[i] << endl;
1263
1726
  */
1264
1727
 
1265
- static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
1728
+ static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
1266
1729
  root.LocalizeExpansion(false);
1267
1730
  }
1268
1731
 
@@ -1296,6 +1759,7 @@ namespace ngsbem
1296
1759
 
1297
1760
  };
1298
1761
 
1762
+
1299
1763
  template <typename elem_type>
1300
1764
  inline ostream & operator<< (ostream & ost, const RegularMLMultiPole<elem_type> & mlmp)
1301
1765
  {