ngsolve 6.2.2505.post17.dev0__cp311-cp311-macosx_10_15_universal2.whl → 6.2.2505.post95.dev0__cp311-cp311-macosx_10_15_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ngsolve might be problematic. Click here for more details.
- netgen/include/bilinearform.hpp +1 -1
- netgen/include/diffop_impl.hpp +3 -1
- netgen/include/gridfunction.hpp +1 -1
- netgen/include/hcurlcurlfe.hpp +20 -0
- netgen/include/mptools.hpp +786 -101
- netgen/include/ngblas.hpp +11 -0
- netgen/include/recursive_pol.hpp +63 -11
- netgen/include/simd_complex.hpp +20 -0
- netgen/include/sparsematrix_impl.hpp +25 -0
- netgen/include/vector.hpp +15 -2
- netgen/libngbla.dylib +0 -0
- netgen/libngcomp.dylib +0 -0
- netgen/libngfem.dylib +0 -0
- netgen/libngla.dylib +0 -0
- netgen/libngsbem.dylib +0 -0
- netgen/libngstd.dylib +0 -0
- ngsolve/cmake/NGSolveConfig.cmake +1 -1
- ngsolve/config/config.py +5 -5
- {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post95.dev0.dist-info}/METADATA +2 -2
- {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post95.dev0.dist-info}/RECORD +56 -56
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/Netgen.icns +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/bin/ngscxx +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/bin/ngsld +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/bin/ngsolve.tcl +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/bin/ngspy +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/beam.geo +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/beam.vol +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/chip.in2d +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/chip.vol +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/coil.geo +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/coil.vol +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/coilshield.geo +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/coilshield.vol +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/cube.geo +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/cube.vol +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d10_DGdoubleglazing.pde +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d11_chip_nitsche.pde +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d1_square.pde +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d2_chip.pde +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d3_helmholtz.pde +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d4_cube.pde +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d5_beam.pde +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d6_shaft.pde +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d7_coil.pde +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d8_coilshield.pde +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/d9_hybridDG.pde +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/doubleglazing.in2d +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/doubleglazing.vol +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/piezo2d40round4.vol.gz +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/shaft.geo +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/shaft.vol +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/square.in2d +0 -0
- {ngsolve-6.2.2505.post17.dev0.data → ngsolve-6.2.2505.post95.dev0.data}/data/share/ngsolve/square.vol +0 -0
- {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post95.dev0.dist-info}/LICENSE +0 -0
- {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post95.dev0.dist-info}/WHEEL +0 -0
- {ngsolve-6.2.2505.post17.dev0.dist-info → ngsolve-6.2.2505.post95.dev0.dist-info}/top_level.txt +0 -0
netgen/include/mptools.hpp
CHANGED
|
@@ -20,6 +20,157 @@ namespace ngsbem
|
|
|
20
20
|
{
|
|
21
21
|
using namespace ngfem;
|
|
22
22
|
|
|
23
|
+
template<typename T>
|
|
24
|
+
constexpr int VecLength = 1; // Default: Complex has length 1
|
|
25
|
+
|
|
26
|
+
template<int N>
|
|
27
|
+
constexpr int VecLength<Vec<N, Complex>> = N; // Specialization: Vec<N,Complex> has length N
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
constexpr int FMM_SW = 4;
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
// ************************ SIMD - creation (should end up in simd.hpp) *************
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
template <int S, typename T, int SW>
|
|
38
|
+
Vec<S,T> HSum (Vec<S,SIMD<T,SW>> v)
|
|
39
|
+
{
|
|
40
|
+
Vec<S,T> res;
|
|
41
|
+
for (int i = 0; i < S; i++)
|
|
42
|
+
res(i) = HSum(v(i));
|
|
43
|
+
// Iterate<S> ([&](auto i) {
|
|
44
|
+
// res.HTData().template Elem<i.value>() = HSum(v.HTData().template Elem<i.value>());
|
|
45
|
+
// });
|
|
46
|
+
return res;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
template <typename T, size_t S> class MakeSimdCl;
|
|
51
|
+
|
|
52
|
+
template <typename T, size_t S>
|
|
53
|
+
auto MakeSimd (array<T,S> aa) { return MakeSimdCl(aa).Get(); }
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
template <typename T, size_t S>
|
|
57
|
+
class MakeSimdCl
|
|
58
|
+
{
|
|
59
|
+
array<T,S> a;
|
|
60
|
+
public:
|
|
61
|
+
MakeSimdCl (array<T,S> aa) : a(aa) { ; }
|
|
62
|
+
auto Get() const
|
|
63
|
+
{
|
|
64
|
+
SIMD<T,S> sa( [this] (auto i) { return (this->a)[i]; });
|
|
65
|
+
return sa;
|
|
66
|
+
}
|
|
67
|
+
};
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
template <typename T, size_t S, int VS>
|
|
71
|
+
class MakeSimdCl<Vec<VS,T>,S>
|
|
72
|
+
{
|
|
73
|
+
array<Vec<VS,T>,S> a;
|
|
74
|
+
public:
|
|
75
|
+
MakeSimdCl (array<Vec<VS,T>,S> aa) : a(aa) { ; }
|
|
76
|
+
|
|
77
|
+
auto Get() const
|
|
78
|
+
{
|
|
79
|
+
array<T,S> ai;
|
|
80
|
+
Vec<VS, decltype(MakeSimd(ai))> res;
|
|
81
|
+
for (int i = 0; i < VS; i++)
|
|
82
|
+
{
|
|
83
|
+
for (int j = 0; j < S; j++)
|
|
84
|
+
ai[j] = a[j](i);
|
|
85
|
+
res(i) = MakeSimd(ai);
|
|
86
|
+
}
|
|
87
|
+
return res;
|
|
88
|
+
}
|
|
89
|
+
};
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
template <size_t S>
|
|
94
|
+
class MakeSimdCl<Complex,S>
|
|
95
|
+
{
|
|
96
|
+
array<Complex,S> a;
|
|
97
|
+
public:
|
|
98
|
+
MakeSimdCl (array<Complex,S> aa) : a(aa) { ; }
|
|
99
|
+
auto Get() const
|
|
100
|
+
{
|
|
101
|
+
array<double,S> ar, ai;
|
|
102
|
+
for (int j = 0; j < S; j++)
|
|
103
|
+
{
|
|
104
|
+
ar[j] = Real(a[j]);
|
|
105
|
+
ai[j] = Imag(a[j]);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
return SIMD<Complex,S> (MakeSimd(ar), MakeSimd(ai));
|
|
109
|
+
}
|
|
110
|
+
};
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
template <typename Tfirst, size_t S, typename ...Trest>
|
|
118
|
+
class MakeSimdCl<std::tuple<Tfirst,Trest...>,S>
|
|
119
|
+
{
|
|
120
|
+
array<std::tuple<Tfirst,Trest...>,S> a;
|
|
121
|
+
public:
|
|
122
|
+
MakeSimdCl (array<std::tuple<Tfirst,Trest...>,S> aa) : a(aa) { ; }
|
|
123
|
+
auto Get() const
|
|
124
|
+
{
|
|
125
|
+
array<Tfirst,S> a0;
|
|
126
|
+
for (int i = 0; i < S; i++)
|
|
127
|
+
a0[i] = std::get<0> (a[i]);
|
|
128
|
+
|
|
129
|
+
if constexpr (std::tuple_size<tuple<Tfirst,Trest...>>::value == 1)
|
|
130
|
+
{
|
|
131
|
+
return tuple(MakeSimd(a0));
|
|
132
|
+
}
|
|
133
|
+
else
|
|
134
|
+
{
|
|
135
|
+
array<tuple<Trest...>,S> arest;
|
|
136
|
+
for (int i = 0; i < S; i++)
|
|
137
|
+
arest[i] = skip_first(a[i]);
|
|
138
|
+
|
|
139
|
+
return tuple_cat ( tuple (MakeSimd(a0)), MakeSimd(arest) );
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
template <typename... Ts>
|
|
144
|
+
static auto skip_first(const std::tuple<Ts...>& t) {
|
|
145
|
+
return std::apply([](auto first, auto... rest) {
|
|
146
|
+
return std::make_tuple(rest...);
|
|
147
|
+
}, t);
|
|
148
|
+
}
|
|
149
|
+
};
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
inline std::tuple<double, double, double> SphericalCoordinates(Vec<3> dist){
|
|
161
|
+
double len, theta, phi;
|
|
162
|
+
len = L2Norm(dist);
|
|
163
|
+
if (len < 1e-30)
|
|
164
|
+
theta = 0;
|
|
165
|
+
else
|
|
166
|
+
theta = acos (dist(2) / len);
|
|
167
|
+
if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
|
|
168
|
+
phi = 0;
|
|
169
|
+
else
|
|
170
|
+
phi = atan2(dist(1), dist(0));
|
|
171
|
+
return {len, theta, phi};
|
|
172
|
+
}
|
|
173
|
+
|
|
23
174
|
|
|
24
175
|
template <typename entry_type = Complex>
|
|
25
176
|
class NGS_DLL_HEADER SphericalHarmonics
|
|
@@ -84,9 +235,69 @@ namespace ngsbem
|
|
|
84
235
|
|
|
85
236
|
void Calc (Vec<3> x, FlatVector<Complex> shapes);
|
|
86
237
|
|
|
87
|
-
|
|
238
|
+
|
|
239
|
+
void FlipZ ();
|
|
88
240
|
void RotateZ (double alpha);
|
|
89
|
-
|
|
241
|
+
|
|
242
|
+
template <typename FUNC>
|
|
243
|
+
void RotateZ (double alpha, FUNC func) const
|
|
244
|
+
{
|
|
245
|
+
if (order < 0) return;
|
|
246
|
+
|
|
247
|
+
Vector<Complex> exp_imalpha(order+1);
|
|
248
|
+
Complex exp_ialpha(cos(alpha), sin(alpha));
|
|
249
|
+
Complex prod = 1.0;
|
|
250
|
+
for (int i = 0; i <= order; i++)
|
|
251
|
+
{
|
|
252
|
+
exp_imalpha(i) = prod;
|
|
253
|
+
prod *= exp_ialpha;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
int ii = 0;
|
|
257
|
+
for (int n = 0; n <= order; n++)
|
|
258
|
+
{
|
|
259
|
+
for (int m = -n; m < 0; m++, ii++)
|
|
260
|
+
func(ii, conj(exp_imalpha(-m)));
|
|
261
|
+
for (int m = 0; m <= n; m++, ii++)
|
|
262
|
+
func(ii, exp_imalpha(m));
|
|
263
|
+
};
|
|
264
|
+
};
|
|
265
|
+
|
|
266
|
+
template <typename FUNC>
|
|
267
|
+
void RotateZFlip (double alpha, bool flip, FUNC func) const
|
|
268
|
+
{
|
|
269
|
+
if (order < 0) return;
|
|
270
|
+
|
|
271
|
+
Vector<Complex> exp_imalpha(order+1);
|
|
272
|
+
Complex exp_ialpha(cos(alpha), sin(alpha));
|
|
273
|
+
Complex prod = 1.0;
|
|
274
|
+
for (int i = 0; i <= order; i++)
|
|
275
|
+
{
|
|
276
|
+
exp_imalpha(i) = prod;
|
|
277
|
+
prod *= exp_ialpha;
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
int ii = 0;
|
|
281
|
+
|
|
282
|
+
auto FlipFactor = [] (int n, int m, bool flip)->double
|
|
283
|
+
{
|
|
284
|
+
if (flip)
|
|
285
|
+
return ((n-m)%2) == 1 ? -1 : 1;
|
|
286
|
+
return 1.0;
|
|
287
|
+
};
|
|
288
|
+
|
|
289
|
+
for (int n = 0; n <= order; n++)
|
|
290
|
+
{
|
|
291
|
+
for (int m = -n; m < 0; m++, ii++)
|
|
292
|
+
func(ii, FlipFactor(n,m,flip)*conj(exp_imalpha(-m)));
|
|
293
|
+
for (int m = 0; m <= n; m++, ii++)
|
|
294
|
+
func(ii, FlipFactor(n,m,flip)*exp_imalpha(m));
|
|
295
|
+
};
|
|
296
|
+
};
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
void RotateY (double alpha, bool parallel = false);
|
|
90
301
|
|
|
91
302
|
|
|
92
303
|
static double CalcAmn (int m, int n)
|
|
@@ -119,11 +330,11 @@ namespace ngsbem
|
|
|
119
330
|
// https://fortran-lang.discourse.group/t/looking-for-spherical-bessel-and-hankel-functions-of-first-and-second-kind-and-arbitrary-order/2308/2
|
|
120
331
|
NGS_DLL_HEADER
|
|
121
332
|
void besseljs3d (int nterms, double z, double scale,
|
|
122
|
-
|
|
333
|
+
SliceVector<double> fjs, SliceVector<double> fjder = FlatVector<double>(0, nullptr));
|
|
123
334
|
|
|
124
335
|
NGS_DLL_HEADER
|
|
125
336
|
void besseljs3d (int nterms, Complex z, double scale,
|
|
126
|
-
|
|
337
|
+
SliceVector<Complex> fjs, SliceVector<Complex> fjder = FlatVector<Complex>(0, nullptr));
|
|
127
338
|
|
|
128
339
|
|
|
129
340
|
/*
|
|
@@ -142,14 +353,17 @@ namespace ngsbem
|
|
|
142
353
|
FlatVector<double> jp,
|
|
143
354
|
FlatVector<double> yp);
|
|
144
355
|
|
|
145
|
-
|
|
356
|
+
|
|
146
357
|
|
|
147
358
|
template <typename T>
|
|
148
359
|
void SphericalBessel (int n, double rho, double scale, T && values)
|
|
149
360
|
{
|
|
361
|
+
besseljs3d (n, rho, scale, values);
|
|
362
|
+
/*
|
|
150
363
|
Vector<double> j(n+1), jp(n+1);
|
|
151
364
|
besseljs3d (n, rho, scale, j, jp);
|
|
152
365
|
values = j;
|
|
366
|
+
*/
|
|
153
367
|
}
|
|
154
368
|
|
|
155
369
|
|
|
@@ -173,21 +387,6 @@ namespace ngsbem
|
|
|
173
387
|
return;
|
|
174
388
|
}
|
|
175
389
|
Vector j(n+1), y(n+1), jp(n+1), yp(n+1);
|
|
176
|
-
// SBESJY (rho, n, j, y, jp, yp);
|
|
177
|
-
|
|
178
|
-
/*
|
|
179
|
-
values = j + Complex(0,1) * y;
|
|
180
|
-
if (scale != 1.0)
|
|
181
|
-
{
|
|
182
|
-
double prod = 1.0;
|
|
183
|
-
for (int i = 0; i <= n; i++)
|
|
184
|
-
{
|
|
185
|
-
values(i) *= prod;
|
|
186
|
-
prod *= scale;
|
|
187
|
-
}
|
|
188
|
-
}
|
|
189
|
-
*/
|
|
190
|
-
|
|
191
390
|
|
|
192
391
|
// the bessel-evaluation with scale
|
|
193
392
|
besseljs3d (n, rho, 1/scale, j, jp);
|
|
@@ -358,18 +557,7 @@ namespace ngsbem
|
|
|
358
557
|
// static Timer t("mptool Transform "+ToString(typeid(RADIAL).name())+ToString(typeid(TARGET).name()));
|
|
359
558
|
// RegionTimer reg(t);
|
|
360
559
|
|
|
361
|
-
|
|
362
|
-
double theta, phi;
|
|
363
|
-
|
|
364
|
-
if (len < 1e-30)
|
|
365
|
-
theta = 0;
|
|
366
|
-
else
|
|
367
|
-
theta = acos (dist(2) / len);
|
|
368
|
-
|
|
369
|
-
if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
|
|
370
|
-
phi = 0;
|
|
371
|
-
else
|
|
372
|
-
phi = atan2(dist(1), dist(0));
|
|
560
|
+
auto [len, theta, phi] = SphericalCoordinates(dist);
|
|
373
561
|
|
|
374
562
|
|
|
375
563
|
// MultiPole<RADIAL,entry_type> tmp{*this};
|
|
@@ -386,14 +574,18 @@ namespace ngsbem
|
|
|
386
574
|
}
|
|
387
575
|
|
|
388
576
|
template <typename TARGET>
|
|
389
|
-
void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist) const
|
|
577
|
+
void TransformAdd (MultiPole<TARGET,entry_type> & target, Vec<3> dist, bool atomic = false) const
|
|
390
578
|
{
|
|
391
579
|
if (SH().Order() < 0) return;
|
|
392
580
|
if (target.SH().Order() < 0) return;
|
|
393
581
|
|
|
394
582
|
MultiPole<TARGET,entry_type> tmp{target};
|
|
395
583
|
Transform(tmp, dist);
|
|
396
|
-
|
|
584
|
+
if (!atomic)
|
|
585
|
+
target.SH().Coefs() += tmp.SH().Coefs();
|
|
586
|
+
else
|
|
587
|
+
for (int j = 0; j < target.SH().Coefs().Size(); j++)
|
|
588
|
+
AtomicAdd(target.SH().Coefs()[j], tmp.SH().Coefs()[j]);
|
|
397
589
|
}
|
|
398
590
|
|
|
399
591
|
template <typename TARGET>
|
|
@@ -412,11 +604,124 @@ namespace ngsbem
|
|
|
412
604
|
static constexpr int maxdirect = 100;
|
|
413
605
|
|
|
414
606
|
|
|
607
|
+
template <typename SCAL, auto S>
|
|
608
|
+
inline auto VecVector2Matrix (FlatVector<Vec<S,SCAL>> vec)
|
|
609
|
+
{
|
|
610
|
+
return FlatMatrixFixWidth<S,SCAL> (vec.Size(), vec.Data()->Data());
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
inline auto VecVector2Matrix (FlatVector<Complex> vec)
|
|
614
|
+
{
|
|
615
|
+
return FlatMatrixFixWidth<1,Complex> (vec.Size(), vec.Data());
|
|
616
|
+
}
|
|
617
|
+
|
|
618
|
+
|
|
415
619
|
template <typename entry_type=Complex>
|
|
416
620
|
class SingularMLMultiPole
|
|
417
621
|
{
|
|
622
|
+
using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
|
|
418
623
|
static Array<size_t> nodes_on_level;
|
|
419
624
|
|
|
625
|
+
struct RecordingSS
|
|
626
|
+
{
|
|
627
|
+
const MultiPole<MPSingular,entry_type> * mp_source;
|
|
628
|
+
MultiPole<MPSingular,entry_type> * mp_target;
|
|
629
|
+
Vec<3> dist;
|
|
630
|
+
double len, theta, phi;
|
|
631
|
+
bool flipz;
|
|
632
|
+
public:
|
|
633
|
+
RecordingSS() = default;
|
|
634
|
+
RecordingSS (const MultiPole<MPSingular,entry_type> * amp_source,
|
|
635
|
+
MultiPole<MPSingular,entry_type> * amp_target,
|
|
636
|
+
Vec<3> adist)
|
|
637
|
+
: mp_source(amp_source), mp_target(amp_target), dist(adist)
|
|
638
|
+
{
|
|
639
|
+
std::tie(len, theta, phi) = SphericalCoordinates(adist);
|
|
640
|
+
// flipz = false;
|
|
641
|
+
flipz = theta > M_PI/2;
|
|
642
|
+
if (flipz) theta = M_PI-theta;
|
|
643
|
+
}
|
|
644
|
+
};
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
static void ProcessBatch(FlatArray<RecordingSS*> batch, double len, double theta) {
|
|
648
|
+
constexpr int vec_length = VecLength<entry_type>;
|
|
649
|
+
int batch_size = batch.Size();
|
|
650
|
+
int N = batch_size * vec_length;
|
|
651
|
+
// *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(entry_type).name() << ", len = " << len << ", theta = " << theta << endl;
|
|
652
|
+
|
|
653
|
+
if (N <= 1 || batch_size <= 1) {
|
|
654
|
+
for (auto* rec : batch) {
|
|
655
|
+
rec->mp_source->TransformAdd(*rec->mp_target, rec->dist, true);
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
else if (N <= 3) {
|
|
659
|
+
ProcessVectorizedBatch<3, vec_length>(batch, len, theta);
|
|
660
|
+
}
|
|
661
|
+
else if (N <= 4) {
|
|
662
|
+
ProcessVectorizedBatch<4, vec_length>(batch, len, theta);
|
|
663
|
+
}
|
|
664
|
+
else if (N <= 6) {
|
|
665
|
+
ProcessVectorizedBatch<6, vec_length>(batch, len, theta);
|
|
666
|
+
}
|
|
667
|
+
else if (N <= 12) {
|
|
668
|
+
ProcessVectorizedBatch<12, vec_length>(batch, len, theta);
|
|
669
|
+
}
|
|
670
|
+
else if (N <= 24) {
|
|
671
|
+
ProcessVectorizedBatch<24, vec_length>(batch, len, theta);
|
|
672
|
+
}
|
|
673
|
+
else if (N <= 48) {
|
|
674
|
+
ProcessVectorizedBatch<48, vec_length>(batch, len, theta);
|
|
675
|
+
}
|
|
676
|
+
else if (N <= 96) {
|
|
677
|
+
ProcessVectorizedBatch<96, vec_length>(batch, len, theta);
|
|
678
|
+
}
|
|
679
|
+
else if (N <= 192) {
|
|
680
|
+
ProcessVectorizedBatch<192, vec_length>(batch, len, theta);
|
|
681
|
+
}
|
|
682
|
+
else {
|
|
683
|
+
// Split large batches
|
|
684
|
+
ProcessBatch(batch.Range(0, 192 / vec_length), len, theta);
|
|
685
|
+
ProcessBatch(batch.Range(192 / vec_length, batch_size), len, theta);
|
|
686
|
+
}
|
|
687
|
+
}
|
|
688
|
+
|
|
689
|
+
template<int N, int vec_length>
|
|
690
|
+
static void ProcessVectorizedBatch(FlatArray<RecordingSS*> batch, double len, double theta) {
|
|
691
|
+
|
|
692
|
+
// *testout << "Processing vectorized S->S batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
|
|
693
|
+
MultiPole<MPSingular, Vec<N,Complex>> vec_source(batch[0]->mp_source->Order(), batch[0]->mp_source->Kappa(), batch[0]->mp_source->RTyp());
|
|
694
|
+
MultiPole<MPSingular, Vec<N,Complex>> vec_target(batch[0]->mp_target->Order(), batch[0]->mp_target->Kappa(), batch[0]->mp_target->RTyp());
|
|
695
|
+
|
|
696
|
+
// Copy multipoles into vectorized multipole
|
|
697
|
+
for (int i = 0; i < batch.Size(); i++)
|
|
698
|
+
{
|
|
699
|
+
auto source_i = VecVector2Matrix (batch[i]->mp_source->SH().Coefs());
|
|
700
|
+
auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
|
|
701
|
+
batch[i]->mp_source->SH().RotateZFlip(batch[i]->phi, batch[i]->flipz,
|
|
702
|
+
[source_i, source_mati] (size_t ii, Complex factor)
|
|
703
|
+
{
|
|
704
|
+
source_mati.Row(ii) = factor * source_i.Row(ii);
|
|
705
|
+
});
|
|
706
|
+
}
|
|
707
|
+
|
|
708
|
+
vec_source.SH().RotateY(theta, vec_source.SH().Order() >= 100);
|
|
709
|
+
vec_source.ShiftZ(-len, vec_target);
|
|
710
|
+
vec_target.SH().RotateY(-theta, vec_target.SH().Order() >= 100);
|
|
711
|
+
|
|
712
|
+
// Copy vectorized multipole into individual multipoles
|
|
713
|
+
for (int i = 0; i < batch.Size(); i++)
|
|
714
|
+
{
|
|
715
|
+
auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
|
|
716
|
+
auto target_mati = VecVector2Matrix (batch[i]->mp_target->SH().Coefs());
|
|
717
|
+
batch[i]->mp_target->SH().RotateZFlip(-batch[i]->phi, batch[i]->flipz,
|
|
718
|
+
[source_mati, target_mati] (size_t ii, Complex factor)
|
|
719
|
+
{
|
|
720
|
+
AtomicAdd (target_mati.Row(ii), factor * source_mati.Row(ii));
|
|
721
|
+
});
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
|
|
420
725
|
struct Node
|
|
421
726
|
{
|
|
422
727
|
Vec<3> center;
|
|
@@ -428,7 +733,14 @@ namespace ngsbem
|
|
|
428
733
|
Array<tuple<Vec<3>, entry_type>> charges;
|
|
429
734
|
Array<tuple<Vec<3>, Vec<3>, entry_type>> dipoles;
|
|
430
735
|
Array<tuple<Vec<3>, Vec<3>, Complex,int>> currents;
|
|
736
|
+
|
|
737
|
+
using simd_entry_type = decltype(MakeSimd(declval<std::array<entry_type,FMM_SW>>()));
|
|
738
|
+
Array<tuple<Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_charges;
|
|
739
|
+
Array<tuple<Vec<3,SIMD<double,FMM_SW>>, Vec<3,SIMD<double,FMM_SW>>, simd_entry_type>> simd_dipoles;
|
|
740
|
+
|
|
431
741
|
int total_sources;
|
|
742
|
+
std::mutex node_mutex;
|
|
743
|
+
atomic<bool> have_childs{false};
|
|
432
744
|
|
|
433
745
|
Node (Vec<3> acenter, double ar, int alevel, double akappa)
|
|
434
746
|
: center(acenter), r(ar), level(alevel), mp(MPOrder(ar*akappa), akappa, ar) // min(1.0, ar*akappa))
|
|
@@ -449,12 +761,26 @@ namespace ngsbem
|
|
|
449
761
|
cc(2) += (i&4) ? r/2 : -r/2;
|
|
450
762
|
childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
|
|
451
763
|
}
|
|
764
|
+
have_childs = true;
|
|
452
765
|
}
|
|
453
766
|
|
|
454
767
|
|
|
455
768
|
void AddCharge (Vec<3> x, entry_type c)
|
|
456
769
|
{
|
|
457
|
-
if (
|
|
770
|
+
if (have_childs) // quick check without locking
|
|
771
|
+
{
|
|
772
|
+
// directly send to childs:
|
|
773
|
+
int childnum = 0;
|
|
774
|
+
if (x(0) > center(0)) childnum += 1;
|
|
775
|
+
if (x(1) > center(1)) childnum += 2;
|
|
776
|
+
if (x(2) > center(2)) childnum += 4;
|
|
777
|
+
childs[childnum] -> AddCharge(x, c);
|
|
778
|
+
return;
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
lock_guard<mutex> guard(node_mutex);
|
|
782
|
+
|
|
783
|
+
if (have_childs) // test again after locking
|
|
458
784
|
{
|
|
459
785
|
// directly send to childs:
|
|
460
786
|
int childnum = 0;
|
|
@@ -465,6 +791,8 @@ namespace ngsbem
|
|
|
465
791
|
return;
|
|
466
792
|
}
|
|
467
793
|
|
|
794
|
+
|
|
795
|
+
|
|
468
796
|
charges.Append( tuple{x,c} );
|
|
469
797
|
|
|
470
798
|
// if (r*mp.Kappa() < 1e-8) return;
|
|
@@ -489,7 +817,7 @@ namespace ngsbem
|
|
|
489
817
|
|
|
490
818
|
void AddDipole (Vec<3> x, Vec<3> d, entry_type c)
|
|
491
819
|
{
|
|
492
|
-
if (
|
|
820
|
+
if (have_childs)
|
|
493
821
|
{
|
|
494
822
|
// directly send to childs:
|
|
495
823
|
|
|
@@ -501,6 +829,23 @@ namespace ngsbem
|
|
|
501
829
|
return;
|
|
502
830
|
}
|
|
503
831
|
|
|
832
|
+
lock_guard<mutex> guard(node_mutex);
|
|
833
|
+
|
|
834
|
+
if (have_childs)
|
|
835
|
+
{
|
|
836
|
+
// directly send to childs:
|
|
837
|
+
|
|
838
|
+
int childnum = 0;
|
|
839
|
+
if (x(0) > center(0)) childnum += 1;
|
|
840
|
+
if (x(1) > center(1)) childnum += 2;
|
|
841
|
+
if (x(2) > center(2)) childnum += 4;
|
|
842
|
+
childs[childnum] -> AddDipole(x, d, c);
|
|
843
|
+
return;
|
|
844
|
+
}
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
|
|
848
|
+
|
|
504
849
|
dipoles.Append (tuple{x,d,c});
|
|
505
850
|
|
|
506
851
|
if (dipoles.Size() < maxdirect || r < 1e-8)
|
|
@@ -520,6 +865,7 @@ namespace ngsbem
|
|
|
520
865
|
currents.SetSize0();
|
|
521
866
|
}
|
|
522
867
|
|
|
868
|
+
// not parallel yet
|
|
523
869
|
void AddCurrent (Vec<3> sp, Vec<3> ep, Complex j, int num)
|
|
524
870
|
{
|
|
525
871
|
if (childs[0])
|
|
@@ -549,7 +895,7 @@ namespace ngsbem
|
|
|
549
895
|
}
|
|
550
896
|
return;
|
|
551
897
|
}
|
|
552
|
-
|
|
898
|
+
|
|
553
899
|
currents.Append (tuple{sp,ep,j,num});
|
|
554
900
|
|
|
555
901
|
// if (currents.Size() < maxdirect || r < 1e-8)
|
|
@@ -583,26 +929,74 @@ namespace ngsbem
|
|
|
583
929
|
return sum;
|
|
584
930
|
}
|
|
585
931
|
|
|
586
|
-
|
|
587
|
-
|
|
932
|
+
{
|
|
933
|
+
// static Timer t("fmm direct eval"); RegionTimer reg(t);
|
|
934
|
+
// t.AddFlops (charges.Size());
|
|
935
|
+
if (simd_charges.Size())
|
|
588
936
|
{
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
937
|
+
simd_entry_type vsum{0.0};
|
|
938
|
+
if (mp.Kappa() < 1e-8)
|
|
939
|
+
for (auto [x,c] : simd_charges)
|
|
940
|
+
{
|
|
941
|
+
auto rho = L2Norm(p-x);
|
|
942
|
+
auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW> (1,rho*mp.Kappa()) / rho;
|
|
943
|
+
kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
|
|
944
|
+
vsum += kernel * c;
|
|
945
|
+
}
|
|
946
|
+
else
|
|
947
|
+
for (auto [x,c] : simd_charges)
|
|
948
|
+
{
|
|
949
|
+
auto rho = L2Norm(p-x);
|
|
950
|
+
auto [si,co] = sincos(rho*mp.Kappa());
|
|
951
|
+
auto kernel = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) / rho;
|
|
952
|
+
kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
|
|
953
|
+
vsum += kernel * c;
|
|
954
|
+
}
|
|
955
|
+
|
|
956
|
+
sum += HSum(vsum);
|
|
592
957
|
}
|
|
593
958
|
else
|
|
594
|
-
|
|
595
|
-
if (
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
959
|
+
{
|
|
960
|
+
if (mp.Kappa() < 1e-8)
|
|
961
|
+
{
|
|
962
|
+
for (auto [x,c] : charges)
|
|
963
|
+
if (double rho = L2Norm(p-x); rho > 0)
|
|
964
|
+
sum += (1/(4*M_PI))*Complex(1,rho*mp.Kappa()) / rho * c;
|
|
965
|
+
}
|
|
966
|
+
else
|
|
967
|
+
for (auto [x,c] : charges)
|
|
968
|
+
if (double rho = L2Norm(p-x); rho > 0)
|
|
969
|
+
sum += (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) / rho * c;
|
|
970
|
+
}
|
|
971
|
+
}
|
|
972
|
+
|
|
973
|
+
if (simd_dipoles.Size())
|
|
974
|
+
{
|
|
975
|
+
simd_entry_type vsum{0.0};
|
|
976
|
+
for (auto [x,d,c] : simd_dipoles)
|
|
977
|
+
{
|
|
978
|
+
auto rho = L2Norm(p-x);
|
|
979
|
+
auto drhodp = (1.0/rho) * (p-x);
|
|
980
|
+
auto [si,co] = sincos(rho*mp.Kappa());
|
|
981
|
+
auto dGdrho = (1/(4*M_PI))*SIMD<Complex,FMM_SW>(co,si) *
|
|
982
|
+
(-1.0/(rho*rho) + SIMD<Complex,FMM_SW>(0, mp.Kappa())/rho);
|
|
983
|
+
auto kernel = dGdrho * InnerProduct(drhodp, d);
|
|
984
|
+
kernel = If(rho > 0.0, kernel, SIMD<Complex,FMM_SW>(0.0));
|
|
985
|
+
vsum += kernel * c;
|
|
986
|
+
}
|
|
987
|
+
sum += HSum(vsum);
|
|
988
|
+
}
|
|
989
|
+
else
|
|
990
|
+
{
|
|
991
|
+
for (auto [x,d,c] : dipoles)
|
|
599
992
|
if (double rho = L2Norm(p-x); rho > 0)
|
|
600
|
-
|
|
993
|
+
{
|
|
601
994
|
Vec<3> drhodp = 1.0/rho * (p-x);
|
|
602
995
|
Complex dGdrho = (1/(4*M_PI))*exp(Complex(0,rho*mp.Kappa())) *
|
|
603
|
-
|
|
996
|
+
(Complex(0, mp.Kappa())/rho - 1.0/sqr(rho));
|
|
604
997
|
sum += dGdrho * InnerProduct(drhodp, d) * c;
|
|
605
|
-
|
|
998
|
+
}
|
|
999
|
+
}
|
|
606
1000
|
|
|
607
1001
|
for (auto [sp,ep,j,num] : currents)
|
|
608
1002
|
{
|
|
@@ -664,23 +1058,27 @@ namespace ngsbem
|
|
|
664
1058
|
}
|
|
665
1059
|
}
|
|
666
1060
|
|
|
667
|
-
void CalcMP()
|
|
1061
|
+
void CalcMP(Array<RecordingSS> * recording, Array<Node*> * nodes_to_process)
|
|
668
1062
|
{
|
|
669
|
-
mp.SH().Coefs() = 0.0;
|
|
1063
|
+
// mp.SH().Coefs() = 0.0;
|
|
670
1064
|
if (childs[0])
|
|
671
1065
|
{
|
|
672
|
-
if (total_sources < 1000)
|
|
1066
|
+
if (total_sources < 1000 || recording)
|
|
673
1067
|
for (auto & child : childs)
|
|
674
|
-
child->CalcMP();
|
|
1068
|
+
child->CalcMP(recording, nodes_to_process);
|
|
675
1069
|
else
|
|
676
1070
|
ParallelFor (8, [&] (int nr)
|
|
677
1071
|
{
|
|
678
|
-
childs[nr] -> CalcMP();
|
|
1072
|
+
childs[nr] -> CalcMP(recording, nodes_to_process);
|
|
679
1073
|
});
|
|
680
1074
|
|
|
681
1075
|
|
|
682
|
-
for (auto & child : childs)
|
|
683
|
-
child->mp.
|
|
1076
|
+
for (auto & child : childs){
|
|
1077
|
+
if (recording && child->mp.SH().Coefs().Size() > 0)
|
|
1078
|
+
*recording += RecordingSS(&child->mp, &mp, center-child->center);
|
|
1079
|
+
else
|
|
1080
|
+
child->mp.TransformAdd(mp, center-child->center);
|
|
1081
|
+
}
|
|
684
1082
|
}
|
|
685
1083
|
else
|
|
686
1084
|
{
|
|
@@ -690,14 +1088,54 @@ namespace ngsbem
|
|
|
690
1088
|
return;
|
|
691
1089
|
}
|
|
692
1090
|
|
|
693
|
-
|
|
694
|
-
|
|
1091
|
+
// make simd charges, comment this block for testing ...
|
|
1092
|
+
simd_charges.SetSize( (charges.Size()+FMM_SW-1)/FMM_SW);
|
|
1093
|
+
size_t i = 0, ii = 0;
|
|
1094
|
+
for ( ; i+FMM_SW <= charges.Size(); i+=FMM_SW, ii++)
|
|
1095
|
+
{
|
|
1096
|
+
std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
|
|
1097
|
+
for (int j = 0; j < FMM_SW; j++) ca[j] = charges[i+j];
|
|
1098
|
+
simd_charges[ii] = MakeSimd(ca);
|
|
1099
|
+
}
|
|
1100
|
+
if (i < charges.Size())
|
|
1101
|
+
{
|
|
1102
|
+
std::array<tuple<Vec<3>,entry_type>, FMM_SW> ca;
|
|
1103
|
+
int j = 0;
|
|
1104
|
+
for ( ; i+j < charges.Size(); j++) ca[j] = charges[i+j];
|
|
1105
|
+
for ( ; j < FMM_SW; j++) ca[j] = tuple( get<0>(ca[0]), entry_type{0.0} );
|
|
1106
|
+
simd_charges[ii] = MakeSimd(ca);
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
simd_dipoles.SetSize( (dipoles.Size()+FMM_SW-1)/FMM_SW);
|
|
1110
|
+
i = 0, ii = 0;
|
|
1111
|
+
for ( ; i+FMM_SW <= dipoles.Size(); i+=FMM_SW, ii++)
|
|
1112
|
+
{
|
|
1113
|
+
std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
|
|
1114
|
+
for (int j = 0; j < FMM_SW; j++) di[j] = dipoles[i+j];
|
|
1115
|
+
simd_dipoles[ii] = MakeSimd(di);
|
|
1116
|
+
}
|
|
1117
|
+
if (i < dipoles.Size())
|
|
1118
|
+
{
|
|
1119
|
+
std::array<tuple<Vec<3>,Vec<3>,entry_type>, FMM_SW> di;
|
|
1120
|
+
int j = 0;
|
|
1121
|
+
for ( ; i+j < dipoles.Size(); j++) di[j] = dipoles[i+j];
|
|
1122
|
+
for ( ; j < FMM_SW; j++) di[j] = tuple( get<0>(di[0]), get<1>(di[0]), entry_type{0.0} );
|
|
1123
|
+
simd_dipoles[ii] = MakeSimd(di);
|
|
1124
|
+
}
|
|
1125
|
+
|
|
695
1126
|
|
|
696
|
-
|
|
697
|
-
|
|
1127
|
+
if (nodes_to_process)
|
|
1128
|
+
*nodes_to_process += this;
|
|
1129
|
+
else {
|
|
1130
|
+
for (auto [x,c] : charges)
|
|
1131
|
+
mp.AddCharge (x-center,c);
|
|
1132
|
+
|
|
1133
|
+
for (auto [x,d,c] : dipoles)
|
|
1134
|
+
mp.AddDipole (x-center, d, c);
|
|
698
1135
|
|
|
699
|
-
|
|
700
|
-
|
|
1136
|
+
for (auto [sp,ep,j,num] : currents)
|
|
1137
|
+
mp.AddCurrent (sp-center, ep-center, j, num);
|
|
1138
|
+
}
|
|
701
1139
|
}
|
|
702
1140
|
}
|
|
703
1141
|
|
|
@@ -836,6 +1274,10 @@ namespace ngsbem
|
|
|
836
1274
|
void CalcMP()
|
|
837
1275
|
{
|
|
838
1276
|
static Timer t("mptool compute singular MLMP"); RegionTimer rg(t);
|
|
1277
|
+
static Timer ts2mp("mptool compute singular MLMP - source2mp");
|
|
1278
|
+
static Timer tS2S("mptool compute singular MLMP - S->S");
|
|
1279
|
+
static Timer trec("mptool comput singular recording");
|
|
1280
|
+
static Timer tsort("mptool comput singular sort");
|
|
839
1281
|
|
|
840
1282
|
/*
|
|
841
1283
|
int maxlevel = 0;
|
|
@@ -847,7 +1289,87 @@ namespace ngsbem
|
|
|
847
1289
|
*/
|
|
848
1290
|
|
|
849
1291
|
root.CalcTotalSources();
|
|
850
|
-
|
|
1292
|
+
|
|
1293
|
+
if (false)
|
|
1294
|
+
// direct evaluation of S->S
|
|
1295
|
+
root.CalcMP(nullptr, nullptr);
|
|
1296
|
+
else
|
|
1297
|
+
{
|
|
1298
|
+
|
|
1299
|
+
Array<RecordingSS> recording;
|
|
1300
|
+
Array<Node*> nodes_to_process;
|
|
1301
|
+
|
|
1302
|
+
{
|
|
1303
|
+
RegionTimer reg(trec);
|
|
1304
|
+
root.CalcMP(&recording, &nodes_to_process);
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1307
|
+
{
|
|
1308
|
+
RegionTimer rs2mp(ts2mp);
|
|
1309
|
+
ParallelFor(nodes_to_process.Size(), [&](int i){
|
|
1310
|
+
auto node = nodes_to_process[i];
|
|
1311
|
+
for (auto [x,c]: node->charges)
|
|
1312
|
+
node->mp.AddCharge(x-node->center, c);
|
|
1313
|
+
for (auto [x,d,c]: node->dipoles)
|
|
1314
|
+
node->mp.AddDipole(x-node->center, d, c);
|
|
1315
|
+
for (auto [sp,ep,j,num]: node->currents)
|
|
1316
|
+
node->mp.AddCurrent(sp-node->center, ep-node->center, j, num);
|
|
1317
|
+
}, TasksPerThread(4));
|
|
1318
|
+
}
|
|
1319
|
+
|
|
1320
|
+
{
|
|
1321
|
+
RegionTimer reg(tsort);
|
|
1322
|
+
QuickSort (recording, [] (auto & a, auto & b)
|
|
1323
|
+
{
|
|
1324
|
+
if (a.len < (1-1e-8) * b.len) return true;
|
|
1325
|
+
if (a.len > (1+1e-8) * b.len) return false;
|
|
1326
|
+
return a.theta < b.theta;
|
|
1327
|
+
});
|
|
1328
|
+
}
|
|
1329
|
+
|
|
1330
|
+
double current_len = -1e100;
|
|
1331
|
+
double current_theta = -1e100;
|
|
1332
|
+
Array<RecordingSS*> current_batch;
|
|
1333
|
+
Array<Array<RecordingSS*>> batch_group;
|
|
1334
|
+
Array<double> group_lengths;
|
|
1335
|
+
Array<double> group_thetas;
|
|
1336
|
+
for (auto & record : recording)
|
|
1337
|
+
{
|
|
1338
|
+
bool len_changed = fabs(record.len - current_len) > 1e-8;
|
|
1339
|
+
bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
|
|
1340
|
+
if ((len_changed || theta_changed) && current_batch.Size() > 0) {
|
|
1341
|
+
batch_group.Append(current_batch);
|
|
1342
|
+
group_lengths.Append(current_len);
|
|
1343
|
+
group_thetas.Append(current_theta);
|
|
1344
|
+
current_batch.SetSize(0);
|
|
1345
|
+
}
|
|
1346
|
+
|
|
1347
|
+
current_len = record.len;
|
|
1348
|
+
current_theta = record.theta;
|
|
1349
|
+
current_batch.Append(&record);
|
|
1350
|
+
}
|
|
1351
|
+
if (current_batch.Size() > 0) {
|
|
1352
|
+
batch_group.Append(current_batch);
|
|
1353
|
+
group_lengths.Append(current_len);
|
|
1354
|
+
group_thetas.Append(current_theta);
|
|
1355
|
+
}
|
|
1356
|
+
|
|
1357
|
+
{
|
|
1358
|
+
RegionTimer rS2S(tS2S);
|
|
1359
|
+
// ParallelFor(batch_group.Size(), [&](int i) {
|
|
1360
|
+
for (int i = 0; i < batch_group.Size(); i++){
|
|
1361
|
+
// *testout << "Processing batch " << i << " of size " << batch_group[i].Size() << ", with len = " << group_lengths[i] << ", theta = " << group_thetas[i] << endl;
|
|
1362
|
+
int chunk_size = 24;
|
|
1363
|
+
if (batch_group[i].Size() < chunk_size)
|
|
1364
|
+
ProcessBatch(batch_group[i], group_lengths[i], group_thetas[i]);
|
|
1365
|
+
else
|
|
1366
|
+
ParallelForRange(IntRange(batch_group[i].Size()), [&](IntRange range) {
|
|
1367
|
+
auto sub_batch = batch_group[i].Range(range.First(), range.Next());
|
|
1368
|
+
ProcessBatch(sub_batch, group_lengths[i], group_thetas[i]);
|
|
1369
|
+
}, TasksPerThread(4));
|
|
1370
|
+
}
|
|
1371
|
+
}
|
|
1372
|
+
}
|
|
851
1373
|
|
|
852
1374
|
havemp = true;
|
|
853
1375
|
}
|
|
@@ -892,19 +1414,128 @@ namespace ngsbem
|
|
|
892
1414
|
Vec<3> adist)
|
|
893
1415
|
: mpS(ampS), mpR(ampR), dist(adist)
|
|
894
1416
|
{
|
|
895
|
-
len =
|
|
896
|
-
if (len < 1e-30)
|
|
897
|
-
theta = 0;
|
|
898
|
-
else
|
|
899
|
-
theta = acos (dist(2) / len);
|
|
900
|
-
|
|
901
|
-
if (sqr(dist(0))+sqr(dist(1)) < 1e-30)
|
|
902
|
-
phi = 0;
|
|
903
|
-
else
|
|
904
|
-
phi = atan2(dist(1), dist(0));
|
|
1417
|
+
std::tie(len, theta, phi) = SphericalCoordinates(dist);
|
|
905
1418
|
}
|
|
906
1419
|
};
|
|
907
1420
|
|
|
1421
|
+
static void ProcessBatchRS(FlatArray<RecordingRS*> batch, double len, double theta) {
|
|
1422
|
+
// static Timer t("ProcessBatchRS"); RegionTimer reg(t, batch.Size());
|
|
1423
|
+
constexpr int vec_length = VecLength<elem_type>;
|
|
1424
|
+
int batch_size = batch.Size();
|
|
1425
|
+
int N = batch_size * vec_length;
|
|
1426
|
+
// *testout << "Processing batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", Type: " << typeid(elem_type).name() << ", len = " << len << ", theta = " << theta << endl;
|
|
1427
|
+
|
|
1428
|
+
if (N <= 1 || batch_size <= 1) {
|
|
1429
|
+
for (auto* rec : batch) {
|
|
1430
|
+
rec->mpS->TransformAdd(*rec->mpR, rec->dist);
|
|
1431
|
+
}
|
|
1432
|
+
}
|
|
1433
|
+
else if (N <= 3) {
|
|
1434
|
+
ProcessVectorizedBatch<3, vec_length>(batch, len, theta);
|
|
1435
|
+
}
|
|
1436
|
+
else if (N <= 4) {
|
|
1437
|
+
ProcessVectorizedBatch<4, vec_length>(batch, len, theta);
|
|
1438
|
+
}
|
|
1439
|
+
else if (N <= 6) {
|
|
1440
|
+
ProcessVectorizedBatch<6, vec_length>(batch, len, theta);
|
|
1441
|
+
}
|
|
1442
|
+
else if (N <= 12) {
|
|
1443
|
+
ProcessVectorizedBatch<12, vec_length>(batch, len, theta);
|
|
1444
|
+
}
|
|
1445
|
+
else if (N <= 24) {
|
|
1446
|
+
ProcessVectorizedBatch<24, vec_length>(batch, len, theta);
|
|
1447
|
+
}
|
|
1448
|
+
else if (N <= 48) {
|
|
1449
|
+
ProcessVectorizedBatch<48, vec_length>(batch, len, theta);
|
|
1450
|
+
}
|
|
1451
|
+
else if (N <= 96) {
|
|
1452
|
+
ProcessVectorizedBatch<96, vec_length>(batch, len, theta);
|
|
1453
|
+
}
|
|
1454
|
+
else if (N <= 192) {
|
|
1455
|
+
ProcessVectorizedBatch<192, vec_length>(batch, len, theta);
|
|
1456
|
+
}
|
|
1457
|
+
else {
|
|
1458
|
+
// Split large batches
|
|
1459
|
+
/*
|
|
1460
|
+
ProcessBatch(batch.Range(0, 192 / vec_length), len, theta);
|
|
1461
|
+
ProcessBatch(batch.Range(192 / vec_length, batch_size), len, theta);
|
|
1462
|
+
*/
|
|
1463
|
+
|
|
1464
|
+
/*
|
|
1465
|
+
ParallelFor (2, [&] (int i)
|
|
1466
|
+
{
|
|
1467
|
+
if (i == 0)
|
|
1468
|
+
ProcessBatchRS(batch.Range(0, 192 / vec_length), len, theta);
|
|
1469
|
+
else
|
|
1470
|
+
ProcessBatchRS(batch.Range(192 / vec_length, batch_size), len, theta);
|
|
1471
|
+
}, 2);
|
|
1472
|
+
*/
|
|
1473
|
+
|
|
1474
|
+
|
|
1475
|
+
size_t chunksize = 192/vec_length;
|
|
1476
|
+
size_t num = (batch.Size()+chunksize-1) / chunksize;
|
|
1477
|
+
ParallelFor (num, [&](int i)
|
|
1478
|
+
{
|
|
1479
|
+
ProcessBatchRS(batch.Range(i*chunksize, min((i+1)*chunksize, batch.Size())), len, theta);
|
|
1480
|
+
}, num);
|
|
1481
|
+
|
|
1482
|
+
}
|
|
1483
|
+
}
|
|
1484
|
+
|
|
1485
|
+
|
|
1486
|
+
template<int N, int vec_length>
|
|
1487
|
+
static void ProcessVectorizedBatch(FlatArray<RecordingRS*> batch, double len, double theta) {
|
|
1488
|
+
|
|
1489
|
+
// static Timer t("ProcessVectorizedBatch, N = "+ToString(N) + ", vec_len = " + ToString(vec_length));
|
|
1490
|
+
// RegionTimer reg(t, batch[0]->mpS->SH().Order());
|
|
1491
|
+
// static Timer ttobatch("mptools - copy to batch 2");
|
|
1492
|
+
// static Timer tfrombatch("mptools - copy from batch 2");
|
|
1493
|
+
|
|
1494
|
+
// *testout << "Processing vectorized batch of size " << batch.Size() << ", with N = " << N << ", vec_length = " << vec_length << ", len = " << len << ", theta = " << theta << endl;
|
|
1495
|
+
MultiPole<MPSingular, Vec<N,Complex>> vec_source(batch[0]->mpS->Order(), batch[0]->mpS->Kappa(), batch[0]->mpS->RTyp());
|
|
1496
|
+
// MultiPole<MPSingular, elem_type> tmp_source{*batch[0]->mpS};
|
|
1497
|
+
MultiPole<MPRegular, elem_type> tmp_target{*batch[0]->mpR};
|
|
1498
|
+
MultiPole<MPRegular, Vec<N,Complex>> vec_target(batch[0]->mpR->Order(), batch[0]->mpR->Kappa(), batch[0]->mpR->RTyp());
|
|
1499
|
+
|
|
1500
|
+
// Copy multipoles into vectorized multipole
|
|
1501
|
+
// ttobatch.Start();
|
|
1502
|
+
for (int i = 0; i < batch.Size(); i++)
|
|
1503
|
+
{
|
|
1504
|
+
auto source_i = VecVector2Matrix (batch[i]->mpS->SH().Coefs());
|
|
1505
|
+
auto source_mati = VecVector2Matrix (vec_source.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
|
|
1506
|
+
batch[i]->mpS->SH().RotateZ(batch[i]->phi,
|
|
1507
|
+
[source_i, source_mati] (size_t ii, Complex factor)
|
|
1508
|
+
{
|
|
1509
|
+
source_mati.Row(ii) = factor * source_i.Row(ii);
|
|
1510
|
+
});
|
|
1511
|
+
}
|
|
1512
|
+
|
|
1513
|
+
// ttobatch.Stop();
|
|
1514
|
+
|
|
1515
|
+
vec_source.SH().RotateY(theta);
|
|
1516
|
+
vec_source.ShiftZ(-len, vec_target);
|
|
1517
|
+
vec_target.SH().RotateY(-theta);
|
|
1518
|
+
|
|
1519
|
+
// Copy vectorized multipole into individual multipoles
|
|
1520
|
+
// tfrombatch.Start();
|
|
1521
|
+
for (int i = 0; i < batch.Size(); i++) {
|
|
1522
|
+
// auto source_i = VecVector2Matrix (tmp_target.SH().Coefs());
|
|
1523
|
+
auto source_mati = VecVector2Matrix (vec_target.SH().Coefs()).Cols(i*vec_length, (i+1)*vec_length);
|
|
1524
|
+
auto targeti = VecVector2Matrix(batch[i]->mpR->SH().Coefs());
|
|
1525
|
+
|
|
1526
|
+
tmp_target.SH().RotateZ(-batch[i]->phi,
|
|
1527
|
+
[source_mati, targeti] (size_t ii, Complex factor)
|
|
1528
|
+
{
|
|
1529
|
+
// source_i.Row(ii) = factor * source_mati.Row(ii);
|
|
1530
|
+
AtomicAdd (VectorView(targeti.Row(ii)), factor * source_mati.Row(ii));
|
|
1531
|
+
});
|
|
1532
|
+
// for (int j = 0; j < tmp_target.SH().Coefs().Size(); j++)
|
|
1533
|
+
// AtomicAdd(batch[i]->mpR->SH().Coefs()[j], tmp_target.SH().Coefs()[j]);
|
|
1534
|
+
}
|
|
1535
|
+
// tfrombatch.Stop();
|
|
1536
|
+
|
|
1537
|
+
}
|
|
1538
|
+
|
|
908
1539
|
|
|
909
1540
|
struct Node
|
|
910
1541
|
{
|
|
@@ -915,6 +1546,8 @@ namespace ngsbem
|
|
|
915
1546
|
MultiPole<MPRegular,elem_type> mp;
|
|
916
1547
|
Array<Vec<3>> targets;
|
|
917
1548
|
int total_targets;
|
|
1549
|
+
std::mutex node_mutex;
|
|
1550
|
+
atomic<bool> have_childs{false};
|
|
918
1551
|
|
|
919
1552
|
Array<const typename SingularMLMultiPole<elem_type>::Node*> singnodes;
|
|
920
1553
|
|
|
@@ -939,6 +1572,7 @@ namespace ngsbem
|
|
|
939
1572
|
cc(2) += (i&4) ? r/2 : -r/2;
|
|
940
1573
|
childs[i] = make_unique<Node> (cc, r/2, level+1, mp.Kappa());
|
|
941
1574
|
}
|
|
1575
|
+
have_childs = true;
|
|
942
1576
|
}
|
|
943
1577
|
|
|
944
1578
|
void AddSingularNode (const typename SingularMLMultiPole<elem_type>::Node & singnode, bool allow_refine,
|
|
@@ -946,7 +1580,7 @@ namespace ngsbem
|
|
|
946
1580
|
{
|
|
947
1581
|
if (mp.SH().Order() < 0) return;
|
|
948
1582
|
if (singnode.mp.SH().Order() < 0) return;
|
|
949
|
-
if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
|
|
1583
|
+
// if (L2Norm(singnode.mp.SH().Coefs()) == 0) return;
|
|
950
1584
|
if (level > 20)
|
|
951
1585
|
{
|
|
952
1586
|
singnodes.Append(&singnode);
|
|
@@ -1028,12 +1662,22 @@ namespace ngsbem
|
|
|
1028
1662
|
|
|
1029
1663
|
if (childs[0])
|
|
1030
1664
|
{
|
|
1031
|
-
|
|
1665
|
+
if (total_targets < 1000)
|
|
1032
1666
|
{
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1667
|
+
for (int nr = 0; nr < 8; nr++)
|
|
1668
|
+
{
|
|
1669
|
+
if (L2Norm(mp.SH().Coefs()) > 0)
|
|
1670
|
+
mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
|
|
1671
|
+
childs[nr]->LocalizeExpansion(allow_refine);
|
|
1672
|
+
}
|
|
1036
1673
|
}
|
|
1674
|
+
else
|
|
1675
|
+
ParallelFor(8, [&] (int nr)
|
|
1676
|
+
{
|
|
1677
|
+
if (L2Norm(mp.SH().Coefs()) > 0)
|
|
1678
|
+
mp.TransformAdd (childs[nr]->mp, childs[nr]->center-center);
|
|
1679
|
+
childs[nr]->LocalizeExpansion(allow_refine);
|
|
1680
|
+
});
|
|
1037
1681
|
mp = MultiPole<MPRegular,elem_type>(-1, mp.Kappa(), 1.);
|
|
1038
1682
|
//mp.SH().Coefs()=0.0;
|
|
1039
1683
|
}
|
|
@@ -1041,18 +1685,8 @@ namespace ngsbem
|
|
|
1041
1685
|
|
|
1042
1686
|
elem_type Evaluate (Vec<3> p) const
|
|
1043
1687
|
{
|
|
1044
|
-
// *testout << "eval p = " << p << ", level = " << level << ", center = " << center << ", r = " << r << endl;
|
|
1045
1688
|
elem_type sum{0.0};
|
|
1046
|
-
|
|
1047
|
-
if (childs[0])
|
|
1048
|
-
{
|
|
1049
|
-
int childnum = 0;
|
|
1050
|
-
if (p(0) > center(0)) childnum += 1;
|
|
1051
|
-
if (p(1) > center(1)) childnum += 2;
|
|
1052
|
-
if (p(2) > center(2)) childnum += 4;
|
|
1053
|
-
sum = childs[childnum]->Evaluate(p);
|
|
1054
|
-
}
|
|
1055
|
-
*/
|
|
1689
|
+
|
|
1056
1690
|
int childnum = 0;
|
|
1057
1691
|
if (p(0) > center(0)) childnum += 1;
|
|
1058
1692
|
if (p(1) > center(1)) childnum += 2;
|
|
@@ -1062,8 +1696,6 @@ namespace ngsbem
|
|
|
1062
1696
|
else
|
|
1063
1697
|
sum = mp.Eval(p-center);
|
|
1064
1698
|
|
|
1065
|
-
|
|
1066
|
-
// static Timer t("mptool direct evaluate"); RegionTimer r(t);
|
|
1067
1699
|
for (auto sn : singnodes)
|
|
1068
1700
|
sum += sn->EvaluateMP(p);
|
|
1069
1701
|
|
|
@@ -1112,7 +1744,8 @@ namespace ngsbem
|
|
|
1112
1744
|
|
|
1113
1745
|
void AddTarget (Vec<3> x)
|
|
1114
1746
|
{
|
|
1115
|
-
if (childs[0])
|
|
1747
|
+
// if (childs[0])
|
|
1748
|
+
if (have_childs) // quick check without locking
|
|
1116
1749
|
{
|
|
1117
1750
|
// directly send to childs:
|
|
1118
1751
|
int childnum = 0;
|
|
@@ -1123,6 +1756,20 @@ namespace ngsbem
|
|
|
1123
1756
|
return;
|
|
1124
1757
|
}
|
|
1125
1758
|
|
|
1759
|
+
lock_guard<mutex> guard(node_mutex);
|
|
1760
|
+
|
|
1761
|
+
if (have_childs) // test again after locking
|
|
1762
|
+
{
|
|
1763
|
+
// directly send to childs:
|
|
1764
|
+
int childnum = 0;
|
|
1765
|
+
if (x(0) > center(0)) childnum += 1;
|
|
1766
|
+
if (x(1) > center(1)) childnum += 2;
|
|
1767
|
+
if (x(2) > center(2)) childnum += 4;
|
|
1768
|
+
childs[childnum] -> AddTarget(x);
|
|
1769
|
+
return;
|
|
1770
|
+
}
|
|
1771
|
+
|
|
1772
|
+
|
|
1126
1773
|
targets.Append( x );
|
|
1127
1774
|
|
|
1128
1775
|
// if (r*mp.Kappa() < 1e-8) return;
|
|
@@ -1227,6 +1874,8 @@ namespace ngsbem
|
|
|
1227
1874
|
void CalcMP(shared_ptr<SingularMLMultiPole<elem_type>> asingmp)
|
|
1228
1875
|
{
|
|
1229
1876
|
static Timer t("mptool regular MLMP"); RegionTimer rg(t);
|
|
1877
|
+
static Timer trec("mptool regular MLMP - recording");
|
|
1878
|
+
static Timer tsort("mptool regular MLMP - sort");
|
|
1230
1879
|
|
|
1231
1880
|
singmp = asingmp;
|
|
1232
1881
|
|
|
@@ -1234,23 +1883,58 @@ namespace ngsbem
|
|
|
1234
1883
|
root.RemoveEmptyTrees();
|
|
1235
1884
|
|
|
1236
1885
|
|
|
1237
|
-
root.AddSingularNode(singmp->root, false, nullptr);
|
|
1238
|
-
/*
|
|
1239
|
-
Array<RecordingRS> recording;
|
|
1240
|
-
|
|
1886
|
+
// root.AddSingularNode(singmp->root, false, nullptr);
|
|
1887
|
+
// /*
|
|
1888
|
+
Array<RecordingRS> recording;
|
|
1889
|
+
{
|
|
1890
|
+
RegionTimer rrec(trec);
|
|
1891
|
+
root.AddSingularNode(singmp->root, false, &recording);
|
|
1892
|
+
}
|
|
1893
|
+
|
|
1241
1894
|
// cout << "recorded: " << recording.Size() << endl;
|
|
1895
|
+
{
|
|
1896
|
+
RegionTimer reg(tsort);
|
|
1242
1897
|
QuickSort (recording, [] (auto & a, auto & b)
|
|
1243
1898
|
{
|
|
1244
1899
|
if (a.len < (1-1e-8) * b.len) return true;
|
|
1245
1900
|
if (a.len > (1+1e-8) * b.len) return false;
|
|
1246
1901
|
return a.theta < b.theta;
|
|
1247
1902
|
});
|
|
1903
|
+
}
|
|
1904
|
+
|
|
1905
|
+
double current_len = -1e100;
|
|
1906
|
+
double current_theta = -1e100;
|
|
1907
|
+
Array<RecordingRS*> current_batch;
|
|
1908
|
+
Array<Array<RecordingRS*>> batch_group;
|
|
1909
|
+
Array<double> group_lengths;
|
|
1910
|
+
Array<double> group_thetas;
|
|
1248
1911
|
for (auto & record : recording)
|
|
1249
1912
|
{
|
|
1250
|
-
|
|
1251
|
-
|
|
1913
|
+
bool len_changed = fabs(record.len - current_len) > 1e-8;
|
|
1914
|
+
bool theta_changed = fabs(record.theta - current_theta) > 1e-8;
|
|
1915
|
+
if ((len_changed || theta_changed) && current_batch.Size() > 0) {
|
|
1916
|
+
// ProcessBatch(current_batch, current_len, current_theta);
|
|
1917
|
+
batch_group.Append(current_batch);
|
|
1918
|
+
group_lengths.Append(current_len);
|
|
1919
|
+
group_thetas.Append(current_theta);
|
|
1920
|
+
current_batch.SetSize(0);
|
|
1921
|
+
}
|
|
1922
|
+
|
|
1923
|
+
current_len = record.len;
|
|
1924
|
+
current_theta = record.theta;
|
|
1925
|
+
current_batch.Append(&record);
|
|
1252
1926
|
}
|
|
1253
|
-
|
|
1927
|
+
if (current_batch.Size() > 0) {
|
|
1928
|
+
// ProcessBatch(current_batch, current_len, current_theta);
|
|
1929
|
+
batch_group.Append(current_batch);
|
|
1930
|
+
group_lengths.Append(current_len);
|
|
1931
|
+
group_thetas.Append(current_theta);
|
|
1932
|
+
}
|
|
1933
|
+
|
|
1934
|
+
ParallelFor(batch_group.Size(), [&](int i) {
|
|
1935
|
+
ProcessBatchRS(batch_group[i], group_lengths[i], group_thetas[i]);
|
|
1936
|
+
}, TasksPerThread(4));
|
|
1937
|
+
// */
|
|
1254
1938
|
|
|
1255
1939
|
|
|
1256
1940
|
/*
|
|
@@ -1262,7 +1946,7 @@ namespace ngsbem
|
|
|
1262
1946
|
cout << "reg " << i << ": " << RegularMLMultiPole::nodes_on_level[i] << endl;
|
|
1263
1947
|
*/
|
|
1264
1948
|
|
|
1265
|
-
static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
|
|
1949
|
+
static Timer tloc("mptool regular localize expansion"); RegionTimer rloc(tloc);
|
|
1266
1950
|
root.LocalizeExpansion(false);
|
|
1267
1951
|
}
|
|
1268
1952
|
|
|
@@ -1296,6 +1980,7 @@ namespace ngsbem
|
|
|
1296
1980
|
|
|
1297
1981
|
};
|
|
1298
1982
|
|
|
1983
|
+
|
|
1299
1984
|
template <typename elem_type>
|
|
1300
1985
|
inline ostream & operator<< (ostream & ost, const RegularMLMultiPole<elem_type> & mlmp)
|
|
1301
1986
|
{
|